Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102628443
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Feb 22, 16:37
Size
119 KB
Mime Type
application/octet-stream
Expires
Mon, Feb 24, 16:37 (2 d)
Engine
blob
Format
Raw Data
Handle
24393616
Attached To
R8820 scATAC-seq
View Options
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
index 5125c78..8f4c839 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
index a4e36e8..3c42775 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
index a7385c4..3e642d4 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
index 723d374..8e8b816 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
index 52cbdab..661fa5d 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
similarity index 100%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
index 934f5ea..3bc651a 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
@@ -1,5 +1,5 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
similarity index 85%
rename from scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
rename to scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
index 2ef7199..f42ad19 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
@@ -1,104 +1,112 @@
setwd(file.path("/", "local", "groux", "scATAC-seq"))
# libraries
library(RColorBrewer)
# functions
source(file.path("scripts", "functions.R"))
# the number of classes searched
n.classes = c(23)
+class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1",
+ "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF",
+ "NR4A1", "EGR", "GATA", "NFAT", "RUNX")
+
# path to the images for the logo
path.a = file.path("res/A.png")
path.c = file.path("res/C.png")
path.g = file.path("res/G.png")
path.t = file.path("res/T.png")
-################## sequence patterns around ctcf motifs ##################
+################## plot architecture around TF motifs ##################
for(k in n.classes)
{
# sequence
data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k)))
model.seq = data$models
model.prob = data$prob
data = NULL
# open chromatin
model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
# nucleosomes
model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
# plot classes
col = brewer.pal(3, "Set1")
- # X11(width=26, height=12)
+ # X11(width=24, height=12)
png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
- units="in", res=720, width=26, height=12)
+ units="in", res=720, width=24, height=12)
m = matrix(1:24, nrow=6, ncol=4, byrow=F)
layout(m)
# order from most to least probable class
ord = order(model.prob, decreasing=T)
ref.open = model.open[ord,, drop=F][,]
ref.nucl = model.nucl[ord,, drop=F][,]
ref.seq = model.seq[,,ord, drop=F][,,]
prob = model.prob[ord]
class = c(1:nrow(ref.open))[ord]
+ tf = class.tf[ord]
for(i in 1:nrow(ref.open))
{ # plot logo
plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
- main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
# x-axis
x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
axis(1, at=x.at, labels=x.lab)
# y-axis is [0,1] for min/max signal
y.at = seq(0, 2, length.out=2)
y.lab = c("min", "max")
axis(2, at=y.at, labels=y.lab)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
}
# inlets with center
row_n = 1 # row counter
col_n = 1 # column counter
row_h = 1/nrow(m) # height of row
col_w = 1/ncol(m) # width of column
- row_cor = row_h / 2
+ row_cor = row_h / 3
col_cor = col_w / 3
for(i in 1:nrow(ref.open))
{ # plot logo center
left = (col_w*col_n) - col_w
right = left + col_w
left = right - col_cor
bottom = 1 - (row_h*row_n)
- top = bottom + row_h
+ top = bottom + (0.9*row_h)
bottom = top - row_cor
- par(fig=c(left, right, bottom, top), new=T)
+ p= par(fig=c(left, right, bottom, top),
+ mar=c(0,0,0,0),
+ new=T)
idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
# xaxis
# x.at = seq(1, length(idx), length.out = 3)
# x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
x.at = ceiling(length(idx)/2)
x.lab = 0
axis(1, at=x.at, labels=x.lab)
# yaxis
axis(2, at=y.at, labels=y.lab)
row_n = row_n + 1
if(i %% nrow(m) == 0)
{ col_n = col_n + 1
row_n = 1
}
+ par(p)
}
dev.off()
}
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
index f10ec18..0bfd24e 100755
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
@@ -1,76 +1,76 @@
# paths
## dir
data_dir_p="data/10xgenomics_PBMC_5k_peaks"
data_dir="data/10xgenomics_PBMC_5k"
pwm_dir="data/pwm/jaspar_2018_clustering/"
hg19_dir="data/genomes"
results_dir="results/10xgenomics_PBMC_5k_peaks_classification_6"
## matrix files
file_mat_open=$data_dir_p/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat'
file_mat_nucl=$data_dir_p/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat'
file_mat_seq=$data_dir_p/'peaks_rmsk_sampled_sequences_1kb.mat'
## file with seeds
file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt'
mkdir -p $results_dir
touch $file_seed
# EM param
n_iter='1'
n_shift='971'
-n_core=24
+n_core=32
## PWM files
jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat"
hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat"
myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat"
pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat"
cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat"
irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat"
irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat"
lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat"
foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat"
sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat"
mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat"
elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat"
stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat"
nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat"
ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat"
e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat"
ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat"
klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat"
nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat"
egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat"
gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat"
nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat"
runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat"
# classify
for k in 23
do
## results files
file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d'
file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat'
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
# extend models
file_mod1_ext=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model_extended.mat'
file_mod2_ext=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat'
file_mod3_ext=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model_extended.mat'
file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk_sampled.bed'
file_fasta=$hg19_dir/'hg19.fasta'
file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam'
file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai'
file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam'
file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai'
bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext
bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext
bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext
done
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
index 1803c05..415a4c7 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
@@ -1,4 +1,4 @@
scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R
deleted file mode 100644
index 0531449..0000000
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R
+++ /dev/null
@@ -1,180 +0,0 @@
-setwd(file.path("/", "local", "groux", "scATAC-seq"))
-
-# libraries
-library(RColorBrewer)
-
-# functions
-source(file.path("scripts", "functions.R"))
-
-# the number of classes searched
-n.classes = c(23)
-
-# path to the images for the logo
-path.a = file.path("res/A.png")
-path.c = file.path("res/C.png")
-path.g = file.path("res/G.png")
-path.t = file.path("res/T.png")
-
-################## sequence patterns around ctcf motifs ##################
-
-for(k in n.classes)
-{
- # sequence
- data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
- model.seq = data$models
- model.prob = data$prob
- data = NULL
-
- # open chromatin
- model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
- # nucleosomes
- model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
-
- # plot classes
- col = brewer.pal(3, "Set1")
- X11(width=26, height=12)
- # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
- # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
- # units="in", res=720, width=18, height=12)
- m = matrix(1:24, nrow=6, ncol=4, byrow=F)
- layout(m)
- # order from most to least probable class
- ord = order(model.prob, decreasing=T)
- ref.open = model.open[ord,, drop=F][,]
- ref.nucl = model.nucl[ord,, drop=F][,]
- ref.seq = model.seq[,,ord, drop=F][,,]
- prob = model.prob[ord]
- class = c(1:nrow(ref.open))[ord]
- for(i in 1:nrow(ref.open))
- { # plot logo
- par(mar=c(2,2,2,0))
- plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
- main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
- # x-axis
- x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
- x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
- lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
- }
-
- # inlets with center
- row_n = 1 # row counter
- col_n = 1 # column counter
- row_h = 1/nrow(m) # height of row
- col_w = 1/ncol(m) # width of column
- row_cor = row_h / 2
- col_cor = col_w / 3
- for(i in 1:nrow(ref.open))
- { # plot logo center
- left = (col_w*col_n) - col_w
- right = left + col_w
- left = right - col_cor
- bottom = 1 - (row_h*row_n)
- top = bottom + row_h
- bottom = top - row_cor
-
- par(fig=c(left, right, bottom, top), new=T)
- idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
- plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
- lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
- # xaxis
- # x.at = seq(1, length(idx), length.out = 3)
- # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
- x.at = ceiling(length(idx)/2)
- x.lab = 0
- axis(1, at=x.at, labels=x.lab)
- # yaxis
- axis(2, at=y.at, labels=y.lab)
- row_n = row_n + 1
- if(i %% nrow(m) == 0)
- { col_n = col_n + 1
- row_n = 1
- }
- }
- # dev.off()
-}
-
-
-# PU.1
-X11(width=18, height=9)
- par(mfrow=c(2,1))
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,,4], path.a, path.c, path.g, path.t,
- main=sprintf("class 4 (p=%.2f)", model.prob[4]))
- # x-axis
- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(model.open[4,] / max(model.open[4,])), lwd=1, col=col[1])
- lines(2*(model.nucl[4,] / max(model.nucl[4,])), lwd=1, col=col[2])
-
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,495:535,4], path.a, path.c, path.g, path.t,
- main=sprintf("class 4 (p=%.2f)", model.prob[4]))
- lines(2*(model.open[4,495:535] / max(model.open[4,])), lwd=1, col=col[1])
- lines(2*(model.nucl[4,495:535] / max(model.nucl[4,])), lwd=1, col=col[2])
-
-# NFE2
-X11(width=18, height=9)
- par(mfrow=c(2,1))
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,,14], path.a, path.c, path.g, path.t,
- main=sprintf("class 14 (p=%.2f)", model.prob[14]))
- # x-axis
- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(model.open[14,] / max(model.open[14,])), lwd=1, col=col[1])
- lines(2*(model.nucl[14,] / max(model.nucl[14,])), lwd=1, col=col[2])
-
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,490:540,14], path.a, path.c, path.g, path.t,
- main=sprintf("class 14 (p=%.2f)", model.prob[14]))
- lines(2*(model.open[14,490:540] / max(model.open[14,])), lwd=1, col=col[1])
- lines(2*(model.nucl[14,490:540] / max(model.nucl[14,])), lwd=1, col=col[2])
-
-# CEPBB
-X11(width=18, height=9)
- par(mfrow=c(2,1))
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,,5], path.a, path.c, path.g, path.t,
- main=sprintf("class 5 (p=%.2f)", model.prob[5]))
- # x-axis
- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
- axis(1, at=x.at, labels=x.lab)
- # y-axis is [0,1] for min/max signal
- y.at = seq(0, 2, length.out=2)
- y.lab = c("min", "max")
- axis(2, at=y.at, labels=y.lab)
- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
- lines(2*(model.open[5,] / max(model.open[5,])), lwd=1, col=col[1])
- lines(2*(model.nucl[5,] / max(model.nucl[5,])), lwd=1, col=col[2])
-
- par(mar=c(2,2,2,0))
- plot.logo(model.seq[,495:535,5], path.a, path.c, path.g, path.t,
- main=sprintf("class 5 (p=%.2f)", model.prob[5]))
- lines(2*(model.open[5,495:535] / max(model.open[5,])), lwd=1, col=col[1])
- lines(2*(model.nucl[5,495:535] / max(model.nucl[5,])), lwd=1, col=col[2])
-
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
new file mode 100644
index 0000000..43375ee
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
@@ -0,0 +1,210 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+# the number of classes searched
+n.classes = c(23)
+
+class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1",
+ "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF",
+ "NR4A1", "EGR", "GATA", "NFAT", "RUNX")
+
+# path to the images for the logo
+path.a = file.path("res/A.png")
+path.c = file.path("res/C.png")
+path.g = file.path("res/G.png")
+path.t = file.path("res/T.png")
+
+################## plot architecture around TF motifs ##################
+
+for(k in n.classes)
+{
+ # sequence
+ data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
+ model.seq = data$models
+ model.prob = data$prob
+ data = NULL
+
+ # open chromatin
+ model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
+ # nucleosomes
+ model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
+
+ # plot classes
+ col = brewer.pal(3, "Set1")
+ # X11(width=24, height=12)
+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
+ units="in", res=720, width=18, height=12)
+ m = matrix(1:24, nrow=6, ncol=4, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(model.prob, decreasing=T)
+ ref.open = model.open[ord,, drop=F][,]
+ ref.nucl = model.nucl[ord,, drop=F][,]
+ ref.seq = model.seq[,,ord, drop=F][,,]
+ prob = model.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ tf = class.tf[ord]
+ for(i in 1:nrow(ref.open))
+ { # plot logo
+ par(mar=c(2,2,2,0))
+ plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
+ # x-axis
+ x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
+ x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
+ axis(1, at=x.at, labels=x.lab)
+ # y-axis is [0,1] for min/max signal
+ y.at = seq(0, 2, length.out=2)
+ y.lab = c("min", "max")
+ axis(2, at=y.at, labels=y.lab)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ }
+
+ # inlets with center
+ row_n = 1 # row counter
+ col_n = 1 # column counter
+ row_h = 1/nrow(m) # height of row
+ col_w = 1/ncol(m) # width of column
+ row_cor = row_h / 3
+ col_cor = col_w / 3
+ for(i in 1:nrow(ref.open))
+ { # plot logo center
+ left = (col_w*col_n) - col_w
+ right = left + col_w
+ left = right - col_cor
+ bottom = 1 - (row_h*row_n)
+ top = bottom + row_h
+ bottom = top - row_cor
+
+ p= par(fig=c(left, right, bottom, top),
+ mar=c(0,0,0,0),
+ new=T)
+ idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
+ plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ # xaxis
+ # x.at = seq(1, length(idx), length.out = 3)
+ # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
+ x.at = ceiling(length(idx)/2)
+ x.lab = 0
+ axis(1, at=x.at, labels=x.lab)
+ # yaxis
+ axis(2, at=y.at, labels=y.lab)
+ row_n = row_n + 1
+ if(i %% nrow(m) == 0)
+ { col_n = col_n + 1
+ row_n = 1
+ }
+ par(p)
+ }
+ dev.off()
+}
+
+
+
+################## zoom in the center ##################
+
+for(k in n.classes)
+{ idx = 516 + c(-100:+100)
+ # sequence
+ data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
+ model.seq = data$models[,idx,]
+ model.prob = data$prob
+ data = NULL
+
+ # open chromatin
+ model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models[,idx]
+ # nucleosomes
+ model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models[,idx]
+
+ # plot classes
+ col = brewer.pal(3, "Set1")
+ # X11(width=24, height=12)
+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
+ sprintf("peaks_rmsk_sampled_sequences_%dclass_2.png", k)),
+ units="in", res=720, width=18, height=12)
+ m = matrix(1:24, nrow=6, ncol=4, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(model.prob, decreasing=T)
+ ref.open = model.open[ord,, drop=F][,]
+ ref.nucl = model.nucl[ord,, drop=F][,]
+ ref.seq = model.seq[,,ord, drop=F][,,]
+ prob = model.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ tf = class.tf[ord]
+ for(i in 1:nrow(ref.open))
+ { # plot logo
+ par(mar=c(2,2,2,0))
+ plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
+ # x-axis
+ x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
+ x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
+ axis(1, at=x.at, labels=x.lab)
+ # y-axis is [0,1] for min/max signal
+ y.at = seq(0, 2, length.out=2)
+ y.lab = c("min", "max")
+ axis(2, at=y.at, labels=y.lab)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ }
+
+ # inlets with center
+ row_n = 1 # row counter
+ col_n = 1 # column counter
+ row_h = 1/nrow(m) # height of row
+ col_w = 1/ncol(m) # width of column
+ row_cor = row_h / 3
+ col_cor = col_w / 3
+ for(i in 1:nrow(ref.open))
+ { # plot logo center
+ left = (col_w*col_n) - col_w
+ right = left + col_w
+ left = right - col_cor
+ bottom = 1 - (row_h*row_n)
+ top = bottom + row_h
+ bottom = top - row_cor
+
+ p= par(fig=c(left, right, bottom, top),
+ mar=c(0,0,0,0),
+ new=T)
+ idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
+ plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
+ lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
+ # xaxis
+ # x.at = seq(1, length(idx), length.out = 3)
+ # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
+ x.at = ceiling(length(idx)/2)
+ x.lab = 0
+ axis(1, at=x.at, labels=x.lab)
+ # yaxis
+ axis(2, at=y.at, labels=y.lab)
+ row_n = row_n + 1
+ if(i %% nrow(m) == 0)
+ { col_n = col_n + 1
+ row_n = 1
+ }
+ par(p)
+ }
+ dev.off()
+}
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
index 80b9afd..33f3be9 100755
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
@@ -1,76 +1,76 @@
# paths
## dir
data_dir_p="data/10xgenomics_PBMC_5k_peaks"
data_dir="data/10xgenomics_PBMC_5k"
pwm_dir="data/pwm/jaspar_2018_clustering/"
hg19_dir="data/genomes"
results_dir="results/10xgenomics_PBMC_5k_peaks_classification_7"
## matrix files
file_mat_open=$data_dir_p/'peaks_rmsk_openchromatin_1kb_read_atac.mat'
file_mat_nucl=$data_dir_p/'peaks_rmsk_nucleosomes_1kb_fragment_center.mat'
file_mat_seq=$data_dir_p/'peaks_rmsk_sequences_1kb.mat'
## file with seeds
file_seed=$results_dir'/peaks_rmsk_seed.txt'
mkdir -p $results_dir
touch $file_seed
# EM param
n_iter='1'
n_shift='971'
-n_core=24
+n_core=32
## PWM files
jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat"
hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat"
myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat"
pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat"
cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat"
irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat"
irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat"
lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat"
foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat"
sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat"
mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat"
elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat"
stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat"
nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat"
ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat"
e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat"
ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat"
klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat"
nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat"
egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat"
gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat"
nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat"
runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat"
# classify
for k in 23
do
## results files
file_prob=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_prob.mat4d'
file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model.mat'
file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model.mat'
file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model.mat'
seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
echo "$file_prob $seed" >> $file_seed
bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
# extend models
file_mod1_ext=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model_extended.mat'
file_mod2_ext=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat'
file_mod3_ext=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model_extended.mat'
file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed'
file_fasta=$hg19_dir/'hg19.fasta'
file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam'
file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai'
file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam'
file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai'
bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext
bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext
bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext
done
diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
index 2e4fecb..bbbfd09 100644
--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
@@ -1,4 +1,4 @@
-scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks_sampled.sh
+scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test_sampled.R
+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
diff --git a/src/Applications/.ProbToModelApplication.cpp.swp b/src/Applications/.ProbToModelApplication.cpp.swp
new file mode 100644
index 0000000..ace6708
Binary files /dev/null and b/src/Applications/.ProbToModelApplication.cpp.swp differ
diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp
index dce386b..8a0613f 100644
--- a/src/Applications/ProbToModelApplication.cpp
+++ b/src/Applications/ProbToModelApplication.cpp
@@ -1,320 +1,320 @@
#include <ProbToModelApplication.hpp>
#include <boost/program_options.hpp>
#include <string>
#include <vector>
#include <iostream>
#include <utility> // std::move()
#include <stdexcept> // std::invalid_argument, std::runtime_error
#include <ModelComputer.hpp>
#include <ReadModelComputer.hpp>
#include <SequenceModelComputer.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <Matrix2D.hpp>
#include <Matrix4D.hpp>
#include <matrix_utility.hpp> // filter_rows()
namespace po = boost::program_options ;
typedef std::vector<double> vector_d ;
ProbToModelApplication::ProbToModelApplication(int argn, char** argv)
: file_read(""), file_seq(""), file_prob(""), file_filter(""),
bckg_class(false),
n_threads(0), runnable(false)
{ this->parseOptions(argn, argv) ; }
ProbToModelApplication::~ProbToModelApplication()
{}
int ProbToModelApplication::run()
{ if(this->runnable)
{
// load data
std::string file_data ;
bool read_data = false ;
bool seq_data = false ;
bool consseq_data = false ;
if(this->file_read != "")
{ file_data = this->file_read ;
seq_data = consseq_data = false ; read_data = true ;
}
else if(this->file_seq != "")
{ file_data = this->file_seq ;
read_data = consseq_data = false ; seq_data = true ;
}
else if(this->file_consseq != "")
{ file_data = this->file_consseq ;
read_data = seq_data = false ; consseq_data = true ;
}
else
{ std::string msg("Error! Could not determine the type of the data!") ;
throw std::runtime_error(msg) ;
}
// row filter
std::vector<size_t> filter ;
if(this->file_filter != "")
{ // it is a column vector, easier to use the Matrix2D interface
// to read it rather than coding a function for :)
filter = Matrix2D<size_t>(this->file_filter).get_data() ;
std::sort(filter.begin(), filter.end()) ;
}
// prob
Matrix4D<double> prob ; prob.load(this->file_prob) ;
// get the data model
ModelComputer* ptr = nullptr ;
if(read_data)
{ Matrix2D<int> data(file_data) ;
// filter out some rows if needed
if(filter.size())
{ data = filter_rows(filter, data) ; }
this->check_dimensions(data, prob) ;
ptr = new ReadModelComputer(std::move(data),
prob,
this->bckg_class,
this->n_threads) ;
}
else if(seq_data)
{
Matrix2D<int> data(file_data) ;
// filter out some rows if needed
if(filter.size())
{ data = filter_rows(filter, data) ; }
this->check_dimensions(data, prob) ;
ptr = new SequenceModelComputer(std::move(data),
prob,
this->bckg_class,
this->n_threads) ;
}
else if(consseq_data)
{ Matrix3D<double> data ;
data.load(file_data) ;
// filter out some rows if needed
if(filter.size())
{ data = filter_rows(filter, data) ; }
this->check_dimensions(data, prob) ;
ptr = new ConsensusSequenceModelComputer(std::move(data),
prob,
this->bckg_class,
this->n_threads) ;
}
Matrix2D<double> model = ptr->get_model() ;
delete ptr ;
ptr = nullptr ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
vector_d class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
// 1st column contain the class prob
if(read_data)
{ for(size_t i=0; i<model_final.get_nrow(); i++)
{ model_final(i,0) = class_prob[i] ; }
}
- else if(seq_data)
+ else if(seq_data or consseq_data)
{ size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
std::cout << model_final << std::endl ;
return 0 ;
}
else
{ return 1 ; }
}
void ProbToModelApplication::parseOptions(int argn, char** argv)
{ // no option to parse
if(argv == nullptr)
{ std::string message = "no options to parse!" ;
throw std::invalid_argument(message) ;
}
// help messages
std::string desc_msg = "\n"
"ProbToRef reconstructs the class models \n"
"from the classification results (posterior \n"
"probabilities computed by ChIPPartitioning \n"
"and the data).\n\n" ;
std::string opt_help_msg = "Produces this help message." ;
std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
"by default 0 (no parallelization)." ;
std::string opt_read_msg = "The path to the file containing the data, if the data are\n"
"read counts.";
std::string opt_seq_msg = "The path to the file containing the data, if the data are\n"
"sequences.";
std::string opt_consseq_msg = "The path to the file containing the data, if the data are\n"
"consensus sequences.";
std::string opt_prob_msg = "The path to the file containing the posterior \n"
"probabilities." ;
std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
"indices of rows to filter out in the data. Once the rows have been\n"
"filtered out from the data, its rows order should correspond to the\n"
"probabilities." ;
std::string opt_bckg_msg = "Whether the last class of the classification (posterior\n"
"probabilities) is a background class." ;
// option parser
boost::program_options::variables_map vm ;
boost::program_options::options_description desc(desc_msg) ;
std::string seeding_tmp ;
desc.add_options()
("help,h", opt_help_msg.c_str())
("read,", po::value<std::string>(&(this->file_read)), opt_read_msg.c_str())
("seq,", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
("consseq,", po::value<std::string>(&(this->file_consseq)), opt_seq_msg.c_str())
("prob,", po::value<std::string>(&(this->file_prob)), opt_prob_msg.c_str())
("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())
("bgclass", opt_bckg_msg.c_str())
("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;
// parse
try
{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
po::notify(vm) ;
}
catch(std::invalid_argument& e)
{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
throw std::invalid_argument(msg) ;
}
catch(...)
{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }
bool help = vm.count("help") ;
// checks unproper option settings
if((this->file_read == "") and
(this->file_seq == "") and
(this->file_consseq == "") and
(not help))
{ std::string msg("Error! No data file was given (none of --read --seq --consseq)!") ;
throw std::invalid_argument(msg) ;
}
else if((this->file_read != "") and
(this->file_seq != "") and
(not help))
{ std::string msg("Error! --read and --seq are mutually exclusive!") ;
throw std::invalid_argument(msg) ;
}
else if((this->file_read != "") and
(this->file_consseq != "") and
(not help))
{ std::string msg("Error! --read and --consseq are mutually exclusive!") ;
throw std::invalid_argument(msg) ;
}
else if((this->file_seq != "") and
(this->file_consseq != "") and
(not help))
{ std::string msg("Error! --seq and --consseq are mutually exclusive!") ;
throw std::invalid_argument(msg) ;
}
else if(this->file_prob == "" and (not help))
{ std::string msg("Error! No posterior probabily file was given (--prob)!") ;
throw std::invalid_argument(msg) ;
}
// set background class
if(vm.count("bgclass"))
{ this->bckg_class = true ; }
// help invoked, run() cannot be invoked
if(help)
{ std::cout << desc << std::endl ;
this->runnable = false ;
return ;
}
// everything fine, run() can be called
else
{ this->runnable = true ;
return ;
}
}
void ProbToModelApplication::check_dimensions(const Matrix2D<int>& data,
const Matrix4D<double>& prob) const
{ if(data.get_nrow() != prob.get_dim()[0])
{ char msg[4096] ;
sprintf(msg, "Error! data and prob matrices have unequal "
"row numbers (%zu / %zu)!",
data.get_nrow(), prob.get_dim()[0]) ;
throw std::runtime_error(msg) ;
}
else if(data.get_ncol() < prob.get_dim()[2])
{ char msg[4096] ;
sprintf(msg, "Error! too many shift states for the data width!"
"(%zu shift states and %zu columns in data)!",
prob.get_dim()[2], data.get_ncol()) ;
throw std::runtime_error(msg) ;
}
}
void ProbToModelApplication::check_dimensions(const Matrix3D<double>& data,
const Matrix4D<double>& prob) const
{ if(data.get_dim()[0] != prob.get_dim()[0])
{ char msg[4096] ;
sprintf(msg, "Error! data and prob matrices have unequal "
"row numbers (%zu / %zu)!",
data.get_dim()[0], prob.get_dim()[0]) ;
throw std::runtime_error(msg) ;
}
else if(data.get_dim()[1] < prob.get_dim()[2])
{ char msg[4096] ;
sprintf(msg, "Error! too many shift states for the data width!"
"(%zu shift states and %zu data width)!",
prob.get_dim()[2], data.get_dim()[1]) ;
throw std::runtime_error(msg) ;
}
else if(data.get_dim()[2]!= 4)
{ char msg[4096] ;
sprintf(msg, "Error! data 3rd dimension is not 4!"
"(%zu)!",
data.get_dim()[2]) ;
throw std::runtime_error(msg) ;
}
}
int main(int argn, char** argv)
{ ProbToModelApplication app(argn, argv) ;
return app.run() ;
}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7fe6210..3086e51 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,157 +1,158 @@
# compiler options
add_compile_options(-std=c++14)
add_compile_options(-O3)
add_compile_options(-Wall)
add_compile_options(-Wextra)
add_compile_options(-Werror)
add_compile_options(-Wfatal-errors)
add_compile_options(-pedantic)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}")
add_definitions (${SEQAN_DEFINITIONS})
# include file location
include_directories(${Boost_INCLUDE_DIRS})
include_directories(${SEQAN_INCLUDE_DIRS})
include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
include_directories("${scATACseq_SOURCE_DIR}/src/Clustering")
include_directories("${scATACseq_SOURCE_DIR}/src/Random")
include_directories("${scATACseq_SOURCE_DIR}/src/Parallel")
include_directories("${scATACseq_SOURCE_DIR}/src/Statistics")
include_directories("${scATACseq_SOURCE_DIR}/src/GUI")
include_directories("${scATACseq_SOURCE_DIR}/src/Applications")
include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools")
include_directories("${scATACseq_SOURCE_DIR}/src/Utility")
# compile modules into static libraries
## set output directory
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib")
## build instructions
add_library(Clustering "Clustering/DataLayer.cpp"
"Clustering/Data2DLayer.cpp"
"Clustering/Data3DLayer.cpp"
"Clustering/ReadLayer.cpp"
"Clustering/SequenceLayer.cpp"
"Clustering/ConsensusSequenceLayer.cpp"
"Clustering/ModelComputer.cpp"
"Clustering/ReadModelComputer.cpp"
"Clustering/SequenceModelComputer.cpp"
"Clustering/ConsensusSequenceModelComputer.cpp"
"Clustering/EMBase.cpp"
"Clustering/EMRead.cpp"
"Clustering/EMSequence.cpp"
"Clustering/EMConsensusSequence.cpp"
"Clustering/EMJoint.cpp")
add_library(Random "Random/Random.cpp"
"Random/RandomNumberGenerator.cpp")
add_library(Parallel "Parallel/ThreadPool.cpp")
-add_library(Statistics "Statistics/Statistics.cpp")
+add_library(Statistics "Statistics/Statistics.cpp"
+ "Statistics/KmersStatistics.cpp")
add_library(GUI "GUI/ConsoleProgressBar.cpp"
"GUI/Diplayable.cpp"
"GUI/Updatable.cpp")
add_library(GenomicTools "GenomicTools/MatrixCreator.cpp"
"GenomicTools/ReadMatrixCreator.cpp"
"GenomicTools/CorrelationMatrixCreator.cpp"
"GenomicTools/ClassReadDataCreator.cpp"
"GenomicTools/ClassSequenceDataCreator.cpp"
"GenomicTools/SequenceMatrixCreator.cpp"
"GenomicTools/GenomeRegion.cpp")
add_library(Utility "Utility/matrix_utility.cpp"
"Utility/dna_utility.cpp")
## resolve dependencies
target_link_libraries(Utility ${SEQAN_LIBRARIES})
target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES})
target_link_libraries(Parallel Threads::Threads)
target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES})
# executables
## a toy for seqan
set(EXE_MAIN_TEST "main_test")
add_executable(${EXE_MAIN_TEST} "main_test.cpp")
target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering)
set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## a toy for correlation matrix
set(EXE_MAIN_CORMAT "main_cormat")
add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp")
target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random)
set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to read binary matrices and display them in txt
set(EXE_MAIN_MATBIN2TXT "MatrixBinToTxt")
add_executable(${EXE_MAIN_MATBIN2TXT} "Applications/MatrixBinToTxtApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_MATBIN2TXT} Utility Boost::program_options)
set_target_properties(${EXE_MAIN_MATBIN2TXT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to know the indices of 0-signal rows in integer 2D matrices
set(EXE_MAIN_WHICHNULLROWS "WhichNullRows")
add_executable(${EXE_MAIN_WHICHNULLROWS} "Applications/WhichNullRowsApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_WHICHNULLROWS} Boost::program_options)
set_target_properties(${EXE_MAIN_WHICHNULLROWS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to create a matrix from BED and a BAM file
set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator")
add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an application to create a sequence matrix from BED and a fasta file
set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator")
add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMRead standalone
set(EXE_EMREAD "EMRead")
add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMREAD} Clustering Boost::program_options)
set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMSequence standalone
set(EXE_EMSEQ "EMSequence")
add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options)
set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMConsensusSequence standalone
set(EXE_EMCONSSEQ "EMConsensusSequence")
add_executable(${EXE_EMCONSSEQ} "Applications/EMConsensusSequenceApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMCONSSEQ} Clustering Utility Boost::program_options)
set_target_properties(${EXE_EMCONSSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an EMJoint standalone
set(EXE_EMJOINT "EMJoint")
add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options)
set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to compute data models from the data and the post prob of an EM classification
set(EXE_PROB2REF "ProbToModel")
add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options)
set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extend read models from an EM classification
set(EXE_READMODELEXTENDER "ReadModelExtender")
add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extend read models from an EM classification
set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender")
add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extract read data from an EM classification
set(EXE_CLASSREADDATACREATOR "ClassReadDataCreator")
add_executable(${EXE_CLASSREADDATACREATOR} "Applications/ClassReadDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_CLASSREADDATACREATOR} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_CLASSREADDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to extract sequence data from an EM classification
set(EXE_CLASSSEQDATACREATOR "ClassSequenceDataCreator")
add_executable(${EXE_CLASSSEQDATACREATOR} "Applications/ClassSequenceDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_CLASSSEQDATACREATOR} Clustering GenomicTools Utility Boost::program_options)
set_target_properties(${EXE_CLASSSEQDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## a test suite
set(EXE_TESTS "unittests")
add_executable(${EXE_TESTS} "unittests.cpp"
"Unittests/unittests_matrix.cpp"
"Unittests/unittests_genomictools.cpp")
target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools)
set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
diff --git a/src/Statistics/KmersStatistics.cpp b/src/Statistics/KmersStatistics.cpp
new file mode 100644
index 0000000..58533a8
--- /dev/null
+++ b/src/Statistics/KmersStatistics.cpp
@@ -0,0 +1,230 @@
+#include <KmersStatistics.hpp>
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include <Matrix2D.hpp>
+#include <Matrix3D.hpp>
+#include <Statistics.hpp> // poisson_cdf()
+#include <dna_utility.hpp> // dna::consensus_to_kmer()
+
+Matrix2D<int> kmers::compute_mononucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix2D<int> comp(k, 4, 0) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ {
+ std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+
+ for (size_t j=0; j<k ; j++)
+ { int base = int(kmer[j]) - int('0') ;
+ comp(j, base) += count ;
+ }
+ }
+ return(comp);
+}
+
+Matrix3D<int> kmers::compute_dinucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix3D<int> comp2(4, k-1, 4) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+ for (size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ comp2(base1, j, base2) += count ;
+ }
+ }
+ return(comp2);
+}
+
+std::vector<double> kmers::compute_exp_values(size_t k,
+ const std::vector<std::string>& kmers,
+ const Matrix2D<int>& comp1,
+ const Matrix3D<int>& comp2)
+{ std::vector<double> expected(kmers.size()) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ double exp = comp1(0, int(kmer[0]) - int('0')) ;
+
+ for(size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ exp *= ((double)comp2(base1, j, base2) /
+ (double)comp1(j+1, base2)) ;
+ }
+
+ expected[i] = exp ;
+ }
+ return expected;
+}
+
+
+std::vector<double> kmers::compute_pvalues(const std::vector<int>& counts,
+ const std::vector<double>& expected)
+{
+ std::vector<double> pvalues(counts.size()) ;
+
+ for(size_t i=0; i<counts.size(); i++)
+ {
+ pvalues[i] = poisson_cdf((double)counts[i],
+ expected[i],
+ false) ;
+ }
+ return pvalues ;
+}
+
+std::unordered_map<std::string, int>
+kmers::compute_kmer_count(const Matrix3D<double>& consensus_sequence,
+ size_t length)
+{ std::unordered_map<std::string, int> kmer_count ;
+
+ size_t n_row = consensus_sequence.get_dim()[0] ;
+ size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
+
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t s=0; s<n_shift; s++)
+ { // get kmer
+ std::string kmer = dna::consensus_to_kmer(consensus_sequence,
+ i,
+ s,
+ s+length) ;
+ // insert
+ auto iter = kmer_count.find(kmer) ;
+ if(iter == kmer_count.end())
+ { kmer_count.emplace(kmer, 1) ; }
+ // update
+ else
+ { iter->second += 1 ; }
+ }
+ }
+ return kmer_count ;
+}
+
+std::pair<std::vector<std::string>, std::vector<double>>
+kmers::compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
+ size_t k)
+{
+
+ // get kmer count
+ std::unordered_map<std::string, int> kmer_count =
+ kmers::compute_kmer_count(consensus_sequences, k) ;
+
+ /*
+ consensus_sequences.get_dim() ;
+ std::unordered_map<std::string, int> kmer_count ;
+ // ANN
+ kmer_count["000"] = 1 ;
+ kmer_count["001"] = 2 ;
+ kmer_count["002"] = 3 ;
+ kmer_count["003"] = 4 ;
+ kmer_count["010"] = 5 ;
+ kmer_count["011"] = 6 ;
+ kmer_count["012"] = 7 ;
+ kmer_count["013"] = 8 ;
+ kmer_count["020"] = 9 ;
+ kmer_count["021"] = 10 ;
+ kmer_count["022"] = 11 ;
+ kmer_count["023"] = 12 ;
+ kmer_count["030"] = 13 ;
+ kmer_count["031"] = 14 ;
+ kmer_count["032"] = 15 ;
+ kmer_count["033"] = 16 ;
+ // CNN
+ kmer_count["100"] = 1 ;
+ kmer_count["101"] = 2 ;
+ kmer_count["102"] = 3 ;
+ kmer_count["103"] = 4 ;
+ kmer_count["110"] = 5 ;
+ kmer_count["111"] = 6 ;
+ kmer_count["112"] = 7 ;
+ kmer_count["113"] = 8 ;
+ kmer_count["120"] = 9 ;
+ kmer_count["121"] = 10 ;
+ kmer_count["122"] = 11 ;
+ kmer_count["123"] = 12 ;
+ kmer_count["130"] = 13 ;
+ kmer_count["131"] = 14 ;
+ kmer_count["132"] = 15 ;
+ kmer_count["133"] = 16 ;
+ // GNN
+ kmer_count["200"] = 1 ;
+ kmer_count["201"] = 2 ;
+ kmer_count["202"] = 3 ;
+ kmer_count["203"] = 4 ;
+ kmer_count["210"] = 5 ;
+ kmer_count["211"] = 6 ;
+ kmer_count["212"] = 7 ;
+ kmer_count["213"] = 8 ;
+ kmer_count["220"] = 9 ;
+ kmer_count["221"] = 10 ;
+ kmer_count["222"] = 11 ;
+ kmer_count["223"] = 12 ;
+ kmer_count["230"] = 13 ;
+ kmer_count["231"] = 14 ;
+ kmer_count["232"] = 15 ;
+ kmer_count["233"] = 16 ;
+ // TNN
+ kmer_count["300"] = 1 ;
+ kmer_count["301"] = 2 ;
+ kmer_count["302"] = 3 ;
+ kmer_count["303"] = 4 ;
+ kmer_count["310"] = 5 ;
+ kmer_count["311"] = 6 ;
+ kmer_count["312"] = 7 ;
+ kmer_count["313"] = 8 ;
+ kmer_count["320"] = 9 ;
+ kmer_count["321"] = 10 ;
+ kmer_count["322"] = 11 ;
+ kmer_count["323"] = 12 ;
+ kmer_count["330"] = 13 ;
+ kmer_count["331"] = 14 ;
+ kmer_count["332"] = 15 ;
+ kmer_count["333"] = 16 ;
+ */
+
+ // turn to vectors
+ std::vector<std::string> kmers(kmer_count.size()) ;
+ size_t i=0 ;
+ for(const auto& iter : kmer_count)
+ { kmers[i] = iter.first ;
+ i++ ;
+ }
+ std::sort(kmers.begin(), kmers.end()) ;
+ std::vector<int> counts(kmer_count.size()) ;
+ i=0 ;
+ for(const auto& kmer : kmers)
+ { counts[i] = kmer_count[kmer] ;
+ i++ ;
+ }
+ kmer_count.clear() ;
+
+ // get mononucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix2D<int> comp1 = kmers::compute_mononucleotide(k, kmers, counts) ;
+
+ // get dinucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix3D<int> comp2 = kmers::compute_dinucleotide(k, kmers, counts) ;
+
+ // get expected number of occurence for each kmer
+ std::vector<double> expected = kmers::compute_exp_values(k,
+ kmers,
+ comp1,
+ comp2) ;
+
+ // compute the pvalue associated to each kmer
+ std::vector<double> pvalues = kmers::compute_pvalues(counts,
+ expected) ;
+
+ return std::make_pair(kmers, pvalues) ;
+}
diff --git a/src/Statistics/KmersStatistics.hpp b/src/Statistics/KmersStatistics.hpp
new file mode 100644
index 0000000..fcc4f60
--- /dev/null
+++ b/src/Statistics/KmersStatistics.hpp
@@ -0,0 +1,133 @@
+#ifndef KMER_STATISTICS_HPP
+#define KMER_STATISTICS_HPP
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include <Matrix2D.hpp>
+#include <Matrix3D.hpp>
+
+
+namespace kmers {
+
+ /*!
+ * \brief Creates a table of all kmer found in
+ * a set of consensus sequences and return an
+ * associated pvalue for each one of them.
+ * To transform a consensus sequence into a kmer,
+ * the major base at each position is considered
+ * only.
+ * The pvalue is computed as 1 - p(count, exp)
+ * where p is the CDF of a Poisson distribution
+ * with <count> being the number of time a given
+ * kmer is observed in the data and <exp> its
+ * expected number of occurences. The expected
+ * number of occurences is computed from the
+ * observed di-nucleotide occurences in the given
+ * data.
+ * \param consensus_sequences
+ * \param k the kmer length.
+ * \return a pair of vectors containing the kmers
+ * in integer format (A:0, C:1, G:2, T:3) and
+ * their number of occurences (in the same order).
+ */
+ std::pair<std::vector<std::string>, std::vector<double>>
+ compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
+ size_t k) ;
+
+ /*!
+ * \brief Computes the number of occurences of each kmer
+ * appearing in a set of consensus sequences.
+ * To transform a consensus sequence into a kmer,
+ * the major base at each position is considered
+ * only.
+ * \param consensus_sequence a matrix containing the
+ * consensus sequences as a probability matrix. It has
+ * the following dimensions :
+ * 1st the number of consensus sequences
+ * 2nd the length of the consensus sequences
+ * 3rd 4 for A, C, G, T
+ * The summing over the 1st and 2nd dimensions should
+ * give 1s.
+ * \param k the kmer length.
+ * \return a map with the kmer found as keys and their
+ * number of occurences as value.
+ */
+ std::unordered_map<std::string, int>
+ compute_kmer_count(const Matrix3D<double>& consensus_sequence,
+ size_t k) ;
+
+ /*!
+ * \brief Compute the pvalue of a set of kmers given their
+ * number of occurences and their expected number of
+ * occurences.
+ * \param counts the number of occurences of each kmer.
+ * \param expected the expected number of occurences of
+ * each kmer.
+ * \return the pvalue of each kmer.
+ */
+ std::vector<double> compute_pvalues(const std::vector<int>& counts,
+ const std::vector<double>& expected) ;
+
+ /*!
+ * \brief Computes the number of expected occurences of a set
+ * of kmers given the mononucleotide and dinucleotide composition
+ * of the kmers.
+ * \param k the length of the kmers.
+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3)
+ * \param comp1 a matrix with the number of counts of each
+ * base at each position in the kmers. It has the following
+ * dimensions :
+ * 1st dim k
+ * 2nd dim 4 for A, C, G, T
+ * \param comp2 a matrix containing the number of counts of
+ * each pair of dinucleotide in the kmers. It has the
+ * following dimensions :
+ * 1st 4 for A, C, G, T (1st base of the dinucleotide)
+ * 2nd k-1
+ * 3rd 4 for A, C, G, T (2n base of the dinucleotide)
+ * \return the expected number of occurences for each
+ * kmer.
+ */
+ std::vector<double> compute_exp_values(size_t k,
+ const std::vector<std::string>& kmers,
+ const Matrix2D<int>& comp1,
+ const Matrix3D<int>& comp2) ;
+
+ /*!
+ * \brief Computes the number of occurences of each dinucleotide
+ * pair at each position of a given set of kmers.
+ * \param k the length of the kmers.
+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
+ * \param counts the number of occurences of each kmer.
+ * \return a matrix containing the number of counts of
+ * each pair of dinucleotide in the kmers. It has the
+ * following dimensions :
+ * 1st 4 for A, C, G, T (1st base of the dinucleotide)
+ * 2nd k-1
+ * 3rd 4 for A, C, G, T (2n base of the dinucleotide)
+ */
+ Matrix3D<int> compute_dinucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts) ;
+
+ /*!
+ * \brief Computes the number of occurences of each nucleotide
+ * at each position of a given set of kmers.
+ * \param k the length of the kmers.
+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
+ * \param counts the number of occurences of each kmer
+ * \return a matrix with the number of counts of each
+ * base at each position in the kmers. It has the following
+ * dimensions :
+ * 1st dim k
+ * 2nd dim 4 for A, C, G, T
+ */
+ Matrix2D<int> compute_mononucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts) ;
+
+}
+
+#endif // KMER_STATISTICS_HPP
diff --git a/src/Statistics/Statistics.cpp b/src/Statistics/Statistics.cpp
index c26f789..c671237 100644
--- a/src/Statistics/Statistics.cpp
+++ b/src/Statistics/Statistics.cpp
@@ -1,31 +1,45 @@
#include <Statistics.hpp>
#include <cmath> // M_PI, pow(), sqrt(), log(), lgamma()
#include <boost/math/distributions.hpp> // beta_distribution
#include <boost/random.hpp>
double normal_pmf(double x, double mean, double sd)
{ static double pi_2 = 2.*M_PI ;
return ( 1. / ( sd * sqrt(pi_2) )) * exp(-0.5 * pow((x-mean)/sd, 2.0 ) );
}
double poisson_pmf(int x, double lambda)
{ if((x != 0) and (lambda == 0))
{ return 0. ; }
else if((x == 0) and (lambda == 0))
{ return 1. ; }
// if(x + lambda == 0)
// { return 1. ; }
return exp(x * log(lambda) - lgamma(x + 1.0) - lambda);
}
+double poisson_cdf(int x, double lambda, bool lower_tail)
+{ double prob = 1. ;
+ if(lambda != 0)
+ { boost::math::poisson_distribution<> pois_dist(lambda) ;
+ prob = cdf(pois_dist, x) ;
+ }
+ // prob of an observation as big as x
+ if(lower_tail)
+ { return prob ; }
+ // prob of an observation at least as big as x (pvalue)
+ else
+ { return 1 - prob ; }
+}
+
double beta_pmf(double x, double alpha, double beta)
{
boost::math::beta_distribution<> beta_dist(alpha, beta) ;
double y = quantile(beta_dist, x) ;
return y ;
}
diff --git a/src/Statistics/Statistics.hpp b/src/Statistics/Statistics.hpp
index fdc9808..eb0a966 100755
--- a/src/Statistics/Statistics.hpp
+++ b/src/Statistics/Statistics.hpp
@@ -1,291 +1,307 @@
#ifndef STATISTCS_HPP
#define STATISTICS_HPP
#include <vector>
#include <numeric>
#include <cmath> // pow(), sqrt()
#include <assert.h>
#include <iostream>
/*!
* \brief Computes the density of x given a gaussian
* distribution with the given mean and standard
* deviation parameters.
* \param x the value of interest.
* \param mean the gaussian mean parameter value.
* \param sd the gaussian standard deviation value.
* \return the probability of x p(x|N(mean,sd))
*/
double normal_pmf(double x, double mean, double sd) ;
/*!
* \brief Computes the probability of x given a Poisson
* distribution with the given lambda parameter.
* \param x the value of interest.
* \param lambda the lambda parameter.
* \return the probability of the value given the
* distribution.
*/
double poisson_pmf(int x, double lambda) ;
+/*!
+ * \brief Computes the probability of observing a
+ * value samller or equal to x, given a poisson
+ * distribution of parameter lambda.
+ * \param x the value.
+ * \param lambda lambda the lambda parameter.
+ * \param lower_tail whether the lower tail of the
+ * distribution should be returned. If true, the
+ * probability p of observing a value smaller or as
+ * big as x is returned. Otherwised the probability
+ * of observing a value at least as big as x
+ * is returned (1-p).
+ * \return the probability of observing a value
+ * smaller or as big as x.
+ */
+double poisson_cdf(int x, double lambda, bool lower_tail=true) ;
/*!
* \brief Computes the probability of x given a Beta
* distribution with the given alpha and beta parameters.
* \param x the value of interest
* \param alpha the alpha parameter.
* \param beta the beta parameter.
* \return the probability of the value given the
* distribution.
*/
double beta_pmf(double x, double alpha, double beta) ;
/*!
* \brief Computes the weighted mean of a vector of measures <x> given their
* probability <p>. The sum of <p> is expected to be 1, if not it will
* be normalized.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \param from the starting index, if different from -1.
* \param to the ending index (not included), if different from -1.
* \return the mean.
*/
template<class T>
inline double mean(const std::vector<T>& x, const std::vector<double>& p) ;
/*!
* \brief Computes the standard deviation of a vector of measures <x> given
* their probability <p>.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \param unbiased whether the unbiased standard deviation should be returned
* (the square root of the variance is divided by a normalizing factor).
* \return the standard deviation.
*/
template<class T>
inline double sd(const std::vector<T>& x, const std::vector<double>& p, bool unbiased=true) ;
/*! \brief Computes the unbiased standard deviation of a vector of
* measures <x> given their probability <p>.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \return the unbiased standard deviation.
*/
template<class T>
inline double sd_unbiased(const std::vector<T> &x, const std::vector<double> &p) ;
/*! \brief Computes the biased (non-normalized) standard deviation of a vector of
* measures <x> given their probability <p>.
* \param x a vector of measures.
* \param p a vector of probability associated to each element of <x>.
* \return the biased standard deviation.
*/
template<class T>
inline double sd_biased(const std::vector<T> &x, const std::vector<double> &p) ;
/*!
* \brief Computes the pearson correlation coefficient for two given vectors
* using v1[from1-->to1) and v2[from2-->to2). If from1, to1, from2, to2 are equal
* to -1, they are not accounted and the first and last elements considered are
* the beginning and the end of the vectors.
* \param v1 the first vector.
* \param v2 the second vector.
* \param from1 the index of the first value to use in the first vector.
* \param to1 the index after the last value to use in the fist vector.
* \param from2 the index of the first value to use in the second vector.
* \param to2 the index after the last value to use in the second vector.
* \return the correlation coefficient
*/
template<class T>
inline double cor_pearson(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1=-1, int to1=-1,
int from2=-1, int to2=-1) ;
/*!
* \brief Computes the pearson correlation coefficient for two given vectors
* using v1[from1-->to1) and v2(to2<--from2] (the second vector is read backward,
* from the end).
* If from1, to1, from2, to2 are equal to -1, they are not accounted and the first
* and last elements considered are the beginning and the end of the vectors (for
* the last element considered in v2 to be the 1st of the vector (the 0th), to2
* should be set to -1, which is ignored. However this enables the default behaviour
* using the 0th element as the last one).
* \param v1 the first vector.
* \param v2 the second vector.
* \param from1 the index of the first value to use in the first vector.
* \param to1 the index after the last value to use in the fist vector.
* \param from2 the index of the first value to use in the second vector.
* \param to2 the index after the last value to use in the second vector.
* \return the correlation coefficient
*/
template<class T>
inline double cor_pearson_rev(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1=-1, int to1=-1,
int from2=-1, int to2=-1) ;
template<class T>
inline double mean(const std::vector<T>& x, const std::vector<double>& p)
{
assert(x.size() == p.size()) ;
double mean = 0. ;
double total = 0. ;
for(auto i : p)
{ total += i ; }
for(size_t i=0; i<x.size(); i++)
{ mean += x[i] * (p[i]/total) ; }
return mean ;
}
template<class T>
inline double sd(const std::vector<T>& x, const std::vector<double>& p, bool unbiased)
{ if(unbiased)
{ return sd_unbiased(x, p) ; }
else
{ return sd_biased(x,p) ; }
}
template<class T>
inline double sd_unbiased(const std::vector<T>& x, const std::vector<double>& p)
{
assert(x.size() == p.size()) ;
double v1 = 0. ;
double v2 = 0. ;
double m = mean(x,p) ;
double sd = 0. ;
double total = 0. ;
for(auto i : p)
{ total += i ; }
for(size_t i=0; i<x.size(); i++)
{ double p_norm = p[i] / total ;
sd += pow(x[i] - m, 2) * (p_norm) ;
v1 += p_norm ;
v2 += pow(p_norm,2) ;
}
return sqrt(sd / (v1 - (v2/v1))) ;
}
template<class T>
inline double sd_biased(const std::vector<T>& x, const std::vector<double>& p)
{
assert(x.size() == p.size()) ;
double m = mean(x,p) ;
double sd = 0. ;
double total = 0. ;
for(auto i : p)
{ total += i ; }
for(size_t i=0; i<x.size(); i++)
{ double p_norm = p[i] / total ;
sd += pow(x[i] - m, 2) * (p_norm) ;
}
return sqrt(sd) ;
}
template<class T>
inline double cor_pearson(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1, int to1,
int from2, int to2)
{
if(from1 == -1)
{ from1 = 0 ; }
if(to1 == -1)
{ to1 = v1.size() ; }
if(from2 == -1)
{ from2 = 0 ; }
if(to2 == -1)
{ to2 = v2.size() ; }
// 0 <= from < to < v.size()
// if from == to -> no sense, no value : return nan
// if from > to -> no sense, backward : does a segfault
assert(from1 >= 0) ;
assert(from1 < to1) ;
assert(to1 <= v1.size()) ;
assert(from2 >= 0) ;
assert(from2 < to2) ;
assert(to2 <= v2.size()) ;
// the terms of the formula
double sum_v1v2 = 0. ;
double sum_v1 = 0. ;
double sum_v2 = 0. ;
double sum_v1_2 = 0. ;
double sum_v2_2 = 0. ;
// the effective number of values considered
double n = to1 - from1 ;
for(size_t i1=from1, i2=from2; i1<to1; i1++,i2++)
{ sum_v1v2 += v1[i1]*v2[i2] ;
sum_v1 += v1[i1] ;
sum_v2 += v2[i2] ;
sum_v1_2 += pow(v1[i1],2) ;
sum_v2_2 += pow(v2[i2],2) ;
}
return (n*sum_v1v2 - (sum_v1*sum_v2)) /
(sqrt(n*sum_v1_2 - pow(sum_v1, 2)) * sqrt(n*sum_v2_2 - pow(sum_v2, 2)) ) ;
}
template<class T>
inline double cor_pearson_rev(const std::vector<T>& v1,
const std::vector<T>& v2,
int from1, int to1,
int from2, int to2)
{
if(from1 == -1)
{ from1 = 0 ; }
if(to1 == -1)
{ to1 = v1.size() ; }
if(from2 == -1)
{ from2 = v2.size() - 1 ; }
if(to2 == -1)
{ to2 = 0 ; }
// beware -> for v1 [from1, to1)
// for v2 [to2, from2]
// use from1 and to1 for the loop conditions
// 0 <= from < to < v1.size()
// 0 <= to2 < from2 < v2.size() because for v2 [to2, from2]
// if from == to -> no sense, no value : return nan
// if from > to -> no sense, backward : does a segfault
assert(from1 >= 0) ;
assert(from1 < to1) ;
assert(to1 <= v1.size()) ;
assert(to2 >= 0) ;
assert(to2 < from2) ;
assert(from2 < v2.size()) ;
// the terms of the formula
double sum_v1v2 = 0. ;
double sum_v1 = 0. ;
double sum_v2 = 0. ;
double sum_v1_2 = 0. ;
double sum_v2_2 = 0. ;
// the effective number of values considered
double n = to1 - from1 ;
for(int i1=from1, i2=from2; i1<to1; i1++,i2--)
{ sum_v1v2 += v1[i1]*v2[i2] ;
sum_v1 += v1[i1] ;
sum_v2 += v2[i2] ;
sum_v1_2 += pow(v1[i1],2) ;
sum_v2_2 += pow(v2[i2],2) ;
}
return (n*sum_v1v2 - (sum_v1*sum_v2)) /
(sqrt(n*sum_v1_2 - pow(sum_v1, 2)) * sqrt(n*sum_v2_2 - pow(sum_v2, 2)) ) ;
}
#endif // STATISTICS_HPP
diff --git a/src/Utility/dna_utility.cpp b/src/Utility/dna_utility.cpp
index 2902fb1..3db580a 100644
--- a/src/Utility/dna_utility.cpp
+++ b/src/Utility/dna_utility.cpp
@@ -1,178 +1,210 @@
#include<dna_utility.hpp>
-#include <string>
+#include <string> // string, to_string()
#include <unordered_map>
#include <stdexcept> // std::invalid_argument
+
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <seqan/seq_io.h> // seqan::SeqFileIn
int dna::map(char base, bool rev_compl)
{
static bool init = false ;
static std::unordered_map<char,int> hash_map ;
static std::unordered_map<char,int> hash_map_rev ;
if(not init)
{ hash_map['A'] = 0 ;
hash_map['a'] = 0 ;
hash_map['C'] = 1 ;
hash_map['c'] = 1 ;
hash_map['G'] = 2 ;
hash_map['g'] = 2 ;
hash_map['T'] = 3 ;
hash_map['t'] = 3 ;
hash_map['N'] = 4 ;
hash_map['n'] = 4 ;
hash_map_rev['A'] = hash_map['T'] ;
hash_map_rev['a'] = hash_map['t'] ;
hash_map_rev['C'] = hash_map['G'] ;
hash_map_rev['c'] = hash_map['g'] ;
hash_map_rev['G'] = hash_map['C'] ;
hash_map_rev['g'] = hash_map['c'] ;
hash_map_rev['T'] = hash_map['A'] ;
hash_map_rev['t'] = hash_map['a'] ;
hash_map_rev['N'] = hash_map['N'] ;
hash_map_rev['n'] = hash_map['n'] ;
init = true ;
}
try
{ if(rev_compl)
{ return hash_map_rev.at(base) ; }
else
{ return hash_map.at(base) ; }
}
// key could not be found
catch(std::out_of_range& e)
{ char msg[256] ;
sprintf(msg, "Error! Invalid DNA base : %c", base) ;
throw std::invalid_argument(msg) ;
}
}
char dna::map(int base, bool rev_compl)
{
static bool init = false ;
static std::unordered_map<int,char> hash_map ;
static std::unordered_map<int,char> hash_map_rev ;
if(not init)
{ hash_map[0] = 'A' ;
hash_map[1] = 'C' ;
hash_map[2] = 'G' ;
hash_map[3] = 'T' ;
hash_map[4] = 'N' ;
hash_map_rev[4] = hash_map[4] ;
hash_map_rev[3] = hash_map[0] ;
hash_map_rev[2] = hash_map[1] ;
hash_map_rev[1] = hash_map[2] ;
hash_map_rev[0] = hash_map[3] ;
init = true ;
}
try
{ if(rev_compl)
{ return hash_map_rev.at(base) ; }
else
{ return hash_map.at(base) ; }
}
// key could not be found
catch(std::out_of_range& e)
{ char msg[256] ;
sprintf(msg, "Error! Invalid DNA code : %i", base) ;
throw std::invalid_argument(msg) ;
}
}
int dna::char_to_int(char c, bool rev_compl)
{ return dna::map(c, rev_compl) ; }
Matrix2D<int> dna::char_to_int(const Matrix2D<int>& matrix)
{
size_t n_row = matrix.get_nrow() ;
size_t n_col = matrix.get_ncol() ;
Matrix2D<int> data_int(n_row, n_col) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ data_int(i,j) = dna::char_to_int(matrix(i,j)) ; }
}
return data_int ;
}
char dna::int_to_char(int n, bool rev_compl)
{ return dna::map(n, rev_compl) ; }
std::vector<double> dna::base_composition(const Matrix2D<int>& sequences, bool both_strands)
{
double total = 0. ;
std::vector<double> base_comp(4,0.) ;
int base_N = dna::map('N') ;
for(size_t i=0; i<sequences.get_nrow(); i++)
{ for(size_t j=0; j<sequences.get_ncol(); j++)
{ // forward strand
int base = sequences(i,j) ;
// do not account for N's
if(base == base_N)
{ continue ; }
else
{ base_comp[base] += 1;
total += 1. ;
}
// reverse complement strand
if(both_strands)
{ // size_t c_hash_rev = dna::hash(c, true) ;
base_comp[4-base-1] += 1. ;
total += 1. ;
}
}
}
// normalize
for(auto& i : base_comp)
{ i /= total ; }
return base_comp ;
}
std::vector<double> dna::base_composition(const Matrix3D<double>& consensus_sequences, bool both_strands)
{
if(consensus_sequences.get_dim()[2] != 4)
{ char msg[4096] ;
sprintf(msg, "Error! consensus sequences 3rd dimension not equal to 4 (%zu)",
consensus_sequences.get_dim()[2]) ;
throw std::invalid_argument(msg) ;
}
double total = 0. ;
std::vector<double> base_comp(4,0.) ;
for(size_t i=0; i<consensus_sequences.get_dim()[0]; i++)
{ for(size_t j=0; j<consensus_sequences.get_dim()[1]; j++)
{ for(size_t k=0; k<4; k++)
{ // forward strand
{ base_comp[k] += consensus_sequences(i,j,k) ;
total += consensus_sequences(i,j,k) ;
}
// revers strand
if(both_strands)
{ size_t k_comp = 4 - k - 1 ;
base_comp[k_comp] += consensus_sequences(i,j,k) ;
total += consensus_sequences(i,j,k) ;
}
}
}
}
// normalize
for(auto& i : base_comp)
{ i /= total ; }
return base_comp ;
}
+
+std::string dna::consensus_to_kmer(const Matrix3D<double>& consensus_sequences,
+ size_t row,
+ size_t from,
+ size_t to)
+{ // dna letter codes
+ static int n_code = dna::char_to_int('N') ;
+
+ // some dimensions
+ size_t length = to - from ;
+ size_t n_dim3 = consensus_sequences.get_dim()[2] ;
+
+ // kmer
+ std::string kmer(length, n_code) ;
+
+
+ for(size_t i_mat=from, i_kmer=0; i_mat<to; i_mat++, i_kmer++)
+ { // get majority base
+ int max_j = 5 ;
+ double max_n = -1. ;
+ for(size_t j=0; j<n_dim3; j++)
+ { if(consensus_sequences(row,i_mat,j) > max_n)
+ { max_n = consensus_sequences(row,i_mat,j) ;
+ max_j = j ;
+ }
+ }
+ kmer[i_kmer] = std::to_string(max_j)[0] ;
+ }
+
+ return kmer ;
+}
diff --git a/src/Utility/dna_utility.hpp b/src/Utility/dna_utility.hpp
index 03bde59..d89d770 100644
--- a/src/Utility/dna_utility.hpp
+++ b/src/Utility/dna_utility.hpp
@@ -1,106 +1,133 @@
#ifndef DNA_UTILITY_HPP
#define DNA_UTILITY_HPP
+#include <string>
+
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
namespace dna
{
/*!
* \brief Contains the mapping to convert
* DNA characters to integer codes.
* Lower and capital characters are accepted.
* \param base the character of interest.
* \param rev_compl whether the reverse
* complement of the character is of interest.
* \return the corresponding DNA code.
*/
int map(char base, bool rev_compl=false) ;
/*!
* \brief Contains the mapping to convert
* DNA code to characters.
* Only capital characters are returned.
* \param base the code of interest.
* \param rev_compl whether the reverse
* complement of the code is of interest.
* \return the corresponding DNA character.
*/
char map(int base, bool rev_compl=false) ;
/*!
* \brief Converts a DNA character (A, C,
* G, T) to an integer.
* \param c the DNA character of interest.
* \return the character integer code.
* \throw std::invalid_argument if the
* given character is not a valid DNA
* character.
*/
int char_to_int(char c, bool rev_compl= false) ;
/*!
* \brief Converts a DNA character matrix (A, C,
* G, T) to an integer matrix.
* The DNA characters are converted using
* SequenceLayer::char_to_int(char).
* param file_address the address of the file to load.
* \return the corresponding int matrix.
*/
Matrix2D<int> char_to_int(const Matrix2D<int>& matrix) ;
/*!
* \brief Converts an int DNA code to
* a regular DNA character (A, C, G, T).
* This method is the reverse method of
* char_to_int(char).
* \param n the DNA code of interest.
* \return the DNA character.
* \throw std::invalid_argument if the
* given int is not a valid DNA
* code.
*/
char int_to_char(int n, bool rev_compl=false) ;
/*!
* \brief Computes the base composition of a set of
* sequences, in integer format, contained in a matrix.
* \param sequences a matrix containing the sequences
* of interest.
* \param both_strands also accounts for the reverse
* complement of the sequences.
* \throw std::invalid_argument if a non-supported
* character is found in the matrix.
* \return a vector of 4 values corresponding to the
* frequencies of A,C,G and T
* respectively.
*/
std::vector<double> base_composition(const Matrix2D<int>& sequences,
bool both_strands) ;
/*!
* \brief Computes the base composition of a set of
* consensus sequences, contained in a probability
* matrix with the following dimensions:
* 1st the number of sequences
* 2nd the length of the sequences
* 3rd 4 for A,C,G,T
* The sum over the 1st and 2nd dimension should be 1.
* The overall sum of the matrix values should be equal
* to the 1st dimension.
* \param consensus_sequences a matrix containing the
* consensus sequences of interest.
* \param both_strands also accounts for the reverse
* complement of the sequences.
* \throw std::invalid_argument if the 3rd dimension
* of the consensus sequence matrix is not equal to 4.
* \return a vector of 4 values corresponding to the
* frequencies of A,C,G and T
* respectively.
*/
std::vector<double> base_composition(const Matrix3D<double>& consensus_sequences,
bool both_strands) ;
+
+ /*!
+ * \brief Constructs a kmer from a piece of consensus
+ * sequence piece contained in a probability matrix with
+ * the following dimensions:
+ * 1st the number of sequences
+ * 2nd the length of the sequences
+ * 3rd 4 for A,C,G,T
+ * The kmer of a consensus sequence is defined as the
+ * kmer composed of the most important base at each
+ * position.
+ * \param consensus_sequences the matrix containing the
+ * consensus sequence "kmer".
+ * \param row the index of the row containing the
+ * consensus sequence piece.
+ * \param from the starting position of the consensus
+ * sequence piece.
+ * \param to the past last ending position of the
+ * consensus sequence piece.
+ * \return the kmer.
+ */
+ std::string consensus_to_kmer(const Matrix3D<double>& consensus_sequences,
+ size_t row,
+ size_t from,
+ size_t to) ;
}
#endif // DNA_UTILITY_HPP
diff --git a/src/Utility/sorting_utility.hpp b/src/Utility/sorting_utility.hpp
new file mode 100755
index 0000000..8ce0fbe
--- /dev/null
+++ b/src/Utility/sorting_utility.hpp
@@ -0,0 +1,36 @@
+#ifndef SORTING_HPP
+#define SORTING_HPP
+
+#include <vector>
+#include <algorithm>
+
+/*! \brief Given a vector of data, it returns a vector of index
+ * corresponding to the index of each element after data sorting
+ * in ascending (by default) order.
+ * \param data a vector of interest.
+ * \param ascending specifies that the index should be returned in
+ * ascending order
+ * \return the index after sorting.
+ */
+template<class T>
+std::vector<std::size_t> order(const std::vector<T>& data, bool ascending=true)
+{
+ std::vector<std::size_t> index(data.size(), 0) ;
+
+ for(std::size_t i=0; i<index.size(); i++)
+ { index[i] = i ; }
+
+ sort(index.begin(), index.end(),
+ [&](const int& a, const int& b)
+ { return (data[a] < data[b]) ; }
+ ) ;
+
+ if(not ascending)
+ { std::reverse(index.begin(), index.end()) ; }
+
+ return index ;
+}
+
+
+
+#endif // SORTING_HPP
diff --git a/src/main_test.cpp b/src/main_test.cpp
index 7e2a898..a827475 100644
--- a/src/main_test.cpp
+++ b/src/main_test.cpp
@@ -1,453 +1,713 @@
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <cmath>
+#include <unordered_map>
#include <Matrix2D.hpp>
#include <Matrix3D.hpp>
#include <Matrix4D.hpp>
#include <ConsensusSequenceModelComputer.hpp>
#include <EMConsensusSequence.hpp>
#include <SequenceModelComputer.hpp>
#include <EMSequence.hpp>
#include <dna_utility.hpp>
+#include <Statistics.hpp>
+#include <sorting_utility.hpp>
+#include <boost/math/distributions.hpp>
+
+#include <KmersStatistics.hpp>
template<class T>
std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
{ for(const auto& x : v)
{ stream << x << " " ; }
return stream ;
}
+template<class T, class U>
+std::ostream& operator << (std::ostream& stream, const std::unordered_map<T,U>& m)
+{ for(const auto& bucket : m)
+ { stream << bucket.first << " " << bucket.second << std::endl ; }
+ return stream ;
+}
+
Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
const Matrix4D<double>& prob,
size_t class_k)
{
size_t n_seq = seq.get_nrow() ;
size_t l_seq = seq.get_ncol() ;
// size_t n_class = prob.get_dim()[1] ;
size_t n_shift = prob.get_dim()[2] ;
size_t n_flip = prob.get_dim()[3] ;
bool flip = n_flip == 2 ;
size_t l_model = l_seq - n_shift + 1 ;
Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;
// aggregate
for(size_t i=0; i<n_seq; i++)
{ std::cerr << "i " << i << std::endl ;
for(size_t s=0; s<n_shift; s++)
{ //std::cerr << "s " << s << std::endl ;
double tot = 0. ;
// aggregate
for(size_t j=0; j<l_model; j++)
{ // std::cerr << "j " << j << std::endl ;
// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
int base = seq(i,s+j) ;
// N should participate to A,C,G,T equivalently -> same as doing nothing
if(base == dna::char_to_int('N'))
{ continue ; }
int base_rv = 4 - base - 1 ;
// --------- forward ---------
// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
cons_seq(i,j,base) += prob(i,class_k,s,0) ;
tot += prob(i,class_k,s,0) ;
// --------- reverse ---------
if(flip)
{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
tot += prob(i,class_k,s,1) ;
}
}
}
}
// normalize
for(size_t i=0; i<n_seq; i++)
{ for(size_t j=0; j<l_model; j++)
{ double tot = 0. ;
for(size_t k=0; k<4; k++)
{ tot += cons_seq(i,j,k) ; }
for(size_t k=0; k<4; k++)
{ cons_seq(i,j,k) /= tot ; }
}
}
return cons_seq ;
}
Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
Matrix2D<int> iseq(n_row, n_col, 0.) ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
}
return iseq ;
}
Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
{
size_t n_row = seq.get_nrow() ;
size_t n_col = seq.get_ncol() ;
size_t n_dim3 = 4 ; // A, C, G, T
int base_n = dna::char_to_int('N') ;
Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;
double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_col; j++)
{ int base = seq(i,j) ;
// N
if(base == base_n)
{ cseq(i,j,0) = 0.25 ;
cseq(i,j,1) = 0.25 ;
cseq(i,j,2) = 0.25 ;
cseq(i,j,3) = 0.25 ;
}
// A C G T
else
{ cseq(i,j, base) = prob ; }
}
}
return cseq ;
}
double score_consensus_subseq(const Matrix3D<double>& cons_seq,
size_t row,
size_t start,
const Matrix2D<double>& model)
{ size_t ncol = cons_seq.get_dim()[1] ;
// size_t dim1 = cons_seq.get_dim()[0] ;
// size_t dim2 = cons_seq.get_dim()[1] ;
size_t dim3 = cons_seq.get_dim()[2] ;
if(start > ncol - model.get_nrow())
{ char msg[4096] ;
sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
start, ncol - model.get_nrow()) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_nrow() > ncol)
{ char msg[4096] ;
sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
model.get_nrow(), ncol) ;
throw std::invalid_argument(msg) ;
}
else if(model.get_ncol() != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
model.get_ncol()) ;
throw std::invalid_argument(msg) ;
}
else if(dim3 != 4)
{ char msg[4096] ;
sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
dim3) ;
throw std::invalid_argument(msg) ;
}
/*
double ll = 0 ;
for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ for(size_t k=0; k<dim3; k++)
{ std::cerr << ll
<< " *= "
<< "("
<< model(j_mod,k)
<< " * "
<< cons_seq(row,j_seq,k)
<< ")"
<< std::endl ;
ll *= model(j_mod,k) * cons_seq(row,j_seq,k) ;
}
}*/
size_t from = start ;
size_t to = from + model.get_nrow() ; // will score [from, to)
double ll = 0. ;
for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
{ double sum = 0. ;
for(size_t k=0; k<dim3; k++)
{ std::cerr << sum
<< "+= ("
<< cons_seq(row, j_seq, k)
<< " * "
<< model(j_mod, k)
<< ")"
<< std::endl ;
sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
}
std::cerr << sum << std::endl ;
std::cerr << "---------------" << std::endl ;
ll += log(sum) ;
}
return ll ;
}
Matrix2D<double> test_consensus(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
// partition
EMConsensusSequence em(cseq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
ConsensusSequenceModelComputer mc(std::move(cseq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_sequence(const std::string& file,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
bool bckg_class,
const std::string& seed,
size_t n_thread)
{ size_t n_flip = flip + 1 ;
Matrix2D<int> seq(file) ;
// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
// partition
EMSequence em(seq,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
em.classify() ;
// get post prob
Matrix4D<double> prob = em.get_post_prob() ;
// compute models
SequenceModelComputer mc(std::move(seq),
prob,
bckg_class,
n_thread) ;
Matrix2D<double> model = mc.get_model() ;
// compute the class prob
size_t n_row = prob.get_dim()[0] ;
std::vector<double> class_prob(n_class, 0.) ;
double p_tot = 0. ;
for(size_t i=0; i<n_row; i++)
{ for(size_t j=0; j<n_class; j++)
{ for(size_t k=0; k<n_shift; k++)
{ for(size_t l=0; l<n_flip; l++)
{ class_prob[j] += prob(i,j,k,l) ;
p_tot += prob(i,j,k,l) ;
}
}
}
}
for(auto& prob : class_prob)
{ prob /= p_tot ; }
// create a matrix containing the class prob in the 1st
// column and the model in the remaining columns
Matrix2D<double> model_final(model.get_nrow(),
model.get_ncol() + 1) ;
size_t i_class = 0 ;
for(size_t i=0; i<model_final.get_nrow(); i++)
{ if((i != 0) and (i % 4 == 0))
{ i_class++ ; }
model_final(i,0) = class_prob[i_class] ;
}
// fill the remaining with the model parameters
for(size_t i=0; i<model.get_nrow(); i++)
{ for(size_t j=0; j<model.get_ncol(); j++)
{ model_final(i,j+1) = model(i,j) ; }
}
return model_final ;
}
Matrix2D<double> test_aggregate(const std::string& file)
{ Matrix2D<int> seq(file) ;
size_t nrow = seq.get_nrow() ;
size_t ncol = seq.get_ncol() ;
Matrix2D<double> model_final(4, ncol+1, 1e-100) ;
for(size_t i=0; i<4; i++)
{ model_final(i,0) = 1. ; }
std::vector<double> sum(ncol+1, 0.) ;
for(size_t i=0; i<nrow; i++)
{ for(size_t j=0; j<ncol; j++)
{ int base = seq(i,j) ;
if(base == dna::char_to_int('N'))
{ continue ; }
model_final(base, j+1) += 1. ;
sum[j+1] += 1. ;
}
}
for(size_t i=0; i<model_final.get_nrow(); i++)
{ for(size_t j=1; j<model_final.get_ncol()+1; j++)
{ model_final(i,j) /= sum[j] ; }
}
return model_final ;
}
+
+std::unordered_map<std::string, int> get_kmer_count(const Matrix3D<double>& consensus_sequence,
+ size_t length)
+{ std::unordered_map<std::string, int> kmer_count ;
+
+ size_t n_row = consensus_sequence.get_dim()[0] ;
+ size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
+
+ for(size_t i=0; i<n_row; i++)
+ { for(size_t s=0; s<n_shift; s++)
+ { // get kmer
+ std::string kmer = dna::consensus_to_kmer(consensus_sequence,
+ i,
+ s,
+ s+length) ;
+ // insert
+ auto iter = kmer_count.find(kmer) ;
+ if(iter == kmer_count.end())
+ { kmer_count.emplace(kmer, 1) ; }
+ // update
+ else
+ { iter->second += 1 ; }
+ }
+ }
+ return kmer_count ;
+}
+
+Matrix2D<int> compute_mononucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix2D<int> comp(k, 4, 0) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ {
+ std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+
+ for (size_t j=0; j<k ; j++)
+ { int base = int(kmer[j]) - int('0') ;
+ comp(j, base) += count ;
+ }
+ }
+ return(comp);
+}
+
+Matrix3D<int> compute_dinucleotide(size_t k,
+ const std::vector<std::string>& kmers,
+ const std::vector<int>& counts)
+{
+ Matrix3D<int> comp2(4, k-1, 4) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ int count = counts[i] ;
+ for (size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ comp2(base1, j, base2) += count ;
+ }
+ }
+ return(comp2);
+}
+
+std::vector<double> compute_exp_values(size_t k,
+ const std::vector<std::string>& kmers,
+ const Matrix2D<int>& comp1,
+ const Matrix3D<int>& comp2)
+{ std::vector<double> expected(kmers.size()) ;
+
+ for(size_t i=0; i<kmers.size(); i++)
+ { std::string kmer = kmers[i] ;
+ double exp = comp1(0, int(kmer[0]) - int('0')) ;
+
+ for(size_t j=0; j<k-1; j++)
+ { int base1 = int(kmer[j]) - int('0') ;
+ int base2 = int(kmer[j+1]) - int('0') ;
+ exp *= ((double)comp2(base1, j, base2) /
+ (double)comp1(j+1, base2)) ;
+ }
+
+ expected[i] = exp ;
+ }
+ return expected;
+}
+
+std::vector<double> compute_pvalues(const std::vector<int>& counts,
+ const std::vector<double>& expected)
+{
+ std::vector<double> pvalues(counts.size()) ;
+
+ for(size_t i=0; i<counts.size(); i++)
+ {
+ pvalues[i] = poisson_cdf((double)counts[i],
+ expected[i],
+ false) ;
+ }
+ return pvalues ;
+}
+
+std::pair<std::vector<std::string>, std::vector<double>>
+compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
+ size_t k)
+{
+
+ // get kmer count
+ std::unordered_map<std::string, int> kmer_count = get_kmer_count(consensus_sequences,
+ k) ;
+
+ /*
+ consensus_sequences.get_dim() ;
+ std::unordered_map<std::string, int> kmer_count ;
+ // ANN
+ kmer_count["000"] = 1 ;
+ kmer_count["001"] = 2 ;
+ kmer_count["002"] = 3 ;
+ kmer_count["003"] = 4 ;
+ kmer_count["010"] = 5 ;
+ kmer_count["011"] = 6 ;
+ kmer_count["012"] = 7 ;
+ kmer_count["013"] = 8 ;
+ kmer_count["020"] = 9 ;
+ kmer_count["021"] = 10 ;
+ kmer_count["022"] = 11 ;
+ kmer_count["023"] = 12 ;
+ kmer_count["030"] = 13 ;
+ kmer_count["031"] = 14 ;
+ kmer_count["032"] = 15 ;
+ kmer_count["033"] = 16 ;
+ // CNN
+ kmer_count["100"] = 1 ;
+ kmer_count["101"] = 2 ;
+ kmer_count["102"] = 3 ;
+ kmer_count["103"] = 4 ;
+ kmer_count["110"] = 5 ;
+ kmer_count["111"] = 6 ;
+ kmer_count["112"] = 7 ;
+ kmer_count["113"] = 8 ;
+ kmer_count["120"] = 9 ;
+ kmer_count["121"] = 10 ;
+ kmer_count["122"] = 11 ;
+ kmer_count["123"] = 12 ;
+ kmer_count["130"] = 13 ;
+ kmer_count["131"] = 14 ;
+ kmer_count["132"] = 15 ;
+ kmer_count["133"] = 16 ;
+ // GNN
+ kmer_count["200"] = 1 ;
+ kmer_count["201"] = 2 ;
+ kmer_count["202"] = 3 ;
+ kmer_count["203"] = 4 ;
+ kmer_count["210"] = 5 ;
+ kmer_count["211"] = 6 ;
+ kmer_count["212"] = 7 ;
+ kmer_count["213"] = 8 ;
+ kmer_count["220"] = 9 ;
+ kmer_count["221"] = 10 ;
+ kmer_count["222"] = 11 ;
+ kmer_count["223"] = 12 ;
+ kmer_count["230"] = 13 ;
+ kmer_count["231"] = 14 ;
+ kmer_count["232"] = 15 ;
+ kmer_count["233"] = 16 ;
+ // TNN
+ kmer_count["300"] = 1 ;
+ kmer_count["301"] = 2 ;
+ kmer_count["302"] = 3 ;
+ kmer_count["303"] = 4 ;
+ kmer_count["310"] = 5 ;
+ kmer_count["311"] = 6 ;
+ kmer_count["312"] = 7 ;
+ kmer_count["313"] = 8 ;
+ kmer_count["320"] = 9 ;
+ kmer_count["321"] = 10 ;
+ kmer_count["322"] = 11 ;
+ kmer_count["323"] = 12 ;
+ kmer_count["330"] = 13 ;
+ kmer_count["331"] = 14 ;
+ kmer_count["332"] = 15 ;
+ kmer_count["333"] = 16 ;
+ */
+
+ // turn to vectors
+ std::vector<std::string> kmers(kmer_count.size()) ;
+ size_t i=0 ;
+ for(const auto& iter : kmer_count)
+ { kmers[i] = iter.first ;
+ i++ ;
+ }
+ std::sort(kmers.begin(), kmers.end()) ;
+ std::vector<int> counts(kmer_count.size()) ;
+ i=0 ;
+ for(const auto& kmer : kmers)
+ { counts[i] = kmer_count[kmer] ;
+ i++ ;
+ }
+ kmer_count.clear() ;
+
+ // get mononucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix2D<int> comp1 = compute_mononucleotide(k, kmers, counts) ;
+
+ // get dinucleotide composition in kmer
+ // this is a probability matrix modeling the kmer
+ Matrix3D<int> comp2 = compute_dinucleotide(k, kmers, counts) ;
+
+ // get expected number of occurence for each kmer
+ std::vector<double> expected = compute_exp_values(k,
+ kmers,
+ comp1,
+ comp2) ;
+
+ // compute the pvalue associated to each kmer
+ std::vector<double> pvalues = compute_pvalues(counts,
+ expected) ;
+
+ return std::make_pair(kmers, pvalues) ;
+}
+
int main()
{
+ // Matrix3D<double> mat(1,5,4,0.) ;
+ // mat(0,0,0) = 1. ; mat(0,1,1) = 1. ; mat(0,2,2) = 1. ; mat(0,3,3) = 1. ; mat(0,4,2) = 1. ;
+
+
+
+ std::string file = "/local/groux/scATAC-seq/data/"
+ "10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
+ Matrix3D<double> mat = sequence_to_consensus(Matrix2D<int>(file)) ;
+
+
+ size_t k = 12 ;
+ auto kmers_pvalues = kmers::compute_kmer_pvalue(mat, k) ;
+
+ // for(size_t i=0; i<kmers_pvalues.first.size(); i++)
+ // { std::cerr << kmers_pvalues.first[i] << " " << kmers_pvalues.second[i] << std::endl ; }
+
+ std::vector<size_t> index = order(kmers_pvalues.second, true) ;
+
+ for(size_t i=0 ; i<10; i++)
+ { size_t idx = index[i] ;
+ std::cout << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
+ }
+
+ /*
size_t n_class = 2 ;
- size_t n_iter = 50 ;
+ size_t n_iter = 50 ;sequence_to_consensus
size_t n_shift = 1 ;
bool flip = true ;
bool bckg_class = false ;
std::string seed = "" ;
size_t n_thread = 1 ;
std::string file = "/local/groux/scATAC-seq/data/"
"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
+ */
// std::string file = "/local/groux/scATAC-seq/test.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;
/*
Matrix2D<double> model_final = test_consensus(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
*/
+ /*
Matrix2D<double> model_final = test_sequence(file,
n_class,
n_iter,
n_shift,
flip,
bckg_class,
seed,
n_thread) ;
+ */
/*
Matrix2D<double> model_final = test_aggregate(file) ;
*/
- std::cout << model_final << std::endl ;
+ // std::cout << model_final << std::endl ;
/*
double p0 = 0 ;
// double l0 = log(p0) ;
double p1 = 1. - (3.*p0) ;
// double l1 = log(p1) ;
// Matrix3D<double> cseq(1, 4, 4) ;
// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;
Matrix2D<int> seq(1, 4, -1) ;
seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
Matrix3D<double> cseq = sequence_to_consensus(seq) ;
std::cout << cseq << std::endl << std::endl ;
Matrix2D<double> mod(4, 4, -1) ;
mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;
std::cout << mod << std::endl ;
std::cout << score_consensus_subseq(cseq, 0,0,mod)
<< std::endl ;
*/
+
return EXIT_SUCCESS ;
}
Event Timeline
Log In to Comment