No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Feb 22, 16:37

View Options

	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
	similarity index 100%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
	index 5125c78..8f4c839 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh
	@@ -1,5 +1,5 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.R
	Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
	similarity index 100%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
	index a4e36e8..3c42775 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh
	@@ -1,5 +1,5 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.R
	Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
	similarity index 100%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
	index a7385c4..3e642d4 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh
	@@ -1,5 +1,5 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.R
	Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
	similarity index 100%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
	index 723d374..8e8b816 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh
	@@ -1,5 +1,5 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.R
	Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
	similarity index 100%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
	index 52cbdab..661fa5d 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh
	@@ -1,5 +1,5 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.R
	Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
	similarity index 100%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
	index 934f5ea..3bc651a 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh
	@@ -1,5 +1,5 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.R
	Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
	similarity index 85%
	rename from scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
	rename to scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
	index 2ef7199..f42ad19 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
	@@ -1,104 +1,112 @@
	setwd(file.path("/", "local", "groux", "scATAC-seq"))

	# libraries
	library(RColorBrewer)

	# functions
	source(file.path("scripts", "functions.R"))

	# the number of classes searched
	n.classes = c(23)

	+class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1",
	+ "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF",
	+ "NR4A1", "EGR", "GATA", "NFAT", "RUNX")
	+
	# path to the images for the logo
	path.a = file.path("res/A.png")
	path.c = file.path("res/C.png")
	path.g = file.path("res/G.png")
	path.t = file.path("res/T.png")

	-################## sequence patterns around ctcf motifs ##################
	+################## plot architecture around TF motifs ##################

	for(k in n.classes)
	{
	# sequence
	data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
	sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k)))
	model.seq = data$models
	model.prob = data$prob
	data = NULL

	# open chromatin
	model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
	sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
	# nucleosomes
	model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
	sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models

	# plot classes
	col = brewer.pal(3, "Set1")
	- # X11(width=26, height=12)
	+ # X11(width=24, height=12)
	png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6",
	sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
	- units="in", res=720, width=26, height=12)
	+ units="in", res=720, width=24, height=12)
	m = matrix(1:24, nrow=6, ncol=4, byrow=F)
	layout(m)
	# order from most to least probable class
	ord = order(model.prob, decreasing=T)
	ref.open = model.open[ord,, drop=F][,]
	ref.nucl = model.nucl[ord,, drop=F][,]
	ref.seq = model.seq[,,ord, drop=F][,,]
	prob = model.prob[ord]
	class = c(1:nrow(ref.open))[ord]
	+ tf = class.tf[ord]
	for(i in 1:nrow(ref.open))
	{ # plot logo
	plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
	- main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
	+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
	# x-axis
	x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
	x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
	axis(1, at=x.at, labels=x.lab)
	# y-axis is [0,1] for min/max signal
	y.at = seq(0, 2, length.out=2)
	y.lab = c("min", "max")
	axis(2, at=y.at, labels=y.lab)
	# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
	lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
	}
	# inlets with center
	row_n = 1 # row counter
	col_n = 1 # column counter
	row_h = 1/nrow(m) # height of row
	col_w = 1/ncol(m) # width of column
	- row_cor = row_h / 2
	+ row_cor = row_h / 3
	col_cor = col_w / 3
	for(i in 1:nrow(ref.open))
	{ # plot logo center
	left = (col_w*col_n) - col_w
	right = left + col_w
	left = right - col_cor
	bottom = 1 - (row_h*row_n)
	- top = bottom + row_h
	+ top = bottom + (0.9*row_h)
	bottom = top - row_cor

	- par(fig=c(left, right, bottom, top), new=T)
	+ p= par(fig=c(left, right, bottom, top),
	+ mar=c(0,0,0,0),
	+ new=T)
	idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
	plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
	# plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
	lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
	# xaxis
	# x.at = seq(1, length(idx), length.out = 3)
	# x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
	x.at = ceiling(length(idx)/2)
	x.lab = 0
	axis(1, at=x.at, labels=x.lab)
	# yaxis
	axis(2, at=y.at, labels=y.lab)
	row_n = row_n + 1
	if(i %% nrow(m) == 0)
	{ col_n = col_n + 1
	row_n = 1
	}
	+ par(p)
	}
	dev.off()
	}
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
	index f10ec18..0bfd24e 100755
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh
	@@ -1,76 +1,76 @@

	# paths
	## dir
	data_dir_p="data/10xgenomics_PBMC_5k_peaks"
	data_dir="data/10xgenomics_PBMC_5k"
	pwm_dir="data/pwm/jaspar_2018_clustering/"
	hg19_dir="data/genomes"
	results_dir="results/10xgenomics_PBMC_5k_peaks_classification_6"
	## matrix files
	file_mat_open=$data_dir_p/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat'
	file_mat_nucl=$data_dir_p/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat'
	file_mat_seq=$data_dir_p/'peaks_rmsk_sampled_sequences_1kb.mat'
	## file with seeds
	file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt'

	mkdir -p $results_dir
	touch $file_seed

	# EM param
	n_iter='1'
	n_shift='971'
	-n_core=24
	+n_core=32
	## PWM files
	jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat"
	hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat"
	myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat"
	pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat"
	cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat"
	irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat"
	irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat"
	lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat"
	foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat"
	sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat"
	mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat"
	elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat"
	stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat"
	nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat"
	ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat"
	e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat"
	ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat"
	klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat"
	nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat"
	egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat"
	gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat"
	nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat"
	runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat"

	# classify
	for k in 23
	do
	## results files
	file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d'
	file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat'
	file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat'
	file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat'
	seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 \| head -c${1:-15};echo)
	echo "$file_prob $seed" >> $file_seed
	bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
	bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
	bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
	bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3

	# extend models
	file_mod1_ext=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model_extended.mat'
	file_mod2_ext=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat'
	file_mod3_ext=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model_extended.mat'
	file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk_sampled.bed'
	file_fasta=$hg19_dir/'hg19.fasta'
	file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam'
	file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai'
	file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam'
	file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai'
	bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext
	bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext
	bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext
	done
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
	index 1803c05..415a4c7 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh
	@@ -1,4 +1,4 @@

	scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.R
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R
	deleted file mode 100644
	index 0531449..0000000
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test.R
	+++ /dev/null
	@@ -1,180 +0,0 @@
	-setwd(file.path("/", "local", "groux", "scATAC-seq"))
	-
	-# libraries
	-library(RColorBrewer)
	-
	-# functions
	-source(file.path("scripts", "functions.R"))
	-
	-# the number of classes searched
	-n.classes = c(23)
	-
	-# path to the images for the logo
	-path.a = file.path("res/A.png")
	-path.c = file.path("res/C.png")
	-path.g = file.path("res/G.png")
	-path.t = file.path("res/T.png")
	-
	-################## sequence patterns around ctcf motifs ##################
	-
	-for(k in n.classes)
	-{
	- # sequence
	- data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	- sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
	- model.seq = data$models
	- model.prob = data$prob
	- data = NULL
	-
	- # open chromatin
	- model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	- sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
	- # nucleosomes
	- model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	- sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
	-
	- # plot classes
	- col = brewer.pal(3, "Set1")
	- X11(width=26, height=12)
	- # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	- # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
	- # units="in", res=720, width=18, height=12)
	- m = matrix(1:24, nrow=6, ncol=4, byrow=F)
	- layout(m)
	- # order from most to least probable class
	- ord = order(model.prob, decreasing=T)
	- ref.open = model.open[ord,, drop=F][,]
	- ref.nucl = model.nucl[ord,, drop=F][,]
	- ref.seq = model.seq[,,ord, drop=F][,,]
	- prob = model.prob[ord]
	- class = c(1:nrow(ref.open))[ord]
	- for(i in 1:nrow(ref.open))
	- { # plot logo
	- par(mar=c(2,2,2,0))
	- plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
	- main=sprintf("class %d (p=%.2f)", class[i], prob[i]))
	- # x-axis
	- x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
	- x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
	- axis(1, at=x.at, labels=x.lab)
	- # y-axis is [0,1] for min/max signal
	- y.at = seq(0, 2, length.out=2)
	- y.lab = c("min", "max")
	- axis(2, at=y.at, labels=y.lab)
	- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	- lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
	- lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
	- }
	-
	- # inlets with center
	- row_n = 1 # row counter
	- col_n = 1 # column counter
	- row_h = 1/nrow(m) # height of row
	- col_w = 1/ncol(m) # width of column
	- row_cor = row_h / 2
	- col_cor = col_w / 3
	- for(i in 1:nrow(ref.open))
	- { # plot logo center
	- left = (col_w*col_n) - col_w
	- right = left + col_w
	- left = right - col_cor
	- bottom = 1 - (row_h*row_n)
	- top = bottom + row_h
	- bottom = top - row_cor
	-
	- par(fig=c(left, right, bottom, top), new=T)
	- idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
	- plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
	- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	- lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
	- lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
	- # xaxis
	- # x.at = seq(1, length(idx), length.out = 3)
	- # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
	- x.at = ceiling(length(idx)/2)
	- x.lab = 0
	- axis(1, at=x.at, labels=x.lab)
	- # yaxis
	- axis(2, at=y.at, labels=y.lab)
	- row_n = row_n + 1
	- if(i %% nrow(m) == 0)
	- { col_n = col_n + 1
	- row_n = 1
	- }
	- }
	- # dev.off()
	-}
	-
	-
	-# PU.1
	-X11(width=18, height=9)
	- par(mfrow=c(2,1))
	- par(mar=c(2,2,2,0))
	- plot.logo(model.seq[,,4], path.a, path.c, path.g, path.t,
	- main=sprintf("class 4 (p=%.2f)", model.prob[4]))
	- # x-axis
	- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
	- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
	- axis(1, at=x.at, labels=x.lab)
	- # y-axis is [0,1] for min/max signal
	- y.at = seq(0, 2, length.out=2)
	- y.lab = c("min", "max")
	- axis(2, at=y.at, labels=y.lab)
	- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	- lines(2*(model.open[4,] / max(model.open[4,])), lwd=1, col=col[1])
	- lines(2*(model.nucl[4,] / max(model.nucl[4,])), lwd=1, col=col[2])
	-
	- par(mar=c(2,2,2,0))
	- plot.logo(model.seq[,495:535,4], path.a, path.c, path.g, path.t,
	- main=sprintf("class 4 (p=%.2f)", model.prob[4]))
	- lines(2*(model.open[4,495:535] / max(model.open[4,])), lwd=1, col=col[1])
	- lines(2*(model.nucl[4,495:535] / max(model.nucl[4,])), lwd=1, col=col[2])
	-
	-# NFE2
	-X11(width=18, height=9)
	- par(mfrow=c(2,1))
	- par(mar=c(2,2,2,0))
	- plot.logo(model.seq[,,14], path.a, path.c, path.g, path.t,
	- main=sprintf("class 14 (p=%.2f)", model.prob[14]))
	- # x-axis
	- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
	- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
	- axis(1, at=x.at, labels=x.lab)
	- # y-axis is [0,1] for min/max signal
	- y.at = seq(0, 2, length.out=2)
	- y.lab = c("min", "max")
	- axis(2, at=y.at, labels=y.lab)
	- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	- lines(2*(model.open[14,] / max(model.open[14,])), lwd=1, col=col[1])
	- lines(2*(model.nucl[14,] / max(model.nucl[14,])), lwd=1, col=col[2])
	-
	- par(mar=c(2,2,2,0))
	- plot.logo(model.seq[,490:540,14], path.a, path.c, path.g, path.t,
	- main=sprintf("class 14 (p=%.2f)", model.prob[14]))
	- lines(2*(model.open[14,490:540] / max(model.open[14,])), lwd=1, col=col[1])
	- lines(2*(model.nucl[14,490:540] / max(model.nucl[14,])), lwd=1, col=col[2])
	-
	-# CEPBB
	-X11(width=18, height=9)
	- par(mfrow=c(2,1))
	- par(mar=c(2,2,2,0))
	- plot.logo(model.seq[,,5], path.a, path.c, path.g, path.t,
	- main=sprintf("class 5 (p=%.2f)", model.prob[5]))
	- # x-axis
	- x.lab = seq(-(ncol(model.seq)-1)/2, (ncol(model.seq)-1)/2, length.out=3)
	- x.at = seq(1, ncol(model.seq), length.out=length(x.lab))
	- axis(1, at=x.at, labels=x.lab)
	- # y-axis is [0,1] for min/max signal
	- y.at = seq(0, 2, length.out=2)
	- y.lab = c("min", "max")
	- axis(2, at=y.at, labels=y.lab)
	- # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	- lines(2*(model.open[5,] / max(model.open[5,])), lwd=1, col=col[1])
	- lines(2*(model.nucl[5,] / max(model.nucl[5,])), lwd=1, col=col[2])
	-
	- par(mar=c(2,2,2,0))
	- plot.logo(model.seq[,495:535,5], path.a, path.c, path.g, path.t,
	- main=sprintf("class 5 (p=%.2f)", model.prob[5]))
	- lines(2*(model.open[5,495:535] / max(model.open[5,])), lwd=1, col=col[1])
	- lines(2*(model.nucl[5,495:535] / max(model.nucl[5,])), lwd=1, col=col[2])
	-
	\ No newline at end of file
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
	new file mode 100644
	index 0000000..43375ee
	--- /dev/null
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
	@@ -0,0 +1,210 @@
	+setwd(file.path("/", "local", "groux", "scATAC-seq"))
	+
	+# libraries
	+library(RColorBrewer)
	+
	+# functions
	+source(file.path("scripts", "functions.R"))
	+
	+# the number of classes searched
	+n.classes = c(23)
	+
	+class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1",
	+ "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF",
	+ "NR4A1", "EGR", "GATA", "NFAT", "RUNX")
	+
	+# path to the images for the logo
	+path.a = file.path("res/A.png")
	+path.c = file.path("res/C.png")
	+path.g = file.path("res/G.png")
	+path.t = file.path("res/T.png")
	+
	+################## plot architecture around TF motifs ##################
	+
	+for(k in n.classes)
	+{
	+ # sequence
	+ data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
	+ model.seq = data$models
	+ model.prob = data$prob
	+ data = NULL
	+
	+ # open chromatin
	+ model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models
	+ # nucleosomes
	+ model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models
	+
	+ # plot classes
	+ col = brewer.pal(3, "Set1")
	+ # X11(width=24, height=12)
	+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)),
	+ units="in", res=720, width=18, height=12)
	+ m = matrix(1:24, nrow=6, ncol=4, byrow=F)
	+ layout(m)
	+ # order from most to least probable class
	+ ord = order(model.prob, decreasing=T)
	+ ref.open = model.open[ord,, drop=F][,]
	+ ref.nucl = model.nucl[ord,, drop=F][,]
	+ ref.seq = model.seq[,,ord, drop=F][,,]
	+ prob = model.prob[ord]
	+ class = c(1:nrow(ref.open))[ord]
	+ tf = class.tf[ord]
	+ for(i in 1:nrow(ref.open))
	+ { # plot logo
	+ par(mar=c(2,2,2,0))
	+ plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
	+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
	+ # x-axis
	+ x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
	+ x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
	+ axis(1, at=x.at, labels=x.lab)
	+ # y-axis is [0,1] for min/max signal
	+ y.at = seq(0, 2, length.out=2)
	+ y.lab = c("min", "max")
	+ axis(2, at=y.at, labels=y.lab)
	+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	+ lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
	+ lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
	+ }
	+
	+ # inlets with center
	+ row_n = 1 # row counter
	+ col_n = 1 # column counter
	+ row_h = 1/nrow(m) # height of row
	+ col_w = 1/ncol(m) # width of column
	+ row_cor = row_h / 3
	+ col_cor = col_w / 3
	+ for(i in 1:nrow(ref.open))
	+ { # plot logo center
	+ left = (col_w*col_n) - col_w
	+ right = left + col_w
	+ left = right - col_cor
	+ bottom = 1 - (row_h*row_n)
	+ top = bottom + row_h
	+ bottom = top - row_cor
	+
	+ p= par(fig=c(left, right, bottom, top),
	+ mar=c(0,0,0,0),
	+ new=T)
	+ idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
	+ plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
	+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	+ lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
	+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
	+ # xaxis
	+ # x.at = seq(1, length(idx), length.out = 3)
	+ # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
	+ x.at = ceiling(length(idx)/2)
	+ x.lab = 0
	+ axis(1, at=x.at, labels=x.lab)
	+ # yaxis
	+ axis(2, at=y.at, labels=y.lab)
	+ row_n = row_n + 1
	+ if(i %% nrow(m) == 0)
	+ { col_n = col_n + 1
	+ row_n = 1
	+ }
	+ par(p)
	+ }
	+ dev.off()
	+}
	+
	+
	+
	+################## zoom in the center ##################
	+
	+for(k in n.classes)
	+{ idx = 516 + c(-100:+100)
	+ # sequence
	+ data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k)))
	+ model.seq = data$models[,idx,]
	+ model.prob = data$prob
	+ data = NULL
	+
	+ # open chromatin
	+ model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models[,idx]
	+ # nucleosomes
	+ model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models[,idx]
	+
	+ # plot classes
	+ col = brewer.pal(3, "Set1")
	+ # X11(width=24, height=12)
	+ png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7",
	+ sprintf("peaks_rmsk_sampled_sequences_%dclass_2.png", k)),
	+ units="in", res=720, width=18, height=12)
	+ m = matrix(1:24, nrow=6, ncol=4, byrow=F)
	+ layout(m)
	+ # order from most to least probable class
	+ ord = order(model.prob, decreasing=T)
	+ ref.open = model.open[ord,, drop=F][,]
	+ ref.nucl = model.nucl[ord,, drop=F][,]
	+ ref.seq = model.seq[,,ord, drop=F][,,]
	+ prob = model.prob[ord]
	+ class = c(1:nrow(ref.open))[ord]
	+ tf = class.tf[ord]
	+ for(i in 1:nrow(ref.open))
	+ { # plot logo
	+ par(mar=c(2,2,2,0))
	+ plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t,
	+ main=sprintf("%s (p=%.2f)", tf[i], prob[i]))
	+ # x-axis
	+ x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3)
	+ x.at = seq(1, ncol(ref.open), length.out=length(x.lab))
	+ axis(1, at=x.at, labels=x.lab)
	+ # y-axis is [0,1] for min/max signal
	+ y.at = seq(0, 2, length.out=2)
	+ y.lab = c("min", "max")
	+ axis(2, at=y.at, labels=y.lab)
	+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	+ lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1])
	+ lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2])
	+ }
	+
	+ # inlets with center
	+ row_n = 1 # row counter
	+ col_n = 1 # column counter
	+ row_h = 1/nrow(m) # height of row
	+ col_w = 1/ncol(m) # width of column
	+ row_cor = row_h / 3
	+ col_cor = col_w / 3
	+ for(i in 1:nrow(ref.open))
	+ { # plot logo center
	+ left = (col_w*col_n) - col_w
	+ right = left + col_w
	+ left = right - col_cor
	+ bottom = 1 - (row_h*row_n)
	+ top = bottom + row_h
	+ bottom = top - row_cor
	+
	+ p= par(fig=c(left, right, bottom, top),
	+ mar=c(0,0,0,0),
	+ new=T)
	+ idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10)
	+ plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t)
	+ # plot signal (multiplies by 2 because the y-axis goes to 2 bits)
	+ lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1])
	+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2])
	+ # xaxis
	+ # x.at = seq(1, length(idx), length.out = 3)
	+ # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at]
	+ x.at = ceiling(length(idx)/2)
	+ x.lab = 0
	+ axis(1, at=x.at, labels=x.lab)
	+ # yaxis
	+ axis(2, at=y.at, labels=y.lab)
	+ row_n = row_n + 1
	+ if(i %% nrow(m) == 0)
	+ { col_n = col_n + 1
	+ row_n = 1
	+ }
	+ par(p)
	+ }
	+ dev.off()
	+}
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
	index 80b9afd..33f3be9 100755
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh
	@@ -1,76 +1,76 @@

	# paths
	## dir
	data_dir_p="data/10xgenomics_PBMC_5k_peaks"
	data_dir="data/10xgenomics_PBMC_5k"
	pwm_dir="data/pwm/jaspar_2018_clustering/"
	hg19_dir="data/genomes"
	results_dir="results/10xgenomics_PBMC_5k_peaks_classification_7"
	## matrix files
	file_mat_open=$data_dir_p/'peaks_rmsk_openchromatin_1kb_read_atac.mat'
	file_mat_nucl=$data_dir_p/'peaks_rmsk_nucleosomes_1kb_fragment_center.mat'
	file_mat_seq=$data_dir_p/'peaks_rmsk_sequences_1kb.mat'
	## file with seeds
	file_seed=$results_dir'/peaks_rmsk_seed.txt'

	mkdir -p $results_dir
	touch $file_seed

	# EM param
	n_iter='1'
	n_shift='971'
	-n_core=24
	+n_core=32
	## PWM files
	jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat"
	hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat"
	myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat"
	pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat"
	cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat"
	irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat"
	irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat"
	lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat"
	foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat"
	sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat"
	mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat"
	elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat"
	stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat"
	nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat"
	ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat"
	e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat"
	ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat"
	klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat"
	nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat"
	egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat"
	gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat"
	nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat"
	runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat"

	# classify
	for k in 23
	do
	## results files
	file_prob=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_prob.mat4d'
	file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model.mat'
	file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model.mat'
	file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model.mat'
	seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 \| head -c${1:-15};echo)
	echo "$file_prob $seed" >> $file_seed
	bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
	bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
	bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
	bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3

	# extend models
	file_mod1_ext=$results_dir/'peaks_rmsk_openchromatin_1kb_read_atac_'$k'class_model_extended.mat'
	file_mod2_ext=$results_dir/'peaks_rmsk_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat'
	file_mod3_ext=$results_dir/'peaks_rmsk_sequences_1kb_'$k'class_model_extended.mat'
	file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk.bed'
	file_fasta=$hg19_dir/'hg19.fasta'
	file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam'
	file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai'
	file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam'
	file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai'
	bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext
	bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext
	bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext
	done
	diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
	index 2e4fecb..bbbfd09 100644
	--- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
	+++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh
	@@ -1,4 +1,4 @@

	-scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks_sampled.sh
	+scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.sh

	-Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test_sampled.R
	+Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R
	diff --git a/src/Applications/.ProbToModelApplication.cpp.swp b/src/Applications/.ProbToModelApplication.cpp.swp
	new file mode 100644
	index 0000000..ace6708
	Binary files /dev/null and b/src/Applications/.ProbToModelApplication.cpp.swp differ
	diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp
	index dce386b..8a0613f 100644
	--- a/src/Applications/ProbToModelApplication.cpp
	+++ b/src/Applications/ProbToModelApplication.cpp
	@@ -1,320 +1,320 @@
	#include <ProbToModelApplication.hpp>

	#include <boost/program_options.hpp>
	#include <string>
	#include <vector>
	#include <iostream>
	#include <utility> // std::move()
	#include <stdexcept> // std::invalid_argument, std::runtime_error

	#include <ModelComputer.hpp>
	#include <ReadModelComputer.hpp>
	#include <SequenceModelComputer.hpp>
	#include <ConsensusSequenceModelComputer.hpp>
	#include <Matrix2D.hpp>
	#include <Matrix4D.hpp>
	#include <matrix_utility.hpp> // filter_rows()

	namespace po = boost::program_options ;

	typedef std::vector<double> vector_d ;


	ProbToModelApplication::ProbToModelApplication(int argn, char** argv)
	: file_read(""), file_seq(""), file_prob(""), file_filter(""),
	bckg_class(false),
	n_threads(0), runnable(false)
	{ this->parseOptions(argn, argv) ; }

	ProbToModelApplication::~ProbToModelApplication()
	{}

	int ProbToModelApplication::run()
	{ if(this->runnable)
	{
	// load data
	std::string file_data ;
	bool read_data = false ;
	bool seq_data = false ;
	bool consseq_data = false ;
	if(this->file_read != "")
	{ file_data = this->file_read ;
	seq_data = consseq_data = false ; read_data = true ;
	}
	else if(this->file_seq != "")
	{ file_data = this->file_seq ;
	read_data = consseq_data = false ; seq_data = true ;
	}
	else if(this->file_consseq != "")
	{ file_data = this->file_consseq ;
	read_data = seq_data = false ; consseq_data = true ;
	}
	else
	{ std::string msg("Error! Could not determine the type of the data!") ;
	throw std::runtime_error(msg) ;
	}


	// row filter
	std::vector<size_t> filter ;
	if(this->file_filter != "")
	{ // it is a column vector, easier to use the Matrix2D interface
	// to read it rather than coding a function for :)
	filter = Matrix2D<size_t>(this->file_filter).get_data() ;
	std::sort(filter.begin(), filter.end()) ;
	}

	// prob
	Matrix4D<double> prob ; prob.load(this->file_prob) ;

	// get the data model
	ModelComputer* ptr = nullptr ;
	if(read_data)
	{ Matrix2D<int> data(file_data) ;
	// filter out some rows if needed
	if(filter.size())
	{ data = filter_rows(filter, data) ; }
	this->check_dimensions(data, prob) ;
	ptr = new ReadModelComputer(std::move(data),
	prob,
	this->bckg_class,
	this->n_threads) ;
	}
	else if(seq_data)
	{
	Matrix2D<int> data(file_data) ;
	// filter out some rows if needed
	if(filter.size())
	{ data = filter_rows(filter, data) ; }
	this->check_dimensions(data, prob) ;
	ptr = new SequenceModelComputer(std::move(data),
	prob,
	this->bckg_class,
	this->n_threads) ;
	}
	else if(consseq_data)
	{ Matrix3D<double> data ;
	data.load(file_data) ;
	// filter out some rows if needed
	if(filter.size())
	{ data = filter_rows(filter, data) ; }
	this->check_dimensions(data, prob) ;
	ptr = new ConsensusSequenceModelComputer(std::move(data),
	prob,
	this->bckg_class,
	this->n_threads) ;
	}

	Matrix2D<double> model = ptr->get_model() ;
	delete ptr ;
	ptr = nullptr ;

	// compute the class prob
	size_t n_row = prob.get_dim()[0] ;
	size_t n_class = prob.get_dim()[1] ;
	size_t n_shift = prob.get_dim()[2] ;
	size_t n_flip = prob.get_dim()[3] ;

	vector_d class_prob(n_class, 0.) ;
	double p_tot = 0. ;
	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_class; j++)
	{ for(size_t k=0; k<n_shift; k++)
	{ for(size_t l=0; l<n_flip; l++)
	{ class_prob[j] += prob(i,j,k,l) ;
	p_tot += prob(i,j,k,l) ;
	}
	}
	}
	}
	for(auto& prob : class_prob)
	{ prob /= p_tot ; }

	// create a matrix containing the class prob in the 1st
	// column and the model in the remaining columns
	Matrix2D<double> model_final(model.get_nrow(),
	model.get_ncol() + 1) ;
	// 1st column contain the class prob
	if(read_data)
	{ for(size_t i=0; i<model_final.get_nrow(); i++)
	{ model_final(i,0) = class_prob[i] ; }
	}
	- else if(seq_data)
	+ else if(seq_data or consseq_data)
	{ size_t i_class = 0 ;
	for(size_t i=0; i<model_final.get_nrow(); i++)
	{ if((i != 0) and (i % 4 == 0))
	{ i_class++ ; }
	model_final(i,0) = class_prob[i_class] ;
	}
	}
	// fill the remaining with the model parameters
	for(size_t i=0; i<model.get_nrow(); i++)
	{ for(size_t j=0; j<model.get_ncol(); j++)
	{ model_final(i,j+1) = model(i,j) ; }
	}
	std::cout << model_final << std::endl ;
	return 0 ;
	}
	else
	{ return 1 ; }
	}

	void ProbToModelApplication::parseOptions(int argn, char** argv)
	{ // no option to parse
	if(argv == nullptr)
	{ std::string message = "no options to parse!" ;
	throw std::invalid_argument(message) ;
	}

	// help messages
	std::string desc_msg = "\n"
	"ProbToRef reconstructs the class models \n"
	"from the classification results (posterior \n"
	"probabilities computed by ChIPPartitioning \n"
	"and the data).\n\n" ;
	std::string opt_help_msg = "Produces this help message." ;
	std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n "
	"by default 0 (no parallelization)." ;
	std::string opt_read_msg = "The path to the file containing the data, if the data are\n"
	"read counts.";
	std::string opt_seq_msg = "The path to the file containing the data, if the data are\n"
	"sequences.";
	std::string opt_consseq_msg = "The path to the file containing the data, if the data are\n"
	"consensus sequences.";
	std::string opt_prob_msg = "The path to the file containing the posterior \n"
	"probabilities." ;
	std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n"
	"indices of rows to filter out in the data. Once the rows have been\n"
	"filtered out from the data, its rows order should correspond to the\n"
	"probabilities." ;
	std::string opt_bckg_msg = "Whether the last class of the classification (posterior\n"
	"probabilities) is a background class." ;
	// option parser
	boost::program_options::variables_map vm ;
	boost::program_options::options_description desc(desc_msg) ;

	std::string seeding_tmp ;

	desc.add_options()
	("help,h", opt_help_msg.c_str())

	("read,", po::value<std::string>(&(this->file_read)), opt_read_msg.c_str())
	("seq,", po::value<std::string>(&(this->file_seq)), opt_seq_msg.c_str())
	("consseq,", po::value<std::string>(&(this->file_consseq)), opt_seq_msg.c_str())
	("prob,", po::value<std::string>(&(this->file_prob)), opt_prob_msg.c_str())
	("filter", po::value<std::string>(&(this->file_filter)), opt_filter_msg.c_str())

	("bgclass", opt_bckg_msg.c_str())

	("thread", po::value<std::size_t>(&(this->n_threads)), opt_thread_msg.c_str()) ;

	// parse
	try
	{ po::store(po::parse_command_line(argn, argv, desc), vm) ;
	po::notify(vm) ;
	}
	catch(std::invalid_argument& e)
	{ std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
	throw std::invalid_argument(msg) ;
	}
	catch(...)
	{ throw std::invalid_argument("An unknown error occured while parsing the options") ; }

	bool help = vm.count("help") ;

	// checks unproper option settings
	if((this->file_read == "") and
	(this->file_seq == "") and
	(this->file_consseq == "") and
	(not help))
	{ std::string msg("Error! No data file was given (none of --read --seq --consseq)!") ;
	throw std::invalid_argument(msg) ;
	}
	else if((this->file_read != "") and
	(this->file_seq != "") and
	(not help))
	{ std::string msg("Error! --read and --seq are mutually exclusive!") ;
	throw std::invalid_argument(msg) ;
	}
	else if((this->file_read != "") and
	(this->file_consseq != "") and
	(not help))
	{ std::string msg("Error! --read and --consseq are mutually exclusive!") ;
	throw std::invalid_argument(msg) ;
	}
	else if((this->file_seq != "") and
	(this->file_consseq != "") and
	(not help))
	{ std::string msg("Error! --seq and --consseq are mutually exclusive!") ;
	throw std::invalid_argument(msg) ;
	}
	else if(this->file_prob == "" and (not help))
	{ std::string msg("Error! No posterior probabily file was given (--prob)!") ;
	throw std::invalid_argument(msg) ;
	}

	// set background class
	if(vm.count("bgclass"))
	{ this->bckg_class = true ; }

	// help invoked, run() cannot be invoked
	if(help)
	{ std::cout << desc << std::endl ;
	this->runnable = false ;
	return ;
	}
	// everything fine, run() can be called
	else
	{ this->runnable = true ;
	return ;
	}
	}

	void ProbToModelApplication::check_dimensions(const Matrix2D<int>& data,
	const Matrix4D<double>& prob) const
	{ if(data.get_nrow() != prob.get_dim()[0])
	{ char msg[4096] ;
	sprintf(msg, "Error! data and prob matrices have unequal "
	"row numbers (%zu / %zu)!",
	data.get_nrow(), prob.get_dim()[0]) ;
	throw std::runtime_error(msg) ;
	}
	else if(data.get_ncol() < prob.get_dim()[2])
	{ char msg[4096] ;
	sprintf(msg, "Error! too many shift states for the data width!"
	"(%zu shift states and %zu columns in data)!",
	prob.get_dim()[2], data.get_ncol()) ;
	throw std::runtime_error(msg) ;
	}
	}

	void ProbToModelApplication::check_dimensions(const Matrix3D<double>& data,
	const Matrix4D<double>& prob) const
	{ if(data.get_dim()[0] != prob.get_dim()[0])
	{ char msg[4096] ;
	sprintf(msg, "Error! data and prob matrices have unequal "
	"row numbers (%zu / %zu)!",
	data.get_dim()[0], prob.get_dim()[0]) ;
	throw std::runtime_error(msg) ;
	}
	else if(data.get_dim()[1] < prob.get_dim()[2])
	{ char msg[4096] ;
	sprintf(msg, "Error! too many shift states for the data width!"
	"(%zu shift states and %zu data width)!",
	prob.get_dim()[2], data.get_dim()[1]) ;
	throw std::runtime_error(msg) ;
	}
	else if(data.get_dim()[2]!= 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! data 3rd dimension is not 4!"
	"(%zu)!",
	data.get_dim()[2]) ;
	throw std::runtime_error(msg) ;
	}
	}


	int main(int argn, char** argv)
	{ ProbToModelApplication app(argn, argv) ;
	return app.run() ;
	}
	diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
	index 7fe6210..3086e51 100644
	--- a/src/CMakeLists.txt
	+++ b/src/CMakeLists.txt
	@@ -1,157 +1,158 @@

	# compiler options
	add_compile_options(-std=c++14)
	add_compile_options(-O3)
	add_compile_options(-Wall)
	add_compile_options(-Wextra)
	add_compile_options(-Werror)
	add_compile_options(-Wfatal-errors)
	add_compile_options(-pedantic)
	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}")

	add_definitions (${SEQAN_DEFINITIONS})

	# include file location
	include_directories(${Boost_INCLUDE_DIRS})
	include_directories(${SEQAN_INCLUDE_DIRS})
	include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
	include_directories("${scATACseq_SOURCE_DIR}/src/Clustering")
	include_directories("${scATACseq_SOURCE_DIR}/src/Random")
	include_directories("${scATACseq_SOURCE_DIR}/src/Parallel")
	include_directories("${scATACseq_SOURCE_DIR}/src/Statistics")
	include_directories("${scATACseq_SOURCE_DIR}/src/GUI")
	include_directories("${scATACseq_SOURCE_DIR}/src/Applications")
	include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
	include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools")
	include_directories("${scATACseq_SOURCE_DIR}/src/Utility")
	# compile modules into static libraries
	## set output directory
	set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib")
	## build instructions
	add_library(Clustering "Clustering/DataLayer.cpp"
	"Clustering/Data2DLayer.cpp"
	"Clustering/Data3DLayer.cpp"
	"Clustering/ReadLayer.cpp"
	"Clustering/SequenceLayer.cpp"
	"Clustering/ConsensusSequenceLayer.cpp"
	"Clustering/ModelComputer.cpp"
	"Clustering/ReadModelComputer.cpp"
	"Clustering/SequenceModelComputer.cpp"
	"Clustering/ConsensusSequenceModelComputer.cpp"
	"Clustering/EMBase.cpp"
	"Clustering/EMRead.cpp"
	"Clustering/EMSequence.cpp"
	"Clustering/EMConsensusSequence.cpp"
	"Clustering/EMJoint.cpp")

	add_library(Random "Random/Random.cpp"
	"Random/RandomNumberGenerator.cpp")
	add_library(Parallel "Parallel/ThreadPool.cpp")
	-add_library(Statistics "Statistics/Statistics.cpp")
	+add_library(Statistics "Statistics/Statistics.cpp"
	+ "Statistics/KmersStatistics.cpp")
	add_library(GUI "GUI/ConsoleProgressBar.cpp"
	"GUI/Diplayable.cpp"
	"GUI/Updatable.cpp")
	add_library(GenomicTools "GenomicTools/MatrixCreator.cpp"
	"GenomicTools/ReadMatrixCreator.cpp"
	"GenomicTools/CorrelationMatrixCreator.cpp"
	"GenomicTools/ClassReadDataCreator.cpp"
	"GenomicTools/ClassSequenceDataCreator.cpp"
	"GenomicTools/SequenceMatrixCreator.cpp"
	"GenomicTools/GenomeRegion.cpp")
	add_library(Utility "Utility/matrix_utility.cpp"
	"Utility/dna_utility.cpp")

	## resolve dependencies
	target_link_libraries(Utility ${SEQAN_LIBRARIES})
	target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES})
	target_link_libraries(Parallel Threads::Threads)
	target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES})

	# executables
	## a toy for seqan
	set(EXE_MAIN_TEST "main_test")
	add_executable(${EXE_MAIN_TEST} "main_test.cpp")
	target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering)
	set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## a toy for correlation matrix
	set(EXE_MAIN_CORMAT "main_cormat")
	add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp")
	target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random)
	set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an application to read binary matrices and display them in txt
	set(EXE_MAIN_MATBIN2TXT "MatrixBinToTxt")
	add_executable(${EXE_MAIN_MATBIN2TXT} "Applications/MatrixBinToTxtApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_MAIN_MATBIN2TXT} Utility Boost::program_options)
	set_target_properties(${EXE_MAIN_MATBIN2TXT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an application to know the indices of 0-signal rows in integer 2D matrices
	set(EXE_MAIN_WHICHNULLROWS "WhichNullRows")
	add_executable(${EXE_MAIN_WHICHNULLROWS} "Applications/WhichNullRowsApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_MAIN_WHICHNULLROWS} Boost::program_options)
	set_target_properties(${EXE_MAIN_WHICHNULLROWS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an application to create a matrix from BED and a BAM file
	set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator")
	add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options)
	set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an application to create a sequence matrix from BED and a fasta file
	set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator")
	add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options)
	set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an EMRead standalone
	set(EXE_EMREAD "EMRead")
	add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_EMREAD} Clustering Boost::program_options)
	set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an EMSequence standalone
	set(EXE_EMSEQ "EMSequence")
	add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options)
	set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an EMConsensusSequence standalone
	set(EXE_EMCONSSEQ "EMConsensusSequence")
	add_executable(${EXE_EMCONSSEQ} "Applications/EMConsensusSequenceApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_EMCONSSEQ} Clustering Utility Boost::program_options)
	set_target_properties(${EXE_EMCONSSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an EMJoint standalone
	set(EXE_EMJOINT "EMJoint")
	add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options)
	set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an executable to compute data models from the data and the post prob of an EM classification
	set(EXE_PROB2REF "ProbToModel")
	add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options)
	set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	## an executable to extend read models from an EM classification
	set(EXE_READMODELEXTENDER "ReadModelExtender")
	add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options)
	set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")

	## an executable to extend read models from an EM classification
	set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender")
	add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options)
	set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")

	## an executable to extract read data from an EM classification
	set(EXE_CLASSREADDATACREATOR "ClassReadDataCreator")
	add_executable(${EXE_CLASSREADDATACREATOR} "Applications/ClassReadDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_CLASSREADDATACREATOR} Clustering GenomicTools Utility Boost::program_options)
	set_target_properties(${EXE_CLASSREADDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")

	## an executable to extract sequence data from an EM classification
	set(EXE_CLASSSEQDATACREATOR "ClassSequenceDataCreator")
	add_executable(${EXE_CLASSSEQDATACREATOR} "Applications/ClassSequenceDataCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
	target_link_libraries(${EXE_CLASSSEQDATACREATOR} Clustering GenomicTools Utility Boost::program_options)
	set_target_properties(${EXE_CLASSSEQDATACREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")


	## a test suite
	set(EXE_TESTS "unittests")
	add_executable(${EXE_TESTS} "unittests.cpp"
	"Unittests/unittests_matrix.cpp"
	"Unittests/unittests_genomictools.cpp")
	target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools)
	set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
	diff --git a/src/Statistics/KmersStatistics.cpp b/src/Statistics/KmersStatistics.cpp
	new file mode 100644
	index 0000000..58533a8
	--- /dev/null
	+++ b/src/Statistics/KmersStatistics.cpp
	@@ -0,0 +1,230 @@
	+#include <KmersStatistics.hpp>
	+
	+#include <string>
	+#include <vector>
	+#include <unordered_map>
	+
	+#include <Matrix2D.hpp>
	+#include <Matrix3D.hpp>
	+#include <Statistics.hpp> // poisson_cdf()
	+#include <dna_utility.hpp> // dna::consensus_to_kmer()
	+
	+Matrix2D<int> kmers::compute_mononucleotide(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const std::vector<int>& counts)
	+{
	+ Matrix2D<int> comp(k, 4, 0) ;
	+
	+ for(size_t i=0; i<kmers.size(); i++)
	+ {
	+ std::string kmer = kmers[i] ;
	+ int count = counts[i] ;
	+
	+ for (size_t j=0; j<k ; j++)
	+ { int base = int(kmer[j]) - int('0') ;
	+ comp(j, base) += count ;
	+ }
	+ }
	+ return(comp);
	+}
	+
	+Matrix3D<int> kmers::compute_dinucleotide(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const std::vector<int>& counts)
	+{
	+ Matrix3D<int> comp2(4, k-1, 4) ;
	+
	+ for(size_t i=0; i<kmers.size(); i++)
	+ { std::string kmer = kmers[i] ;
	+ int count = counts[i] ;
	+ for (size_t j=0; j<k-1; j++)
	+ { int base1 = int(kmer[j]) - int('0') ;
	+ int base2 = int(kmer[j+1]) - int('0') ;
	+ comp2(base1, j, base2) += count ;
	+ }
	+ }
	+ return(comp2);
	+}
	+
	+std::vector<double> kmers::compute_exp_values(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const Matrix2D<int>& comp1,
	+ const Matrix3D<int>& comp2)
	+{ std::vector<double> expected(kmers.size()) ;
	+
	+ for(size_t i=0; i<kmers.size(); i++)
	+ { std::string kmer = kmers[i] ;
	+ double exp = comp1(0, int(kmer[0]) - int('0')) ;
	+
	+ for(size_t j=0; j<k-1; j++)
	+ { int base1 = int(kmer[j]) - int('0') ;
	+ int base2 = int(kmer[j+1]) - int('0') ;
	+ exp *= ((double)comp2(base1, j, base2) /
	+ (double)comp1(j+1, base2)) ;
	+ }
	+
	+ expected[i] = exp ;
	+ }
	+ return expected;
	+}
	+
	+
	+std::vector<double> kmers::compute_pvalues(const std::vector<int>& counts,
	+ const std::vector<double>& expected)
	+{
	+ std::vector<double> pvalues(counts.size()) ;
	+
	+ for(size_t i=0; i<counts.size(); i++)
	+ {
	+ pvalues[i] = poisson_cdf((double)counts[i],
	+ expected[i],
	+ false) ;
	+ }
	+ return pvalues ;
	+}
	+
	+std::unordered_map<std::string, int>
	+kmers::compute_kmer_count(const Matrix3D<double>& consensus_sequence,
	+ size_t length)
	+{ std::unordered_map<std::string, int> kmer_count ;
	+
	+ size_t n_row = consensus_sequence.get_dim()[0] ;
	+ size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
	+
	+ for(size_t i=0; i<n_row; i++)
	+ { for(size_t s=0; s<n_shift; s++)
	+ { // get kmer
	+ std::string kmer = dna::consensus_to_kmer(consensus_sequence,
	+ i,
	+ s,
	+ s+length) ;
	+ // insert
	+ auto iter = kmer_count.find(kmer) ;
	+ if(iter == kmer_count.end())
	+ { kmer_count.emplace(kmer, 1) ; }
	+ // update
	+ else
	+ { iter->second += 1 ; }
	+ }
	+ }
	+ return kmer_count ;
	+}
	+
	+std::pair<std::vector<std::string>, std::vector<double>>
	+kmers::compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
	+ size_t k)
	+{
	+
	+ // get kmer count
	+ std::unordered_map<std::string, int> kmer_count =
	+ kmers::compute_kmer_count(consensus_sequences, k) ;
	+
	+ /*
	+ consensus_sequences.get_dim() ;
	+ std::unordered_map<std::string, int> kmer_count ;
	+ // ANN
	+ kmer_count["000"] = 1 ;
	+ kmer_count["001"] = 2 ;
	+ kmer_count["002"] = 3 ;
	+ kmer_count["003"] = 4 ;
	+ kmer_count["010"] = 5 ;
	+ kmer_count["011"] = 6 ;
	+ kmer_count["012"] = 7 ;
	+ kmer_count["013"] = 8 ;
	+ kmer_count["020"] = 9 ;
	+ kmer_count["021"] = 10 ;
	+ kmer_count["022"] = 11 ;
	+ kmer_count["023"] = 12 ;
	+ kmer_count["030"] = 13 ;
	+ kmer_count["031"] = 14 ;
	+ kmer_count["032"] = 15 ;
	+ kmer_count["033"] = 16 ;
	+ // CNN
	+ kmer_count["100"] = 1 ;
	+ kmer_count["101"] = 2 ;
	+ kmer_count["102"] = 3 ;
	+ kmer_count["103"] = 4 ;
	+ kmer_count["110"] = 5 ;
	+ kmer_count["111"] = 6 ;
	+ kmer_count["112"] = 7 ;
	+ kmer_count["113"] = 8 ;
	+ kmer_count["120"] = 9 ;
	+ kmer_count["121"] = 10 ;
	+ kmer_count["122"] = 11 ;
	+ kmer_count["123"] = 12 ;
	+ kmer_count["130"] = 13 ;
	+ kmer_count["131"] = 14 ;
	+ kmer_count["132"] = 15 ;
	+ kmer_count["133"] = 16 ;
	+ // GNN
	+ kmer_count["200"] = 1 ;
	+ kmer_count["201"] = 2 ;
	+ kmer_count["202"] = 3 ;
	+ kmer_count["203"] = 4 ;
	+ kmer_count["210"] = 5 ;
	+ kmer_count["211"] = 6 ;
	+ kmer_count["212"] = 7 ;
	+ kmer_count["213"] = 8 ;
	+ kmer_count["220"] = 9 ;
	+ kmer_count["221"] = 10 ;
	+ kmer_count["222"] = 11 ;
	+ kmer_count["223"] = 12 ;
	+ kmer_count["230"] = 13 ;
	+ kmer_count["231"] = 14 ;
	+ kmer_count["232"] = 15 ;
	+ kmer_count["233"] = 16 ;
	+ // TNN
	+ kmer_count["300"] = 1 ;
	+ kmer_count["301"] = 2 ;
	+ kmer_count["302"] = 3 ;
	+ kmer_count["303"] = 4 ;
	+ kmer_count["310"] = 5 ;
	+ kmer_count["311"] = 6 ;
	+ kmer_count["312"] = 7 ;
	+ kmer_count["313"] = 8 ;
	+ kmer_count["320"] = 9 ;
	+ kmer_count["321"] = 10 ;
	+ kmer_count["322"] = 11 ;
	+ kmer_count["323"] = 12 ;
	+ kmer_count["330"] = 13 ;
	+ kmer_count["331"] = 14 ;
	+ kmer_count["332"] = 15 ;
	+ kmer_count["333"] = 16 ;
	+ */
	+
	+ // turn to vectors
	+ std::vector<std::string> kmers(kmer_count.size()) ;
	+ size_t i=0 ;
	+ for(const auto& iter : kmer_count)
	+ { kmers[i] = iter.first ;
	+ i++ ;
	+ }
	+ std::sort(kmers.begin(), kmers.end()) ;
	+ std::vector<int> counts(kmer_count.size()) ;
	+ i=0 ;
	+ for(const auto& kmer : kmers)
	+ { counts[i] = kmer_count[kmer] ;
	+ i++ ;
	+ }
	+ kmer_count.clear() ;
	+
	+ // get mononucleotide composition in kmer
	+ // this is a probability matrix modeling the kmer
	+ Matrix2D<int> comp1 = kmers::compute_mononucleotide(k, kmers, counts) ;
	+
	+ // get dinucleotide composition in kmer
	+ // this is a probability matrix modeling the kmer
	+ Matrix3D<int> comp2 = kmers::compute_dinucleotide(k, kmers, counts) ;
	+
	+ // get expected number of occurence for each kmer
	+ std::vector<double> expected = kmers::compute_exp_values(k,
	+ kmers,
	+ comp1,
	+ comp2) ;
	+
	+ // compute the pvalue associated to each kmer
	+ std::vector<double> pvalues = kmers::compute_pvalues(counts,
	+ expected) ;
	+
	+ return std::make_pair(kmers, pvalues) ;
	+}
	diff --git a/src/Statistics/KmersStatistics.hpp b/src/Statistics/KmersStatistics.hpp
	new file mode 100644
	index 0000000..fcc4f60
	--- /dev/null
	+++ b/src/Statistics/KmersStatistics.hpp
	@@ -0,0 +1,133 @@
	+#ifndef KMER_STATISTICS_HPP
	+#define KMER_STATISTICS_HPP
	+
	+#include <string>
	+#include <vector>
	+#include <unordered_map>
	+
	+#include <Matrix2D.hpp>
	+#include <Matrix3D.hpp>
	+
	+
	+namespace kmers {
	+
	+ /*!
	+ * \brief Creates a table of all kmer found in
	+ * a set of consensus sequences and return an
	+ * associated pvalue for each one of them.
	+ * To transform a consensus sequence into a kmer,
	+ * the major base at each position is considered
	+ * only.
	+ * The pvalue is computed as 1 - p(count, exp)
	+ * where p is the CDF of a Poisson distribution
	+ * with <count> being the number of time a given
	+ * kmer is observed in the data and <exp> its
	+ * expected number of occurences. The expected
	+ * number of occurences is computed from the
	+ * observed di-nucleotide occurences in the given
	+ * data.
	+ * \param consensus_sequences
	+ * \param k the kmer length.
	+ * \return a pair of vectors containing the kmers
	+ * in integer format (A:0, C:1, G:2, T:3) and
	+ * their number of occurences (in the same order).
	+ */
	+ std::pair<std::vector<std::string>, std::vector<double>>
	+ compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
	+ size_t k) ;
	+
	+ /*!
	+ * \brief Computes the number of occurences of each kmer
	+ * appearing in a set of consensus sequences.
	+ * To transform a consensus sequence into a kmer,
	+ * the major base at each position is considered
	+ * only.
	+ * \param consensus_sequence a matrix containing the
	+ * consensus sequences as a probability matrix. It has
	+ * the following dimensions :
	+ * 1st the number of consensus sequences
	+ * 2nd the length of the consensus sequences
	+ * 3rd 4 for A, C, G, T
	+ * The summing over the 1st and 2nd dimensions should
	+ * give 1s.
	+ * \param k the kmer length.
	+ * \return a map with the kmer found as keys and their
	+ * number of occurences as value.
	+ */
	+ std::unordered_map<std::string, int>
	+ compute_kmer_count(const Matrix3D<double>& consensus_sequence,
	+ size_t k) ;
	+
	+ /*!
	+ * \brief Compute the pvalue of a set of kmers given their
	+ * number of occurences and their expected number of
	+ * occurences.
	+ * \param counts the number of occurences of each kmer.
	+ * \param expected the expected number of occurences of
	+ * each kmer.
	+ * \return the pvalue of each kmer.
	+ */
	+ std::vector<double> compute_pvalues(const std::vector<int>& counts,
	+ const std::vector<double>& expected) ;
	+
	+ /*!
	+ * \brief Computes the number of expected occurences of a set
	+ * of kmers given the mononucleotide and dinucleotide composition
	+ * of the kmers.
	+ * \param k the length of the kmers.
	+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3)
	+ * \param comp1 a matrix with the number of counts of each
	+ * base at each position in the kmers. It has the following
	+ * dimensions :
	+ * 1st dim k
	+ * 2nd dim 4 for A, C, G, T
	+ * \param comp2 a matrix containing the number of counts of
	+ * each pair of dinucleotide in the kmers. It has the
	+ * following dimensions :
	+ * 1st 4 for A, C, G, T (1st base of the dinucleotide)
	+ * 2nd k-1
	+ * 3rd 4 for A, C, G, T (2n base of the dinucleotide)
	+ * \return the expected number of occurences for each
	+ * kmer.
	+ */
	+ std::vector<double> compute_exp_values(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const Matrix2D<int>& comp1,
	+ const Matrix3D<int>& comp2) ;
	+
	+ /*!
	+ * \brief Computes the number of occurences of each dinucleotide
	+ * pair at each position of a given set of kmers.
	+ * \param k the length of the kmers.
	+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
	+ * \param counts the number of occurences of each kmer.
	+ * \return a matrix containing the number of counts of
	+ * each pair of dinucleotide in the kmers. It has the
	+ * following dimensions :
	+ * 1st 4 for A, C, G, T (1st base of the dinucleotide)
	+ * 2nd k-1
	+ * 3rd 4 for A, C, G, T (2n base of the dinucleotide)
	+ */
	+ Matrix3D<int> compute_dinucleotide(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const std::vector<int>& counts) ;
	+
	+ /*!
	+ * \brief Computes the number of occurences of each nucleotide
	+ * at each position of a given set of kmers.
	+ * \param k the length of the kmers.
	+ * \param kmers the kmers in integer format (A:0, C:1, G:2, T:3).
	+ * \param counts the number of occurences of each kmer
	+ * \return a matrix with the number of counts of each
	+ * base at each position in the kmers. It has the following
	+ * dimensions :
	+ * 1st dim k
	+ * 2nd dim 4 for A, C, G, T
	+ */
	+ Matrix2D<int> compute_mononucleotide(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const std::vector<int>& counts) ;
	+
	+}
	+
	+#endif // KMER_STATISTICS_HPP
	diff --git a/src/Statistics/Statistics.cpp b/src/Statistics/Statistics.cpp
	index c26f789..c671237 100644
	--- a/src/Statistics/Statistics.cpp
	+++ b/src/Statistics/Statistics.cpp
	@@ -1,31 +1,45 @@
	#include <Statistics.hpp>
	#include <cmath> // M_PI, pow(), sqrt(), log(), lgamma()
	#include <boost/math/distributions.hpp> // beta_distribution
	#include <boost/random.hpp>


	double normal_pmf(double x, double mean, double sd)
	{ static double pi_2 = 2.*M_PI ;
	return ( 1. / ( sd * sqrt(pi_2) )) * exp(-0.5 * pow((x-mean)/sd, 2.0 ) );
	}

	double poisson_pmf(int x, double lambda)
	{ if((x != 0) and (lambda == 0))
	{ return 0. ; }
	else if((x == 0) and (lambda == 0))
	{ return 1. ; }

	// if(x + lambda == 0)
	// { return 1. ; }
	return exp(x * log(lambda) - lgamma(x + 1.0) - lambda);
	}

	+double poisson_cdf(int x, double lambda, bool lower_tail)
	+{ double prob = 1. ;
	+ if(lambda != 0)
	+ { boost::math::poisson_distribution<> pois_dist(lambda) ;
	+ prob = cdf(pois_dist, x) ;
	+ }
	+ // prob of an observation as big as x
	+ if(lower_tail)
	+ { return prob ; }
	+ // prob of an observation at least as big as x (pvalue)
	+ else
	+ { return 1 - prob ; }
	+}
	+
	double beta_pmf(double x, double alpha, double beta)
	{
	boost::math::beta_distribution<> beta_dist(alpha, beta) ;
	double y = quantile(beta_dist, x) ;

	return y ;
	}


	diff --git a/src/Statistics/Statistics.hpp b/src/Statistics/Statistics.hpp
	index fdc9808..eb0a966 100755
	--- a/src/Statistics/Statistics.hpp
	+++ b/src/Statistics/Statistics.hpp
	@@ -1,291 +1,307 @@
	#ifndef STATISTCS_HPP
	#define STATISTICS_HPP

	#include <vector>
	#include <numeric>
	#include <cmath> // pow(), sqrt()
	#include <assert.h>
	#include <iostream>

	/*!
	* \brief Computes the density of x given a gaussian
	* distribution with the given mean and standard
	* deviation parameters.
	* \param x the value of interest.
	* \param mean the gaussian mean parameter value.
	* \param sd the gaussian standard deviation value.
	* \return the probability of x p(x\|N(mean,sd))
	*/
	double normal_pmf(double x, double mean, double sd) ;

	/*!
	* \brief Computes the probability of x given a Poisson
	* distribution with the given lambda parameter.
	* \param x the value of interest.
	* \param lambda the lambda parameter.
	* \return the probability of the value given the
	* distribution.
	*/
	double poisson_pmf(int x, double lambda) ;

	+/*!
	+ * \brief Computes the probability of observing a
	+ * value samller or equal to x, given a poisson
	+ * distribution of parameter lambda.
	+ * \param x the value.
	+ * \param lambda lambda the lambda parameter.
	+ * \param lower_tail whether the lower tail of the
	+ * distribution should be returned. If true, the
	+ * probability p of observing a value smaller or as
	+ * big as x is returned. Otherwised the probability
	+ * of observing a value at least as big as x
	+ * is returned (1-p).
	+ * \return the probability of observing a value
	+ * smaller or as big as x.
	+ */
	+double poisson_cdf(int x, double lambda, bool lower_tail=true) ;

	/*!
	* \brief Computes the probability of x given a Beta
	* distribution with the given alpha and beta parameters.
	* \param x the value of interest
	* \param alpha the alpha parameter.
	* \param beta the beta parameter.
	* \return the probability of the value given the
	* distribution.
	*/
	double beta_pmf(double x, double alpha, double beta) ;


	/*!
	* \brief Computes the weighted mean of a vector of measures <x> given their
	* probability <p>. The sum of <p> is expected to be 1, if not it will
	* be normalized.
	* \param x a vector of measures.
	* \param p a vector of probability associated to each element of <x>.
	* \param from the starting index, if different from -1.
	* \param to the ending index (not included), if different from -1.
	* \return the mean.
	*/
	template<class T>
	inline double mean(const std::vector<T>& x, const std::vector<double>& p) ;

	/*!
	* \brief Computes the standard deviation of a vector of measures <x> given
	* their probability <p>.
	* \param x a vector of measures.
	* \param p a vector of probability associated to each element of <x>.
	* \param unbiased whether the unbiased standard deviation should be returned
	* (the square root of the variance is divided by a normalizing factor).
	* \return the standard deviation.
	*/
	template<class T>
	inline double sd(const std::vector<T>& x, const std::vector<double>& p, bool unbiased=true) ;


	/*! \brief Computes the unbiased standard deviation of a vector of
	* measures <x> given their probability <p>.
	* \param x a vector of measures.
	* \param p a vector of probability associated to each element of <x>.
	* \return the unbiased standard deviation.
	*/
	template<class T>
	inline double sd_unbiased(const std::vector<T> &x, const std::vector<double> &p) ;

	/*! \brief Computes the biased (non-normalized) standard deviation of a vector of
	* measures <x> given their probability <p>.
	* \param x a vector of measures.
	* \param p a vector of probability associated to each element of <x>.
	* \return the biased standard deviation.
	*/
	template<class T>
	inline double sd_biased(const std::vector<T> &x, const std::vector<double> &p) ;


	/*!
	* \brief Computes the pearson correlation coefficient for two given vectors
	* using v1[from1-->to1) and v2[from2-->to2). If from1, to1, from2, to2 are equal
	* to -1, they are not accounted and the first and last elements considered are
	* the beginning and the end of the vectors.
	* \param v1 the first vector.
	* \param v2 the second vector.
	* \param from1 the index of the first value to use in the first vector.
	* \param to1 the index after the last value to use in the fist vector.
	* \param from2 the index of the first value to use in the second vector.
	* \param to2 the index after the last value to use in the second vector.
	* \return the correlation coefficient
	*/
	template<class T>
	inline double cor_pearson(const std::vector<T>& v1,
	const std::vector<T>& v2,
	int from1=-1, int to1=-1,
	int from2=-1, int to2=-1) ;

	/*!
	* \brief Computes the pearson correlation coefficient for two given vectors
	* using v1[from1-->to1) and v2(to2<--from2] (the second vector is read backward,
	* from the end).
	* If from1, to1, from2, to2 are equal to -1, they are not accounted and the first
	* and last elements considered are the beginning and the end of the vectors (for
	* the last element considered in v2 to be the 1st of the vector (the 0th), to2
	* should be set to -1, which is ignored. However this enables the default behaviour
	* using the 0th element as the last one).
	* \param v1 the first vector.
	* \param v2 the second vector.
	* \param from1 the index of the first value to use in the first vector.
	* \param to1 the index after the last value to use in the fist vector.
	* \param from2 the index of the first value to use in the second vector.
	* \param to2 the index after the last value to use in the second vector.
	* \return the correlation coefficient
	*/
	template<class T>
	inline double cor_pearson_rev(const std::vector<T>& v1,
	const std::vector<T>& v2,
	int from1=-1, int to1=-1,
	int from2=-1, int to2=-1) ;

	template<class T>
	inline double mean(const std::vector<T>& x, const std::vector<double>& p)
	{
	assert(x.size() == p.size()) ;

	double mean = 0. ;
	double total = 0. ;
	for(auto i : p)
	{ total += i ; }
	for(size_t i=0; i<x.size(); i++)
	{ mean += x[i] * (p[i]/total) ; }
	return mean ;
	}

	template<class T>
	inline double sd(const std::vector<T>& x, const std::vector<double>& p, bool unbiased)
	{ if(unbiased)
	{ return sd_unbiased(x, p) ; }
	else
	{ return sd_biased(x,p) ; }
	}

	template<class T>
	inline double sd_unbiased(const std::vector<T>& x, const std::vector<double>& p)
	{
	assert(x.size() == p.size()) ;

	double v1 = 0. ;
	double v2 = 0. ;
	double m = mean(x,p) ;
	double sd = 0. ;
	double total = 0. ;
	for(auto i : p)
	{ total += i ; }
	for(size_t i=0; i<x.size(); i++)
	{ double p_norm = p[i] / total ;
	sd += pow(x[i] - m, 2) * (p_norm) ;
	v1 += p_norm ;
	v2 += pow(p_norm,2) ;
	}
	return sqrt(sd / (v1 - (v2/v1))) ;
	}

	template<class T>
	inline double sd_biased(const std::vector<T>& x, const std::vector<double>& p)
	{
	assert(x.size() == p.size()) ;

	double m = mean(x,p) ;
	double sd = 0. ;
	double total = 0. ;
	for(auto i : p)
	{ total += i ; }
	for(size_t i=0; i<x.size(); i++)
	{ double p_norm = p[i] / total ;
	sd += pow(x[i] - m, 2) * (p_norm) ;
	}
	return sqrt(sd) ;
	}

	template<class T>
	inline double cor_pearson(const std::vector<T>& v1,
	const std::vector<T>& v2,
	int from1, int to1,
	int from2, int to2)
	{
	if(from1 == -1)
	{ from1 = 0 ; }
	if(to1 == -1)
	{ to1 = v1.size() ; }
	if(from2 == -1)
	{ from2 = 0 ; }
	if(to2 == -1)
	{ to2 = v2.size() ; }

	// 0 <= from < to < v.size()
	// if from == to -> no sense, no value : return nan
	// if from > to -> no sense, backward : does a segfault
	assert(from1 >= 0) ;
	assert(from1 < to1) ;
	assert(to1 <= v1.size()) ;
	assert(from2 >= 0) ;
	assert(from2 < to2) ;
	assert(to2 <= v2.size()) ;

	// the terms of the formula
	double sum_v1v2 = 0. ;
	double sum_v1 = 0. ;
	double sum_v2 = 0. ;
	double sum_v1_2 = 0. ;
	double sum_v2_2 = 0. ;
	// the effective number of values considered
	double n = to1 - from1 ;

	for(size_t i1=from1, i2=from2; i1<to1; i1++,i2++)
	{ sum_v1v2 += v1[i1]*v2[i2] ;
	sum_v1 += v1[i1] ;
	sum_v2 += v2[i2] ;
	sum_v1_2 += pow(v1[i1],2) ;
	sum_v2_2 += pow(v2[i2],2) ;

	}

	return (nsum_v1v2 - (sum_v1sum_v2)) /
	(sqrt(nsum_v1_2 - pow(sum_v1, 2)) sqrt(n*sum_v2_2 - pow(sum_v2, 2)) ) ;
	}


	template<class T>
	inline double cor_pearson_rev(const std::vector<T>& v1,
	const std::vector<T>& v2,
	int from1, int to1,
	int from2, int to2)
	{
	if(from1 == -1)
	{ from1 = 0 ; }
	if(to1 == -1)
	{ to1 = v1.size() ; }
	if(from2 == -1)
	{ from2 = v2.size() - 1 ; }
	if(to2 == -1)
	{ to2 = 0 ; }

	// beware -> for v1 [from1, to1)
	// for v2 [to2, from2]
	// use from1 and to1 for the loop conditions

	// 0 <= from < to < v1.size()
	// 0 <= to2 < from2 < v2.size() because for v2 [to2, from2]
	// if from == to -> no sense, no value : return nan
	// if from > to -> no sense, backward : does a segfault
	assert(from1 >= 0) ;
	assert(from1 < to1) ;
	assert(to1 <= v1.size()) ;
	assert(to2 >= 0) ;
	assert(to2 < from2) ;
	assert(from2 < v2.size()) ;

	// the terms of the formula
	double sum_v1v2 = 0. ;
	double sum_v1 = 0. ;
	double sum_v2 = 0. ;
	double sum_v1_2 = 0. ;
	double sum_v2_2 = 0. ;
	// the effective number of values considered
	double n = to1 - from1 ;

	for(int i1=from1, i2=from2; i1<to1; i1++,i2--)
	{ sum_v1v2 += v1[i1]*v2[i2] ;
	sum_v1 += v1[i1] ;
	sum_v2 += v2[i2] ;
	sum_v1_2 += pow(v1[i1],2) ;
	sum_v2_2 += pow(v2[i2],2) ;
	}

	return (nsum_v1v2 - (sum_v1sum_v2)) /
	(sqrt(nsum_v1_2 - pow(sum_v1, 2)) sqrt(n*sum_v2_2 - pow(sum_v2, 2)) ) ;
	}


	#endif // STATISTICS_HPP
	diff --git a/src/Utility/dna_utility.cpp b/src/Utility/dna_utility.cpp
	index 2902fb1..3db580a 100644
	--- a/src/Utility/dna_utility.cpp
	+++ b/src/Utility/dna_utility.cpp
	@@ -1,178 +1,210 @@
	#include<dna_utility.hpp>

	-#include <string>
	+#include <string> // string, to_string()
	#include <unordered_map>
	#include <stdexcept> // std::invalid_argument

	+
	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <seqan/seq_io.h> // seqan::SeqFileIn


	int dna::map(char base, bool rev_compl)
	{
	static bool init = false ;
	static std::unordered_map<char,int> hash_map ;
	static std::unordered_map<char,int> hash_map_rev ;
	if(not init)
	{ hash_map['A'] = 0 ;
	hash_map['a'] = 0 ;
	hash_map['C'] = 1 ;
	hash_map['c'] = 1 ;
	hash_map['G'] = 2 ;
	hash_map['g'] = 2 ;
	hash_map['T'] = 3 ;
	hash_map['t'] = 3 ;
	hash_map['N'] = 4 ;
	hash_map['n'] = 4 ;

	hash_map_rev['A'] = hash_map['T'] ;
	hash_map_rev['a'] = hash_map['t'] ;
	hash_map_rev['C'] = hash_map['G'] ;
	hash_map_rev['c'] = hash_map['g'] ;
	hash_map_rev['G'] = hash_map['C'] ;
	hash_map_rev['g'] = hash_map['c'] ;
	hash_map_rev['T'] = hash_map['A'] ;
	hash_map_rev['t'] = hash_map['a'] ;
	hash_map_rev['N'] = hash_map['N'] ;
	hash_map_rev['n'] = hash_map['n'] ;

	init = true ;
	}

	try
	{ if(rev_compl)
	{ return hash_map_rev.at(base) ; }
	else
	{ return hash_map.at(base) ; }
	}
	// key could not be found
	catch(std::out_of_range& e)
	{ char msg[256] ;
	sprintf(msg, "Error! Invalid DNA base : %c", base) ;
	throw std::invalid_argument(msg) ;
	}
	}

	char dna::map(int base, bool rev_compl)
	{
	static bool init = false ;
	static std::unordered_map<int,char> hash_map ;
	static std::unordered_map<int,char> hash_map_rev ;
	if(not init)
	{ hash_map[0] = 'A' ;
	hash_map[1] = 'C' ;
	hash_map[2] = 'G' ;
	hash_map[3] = 'T' ;
	hash_map[4] = 'N' ;

	hash_map_rev[4] = hash_map[4] ;
	hash_map_rev[3] = hash_map[0] ;
	hash_map_rev[2] = hash_map[1] ;
	hash_map_rev[1] = hash_map[2] ;
	hash_map_rev[0] = hash_map[3] ;

	init = true ;
	}

	try
	{ if(rev_compl)
	{ return hash_map_rev.at(base) ; }
	else
	{ return hash_map.at(base) ; }
	}
	// key could not be found
	catch(std::out_of_range& e)
	{ char msg[256] ;
	sprintf(msg, "Error! Invalid DNA code : %i", base) ;
	throw std::invalid_argument(msg) ;
	}
	}

	int dna::char_to_int(char c, bool rev_compl)
	{ return dna::map(c, rev_compl) ; }

	Matrix2D<int> dna::char_to_int(const Matrix2D<int>& matrix)
	{
	size_t n_row = matrix.get_nrow() ;
	size_t n_col = matrix.get_ncol() ;

	Matrix2D<int> data_int(n_row, n_col) ;
	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_col; j++)
	{ data_int(i,j) = dna::char_to_int(matrix(i,j)) ; }
	}
	return data_int ;
	}

	char dna::int_to_char(int n, bool rev_compl)
	{ return dna::map(n, rev_compl) ; }

	std::vector<double> dna::base_composition(const Matrix2D<int>& sequences, bool both_strands)
	{
	double total = 0. ;
	std::vector<double> base_comp(4,0.) ;

	int base_N = dna::map('N') ;

	for(size_t i=0; i<sequences.get_nrow(); i++)
	{ for(size_t j=0; j<sequences.get_ncol(); j++)
	{ // forward strand
	int base = sequences(i,j) ;
	// do not account for N's
	if(base == base_N)
	{ continue ; }
	else
	{ base_comp[base] += 1;
	total += 1. ;
	}
	// reverse complement strand
	if(both_strands)
	{ // size_t c_hash_rev = dna::hash(c, true) ;
	base_comp[4-base-1] += 1. ;
	total += 1. ;
	}
	}
	}

	// normalize
	for(auto& i : base_comp)
	{ i /= total ; }

	return base_comp ;
	}

	std::vector<double> dna::base_composition(const Matrix3D<double>& consensus_sequences, bool both_strands)
	{
	if(consensus_sequences.get_dim()[2] != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! consensus sequences 3rd dimension not equal to 4 (%zu)",
	consensus_sequences.get_dim()[2]) ;
	throw std::invalid_argument(msg) ;
	}
	double total = 0. ;
	std::vector<double> base_comp(4,0.) ;

	for(size_t i=0; i<consensus_sequences.get_dim()[0]; i++)
	{ for(size_t j=0; j<consensus_sequences.get_dim()[1]; j++)
	{ for(size_t k=0; k<4; k++)
	{ // forward strand
	{ base_comp[k] += consensus_sequences(i,j,k) ;
	total += consensus_sequences(i,j,k) ;
	}
	// revers strand
	if(both_strands)
	{ size_t k_comp = 4 - k - 1 ;
	base_comp[k_comp] += consensus_sequences(i,j,k) ;
	total += consensus_sequences(i,j,k) ;
	}
	}
	}
	}

	// normalize
	for(auto& i : base_comp)
	{ i /= total ; }

	return base_comp ;
	}
	+
	+std::string dna::consensus_to_kmer(const Matrix3D<double>& consensus_sequences,
	+ size_t row,
	+ size_t from,
	+ size_t to)
	+{ // dna letter codes
	+ static int n_code = dna::char_to_int('N') ;
	+
	+ // some dimensions
	+ size_t length = to - from ;
	+ size_t n_dim3 = consensus_sequences.get_dim()[2] ;
	+
	+ // kmer
	+ std::string kmer(length, n_code) ;
	+
	+
	+ for(size_t i_mat=from, i_kmer=0; i_mat<to; i_mat++, i_kmer++)
	+ { // get majority base
	+ int max_j = 5 ;
	+ double max_n = -1. ;
	+ for(size_t j=0; j<n_dim3; j++)
	+ { if(consensus_sequences(row,i_mat,j) > max_n)
	+ { max_n = consensus_sequences(row,i_mat,j) ;
	+ max_j = j ;
	+ }
	+ }
	+ kmer[i_kmer] = std::to_string(max_j)[0] ;
	+ }
	+
	+ return kmer ;
	+}
	diff --git a/src/Utility/dna_utility.hpp b/src/Utility/dna_utility.hpp
	index 03bde59..d89d770 100644
	--- a/src/Utility/dna_utility.hpp
	+++ b/src/Utility/dna_utility.hpp
	@@ -1,106 +1,133 @@
	#ifndef DNA_UTILITY_HPP
	#define DNA_UTILITY_HPP

	+#include <string>
	+
	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>

	namespace dna
	{
	/*!
	* \brief Contains the mapping to convert
	* DNA characters to integer codes.
	* Lower and capital characters are accepted.
	* \param base the character of interest.
	* \param rev_compl whether the reverse
	* complement of the character is of interest.
	* \return the corresponding DNA code.
	*/
	int map(char base, bool rev_compl=false) ;

	/*!
	* \brief Contains the mapping to convert
	* DNA code to characters.
	* Only capital characters are returned.
	* \param base the code of interest.
	* \param rev_compl whether the reverse
	* complement of the code is of interest.
	* \return the corresponding DNA character.
	*/
	char map(int base, bool rev_compl=false) ;

	/*!
	* \brief Converts a DNA character (A, C,
	* G, T) to an integer.
	* \param c the DNA character of interest.
	* \return the character integer code.
	* \throw std::invalid_argument if the
	* given character is not a valid DNA
	* character.
	*/
	int char_to_int(char c, bool rev_compl= false) ;

	/*!
	* \brief Converts a DNA character matrix (A, C,
	* G, T) to an integer matrix.
	* The DNA characters are converted using
	* SequenceLayer::char_to_int(char).
	* param file_address the address of the file to load.
	* \return the corresponding int matrix.
	*/
	Matrix2D<int> char_to_int(const Matrix2D<int>& matrix) ;

	/*!
	* \brief Converts an int DNA code to
	* a regular DNA character (A, C, G, T).
	* This method is the reverse method of
	* char_to_int(char).
	* \param n the DNA code of interest.
	* \return the DNA character.
	* \throw std::invalid_argument if the
	* given int is not a valid DNA
	* code.
	*/
	char int_to_char(int n, bool rev_compl=false) ;

	/*!
	* \brief Computes the base composition of a set of
	* sequences, in integer format, contained in a matrix.
	* \param sequences a matrix containing the sequences
	* of interest.
	* \param both_strands also accounts for the reverse
	* complement of the sequences.
	* \throw std::invalid_argument if a non-supported
	* character is found in the matrix.
	* \return a vector of 4 values corresponding to the
	* frequencies of A,C,G and T
	* respectively.
	*/
	std::vector<double> base_composition(const Matrix2D<int>& sequences,
	bool both_strands) ;


	/*!
	* \brief Computes the base composition of a set of
	* consensus sequences, contained in a probability
	* matrix with the following dimensions:
	* 1st the number of sequences
	* 2nd the length of the sequences
	* 3rd 4 for A,C,G,T
	* The sum over the 1st and 2nd dimension should be 1.
	* The overall sum of the matrix values should be equal
	* to the 1st dimension.
	* \param consensus_sequences a matrix containing the
	* consensus sequences of interest.
	* \param both_strands also accounts for the reverse
	* complement of the sequences.
	* \throw std::invalid_argument if the 3rd dimension
	* of the consensus sequence matrix is not equal to 4.
	* \return a vector of 4 values corresponding to the
	* frequencies of A,C,G and T
	* respectively.
	*/
	std::vector<double> base_composition(const Matrix3D<double>& consensus_sequences,
	bool both_strands) ;
	+
	+ /*!
	+ * \brief Constructs a kmer from a piece of consensus
	+ * sequence piece contained in a probability matrix with
	+ * the following dimensions:
	+ * 1st the number of sequences
	+ * 2nd the length of the sequences
	+ * 3rd 4 for A,C,G,T
	+ * The kmer of a consensus sequence is defined as the
	+ * kmer composed of the most important base at each
	+ * position.
	+ * \param consensus_sequences the matrix containing the
	+ * consensus sequence "kmer".
	+ * \param row the index of the row containing the
	+ * consensus sequence piece.
	+ * \param from the starting position of the consensus
	+ * sequence piece.
	+ * \param to the past last ending position of the
	+ * consensus sequence piece.
	+ * \return the kmer.
	+ */
	+ std::string consensus_to_kmer(const Matrix3D<double>& consensus_sequences,
	+ size_t row,
	+ size_t from,
	+ size_t to) ;
	}

	#endif // DNA_UTILITY_HPP
	diff --git a/src/Utility/sorting_utility.hpp b/src/Utility/sorting_utility.hpp
	new file mode 100755
	index 0000000..8ce0fbe
	--- /dev/null
	+++ b/src/Utility/sorting_utility.hpp
	@@ -0,0 +1,36 @@
	+#ifndef SORTING_HPP
	+#define SORTING_HPP
	+
	+#include <vector>
	+#include <algorithm>
	+
	+/*! \brief Given a vector of data, it returns a vector of index
	+ * corresponding to the index of each element after data sorting
	+ * in ascending (by default) order.
	+ * \param data a vector of interest.
	+ * \param ascending specifies that the index should be returned in
	+ * ascending order
	+ * \return the index after sorting.
	+ */
	+template<class T>
	+std::vector<std::size_t> order(const std::vector<T>& data, bool ascending=true)
	+{
	+ std::vector<std::size_t> index(data.size(), 0) ;
	+
	+ for(std::size_t i=0; i<index.size(); i++)
	+ { index[i] = i ; }
	+
	+ sort(index.begin(), index.end(),
	+ [&](const int& a, const int& b)
	+ { return (data[a] < data[b]) ; }
	+ ) ;
	+
	+ if(not ascending)
	+ { std::reverse(index.begin(), index.end()) ; }
	+
	+ return index ;
	+}
	+
	+
	+
	+#endif // SORTING_HPP
	diff --git a/src/main_test.cpp b/src/main_test.cpp
	index 7e2a898..a827475 100644
	--- a/src/main_test.cpp
	+++ b/src/main_test.cpp
	@@ -1,453 +1,713 @@
	#include <iostream>
	#include <string>
	#include <vector>
	#include <utility>
	#include <cmath>
	+#include <unordered_map>

	#include <Matrix2D.hpp>
	#include <Matrix3D.hpp>
	#include <Matrix4D.hpp>
	#include <ConsensusSequenceModelComputer.hpp>
	#include <EMConsensusSequence.hpp>
	#include <SequenceModelComputer.hpp>
	#include <EMSequence.hpp>
	#include <dna_utility.hpp>
	+#include <Statistics.hpp>
	+#include <sorting_utility.hpp>

	+#include <boost/math/distributions.hpp>
	+
	+#include <KmersStatistics.hpp>

	template<class T>
	std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
	{ for(const auto& x : v)
	{ stream << x << " " ; }
	return stream ;
	}

	+template<class T, class U>
	+std::ostream& operator << (std::ostream& stream, const std::unordered_map<T,U>& m)
	+{ for(const auto& bucket : m)
	+ { stream << bucket.first << " " << bucket.second << std::endl ; }
	+ return stream ;
	+}
	+

	Matrix3D<double> compute_consensus_sequences(const Matrix2D<int>& seq,
	const Matrix4D<double>& prob,
	size_t class_k)
	{
	size_t n_seq = seq.get_nrow() ;
	size_t l_seq = seq.get_ncol() ;
	// size_t n_class = prob.get_dim()[1] ;
	size_t n_shift = prob.get_dim()[2] ;
	size_t n_flip = prob.get_dim()[3] ;
	bool flip = n_flip == 2 ;
	size_t l_model = l_seq - n_shift + 1 ;

	Matrix3D<double> cons_seq(n_seq, l_model, 4, 0.) ;

	// aggregate
	for(size_t i=0; i<n_seq; i++)
	{ std::cerr << "i " << i << std::endl ;
	for(size_t s=0; s<n_shift; s++)
	{ //std::cerr << "s " << s << std::endl ;
	double tot = 0. ;
	// aggregate
	for(size_t j=0; j<l_model; j++)
	{ // std::cerr << "j " << j << std::endl ;
	// std::cerr << "seq(" << i << "," << s+j << ")" << std::endl ;
	int base = seq(i,s+j) ;
	// N should participate to A,C,G,T equivalently -> same as doing nothing
	if(base == dna::char_to_int('N'))
	{ continue ; }
	int base_rv = 4 - base - 1 ;
	// --------- forward ---------
	// std::cerr << "cons_seq(" << i << "," << j << "," << base << ") += prob(" << i << "," << class_k << "," << 0 << std::endl ;
	cons_seq(i,j,base) += prob(i,class_k,s,0) ;
	tot += prob(i,class_k,s,0) ;

	// --------- reverse ---------
	if(flip)
	{ // std::cerr << "cons_seq(" << i << "," << j << "," << base_rv << ") += prob(" << i << "," << class_k << "," << 1 << std::endl ;
	cons_seq(i,j,base_rv) += prob(i,class_k,s,1) ;
	tot += prob(i,class_k,s,1) ;
	}
	}
	}
	}

	// normalize
	for(size_t i=0; i<n_seq; i++)
	{ for(size_t j=0; j<l_model; j++)
	{ double tot = 0. ;
	for(size_t k=0; k<4; k++)
	{ tot += cons_seq(i,j,k) ; }
	for(size_t k=0; k<4; k++)
	{ cons_seq(i,j,k) /= tot ; }
	}
	}

	return cons_seq ;
	}

	Matrix2D<int> char_to_int(const Matrix2D<char>& seq)
	{
	size_t n_row = seq.get_nrow() ;
	size_t n_col = seq.get_ncol() ;

	Matrix2D<int> iseq(n_row, n_col, 0.) ;

	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_col; j++)
	{ iseq(i,j) = dna::char_to_int(seq(i,j)) ; }
	}
	return iseq ;
	}

	Matrix3D<double> sequence_to_consensus(const Matrix2D<int>& seq)
	{
	size_t n_row = seq.get_nrow() ;
	size_t n_col = seq.get_ncol() ;
	size_t n_dim3 = 4 ; // A, C, G, T

	int base_n = dna::char_to_int('N') ;

	Matrix3D<double> cseq(n_row, n_col, n_dim3, 1e-100) ;


	double prob = 1. - 3*(1e-100) ; // p(base) - p(!base)

	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_col; j++)
	{ int base = seq(i,j) ;
	// N
	if(base == base_n)
	{ cseq(i,j,0) = 0.25 ;
	cseq(i,j,1) = 0.25 ;
	cseq(i,j,2) = 0.25 ;
	cseq(i,j,3) = 0.25 ;
	}
	// A C G T
	else
	{ cseq(i,j, base) = prob ; }
	}
	}
	return cseq ;
	}

	double score_consensus_subseq(const Matrix3D<double>& cons_seq,
	size_t row,
	size_t start,
	const Matrix2D<double>& model)
	{ size_t ncol = cons_seq.get_dim()[1] ;
	// size_t dim1 = cons_seq.get_dim()[0] ;
	// size_t dim2 = cons_seq.get_dim()[1] ;
	size_t dim3 = cons_seq.get_dim()[2] ;

	if(start > ncol - model.get_nrow())
	{ char msg[4096] ;
	sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu",
	start, ncol - model.get_nrow()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model.get_nrow() > ncol)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)",
	model.get_nrow(), ncol) ;
	throw std::invalid_argument(msg) ;
	}
	else if(model.get_ncol() != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)",
	model.get_ncol()) ;
	throw std::invalid_argument(msg) ;
	}
	else if(dim3 != 4)
	{ char msg[4096] ;
	sprintf(msg, "Error! given data 3rd dimension is not 4 (%zu)",
	dim3) ;
	throw std::invalid_argument(msg) ;
	}

	/*

	double ll = 0 ;
	for(size_t j_seq=from, j_mod=0; j_seq<to; j_seq++, j_mod++)
	{ for(size_t k=0; k<dim3; k++)
	{ std::cerr << ll
	<< " *= "
	<< "("
	<< model(j_mod,k)
	<< " * "
	<< cons_seq(row,j_seq,k)
	<< ")"
	<< std::endl ;
	ll = model(j_mod,k) cons_seq(row,j_seq,k) ;
	}
	}*/

	size_t from = start ;
	size_t to = from + model.get_nrow() ; // will score [from, to)

	double ll = 0. ;

	for(size_t j_seq=start, j_mod=0; j_seq<to; j_seq++, j_mod++)
	{ double sum = 0. ;
	for(size_t k=0; k<dim3; k++)
	{ std::cerr << sum
	<< "+= ("
	<< cons_seq(row, j_seq, k)
	<< " * "
	<< model(j_mod, k)
	<< ")"
	<< std::endl ;
	sum += (cons_seq(row, j_seq, k) * model(j_mod, k)) ;
	}
	std::cerr << sum << std::endl ;
	std::cerr << "---------------" << std::endl ;
	ll += log(sum) ;
	}

	return ll ;
	}


	Matrix2D<double> test_consensus(const std::string& file,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_thread)
	{ size_t n_flip = flip + 1 ;

	Matrix2D<int> seq(file) ;
	// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;
	Matrix3D<double> cseq = sequence_to_consensus(seq) ;

	// partition
	EMConsensusSequence em(cseq,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	em.classify() ;

	// get post prob
	Matrix4D<double> prob = em.get_post_prob() ;

	// compute models
	ConsensusSequenceModelComputer mc(std::move(cseq),
	prob,
	bckg_class,
	n_thread) ;

	Matrix2D<double> model = mc.get_model() ;

	// compute the class prob
	size_t n_row = prob.get_dim()[0] ;
	std::vector<double> class_prob(n_class, 0.) ;
	double p_tot = 0. ;
	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_class; j++)
	{ for(size_t k=0; k<n_shift; k++)
	{ for(size_t l=0; l<n_flip; l++)
	{ class_prob[j] += prob(i,j,k,l) ;
	p_tot += prob(i,j,k,l) ;
	}
	}
	}
	}
	for(auto& prob : class_prob)
	{ prob /= p_tot ; }
	// create a matrix containing the class prob in the 1st
	// column and the model in the remaining columns
	Matrix2D<double> model_final(model.get_nrow(),
	model.get_ncol() + 1) ;
	size_t i_class = 0 ;
	for(size_t i=0; i<model_final.get_nrow(); i++)
	{ if((i != 0) and (i % 4 == 0))
	{ i_class++ ; }
	model_final(i,0) = class_prob[i_class] ;
	}
	// fill the remaining with the model parameters
	for(size_t i=0; i<model.get_nrow(); i++)
	{ for(size_t j=0; j<model.get_ncol(); j++)
	{ model_final(i,j+1) = model(i,j) ; }
	}
	return model_final ;
	}

	Matrix2D<double> test_sequence(const std::string& file,
	size_t n_class,
	size_t n_iter,
	size_t n_shift,
	bool flip,
	bool bckg_class,
	const std::string& seed,
	size_t n_thread)
	{ size_t n_flip = flip + 1 ;

	Matrix2D<int> seq(file) ;
	// Matrix2D<int> seq(char_to_int(Matrix2D<char>(file))) ;

	// partition
	EMSequence em(seq,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	em.classify() ;

	// get post prob
	Matrix4D<double> prob = em.get_post_prob() ;

	// compute models
	SequenceModelComputer mc(std::move(seq),
	prob,
	bckg_class,
	n_thread) ;

	Matrix2D<double> model = mc.get_model() ;

	// compute the class prob
	size_t n_row = prob.get_dim()[0] ;
	std::vector<double> class_prob(n_class, 0.) ;
	double p_tot = 0. ;
	for(size_t i=0; i<n_row; i++)
	{ for(size_t j=0; j<n_class; j++)
	{ for(size_t k=0; k<n_shift; k++)
	{ for(size_t l=0; l<n_flip; l++)
	{ class_prob[j] += prob(i,j,k,l) ;
	p_tot += prob(i,j,k,l) ;
	}
	}
	}
	}
	for(auto& prob : class_prob)
	{ prob /= p_tot ; }
	// create a matrix containing the class prob in the 1st
	// column and the model in the remaining columns
	Matrix2D<double> model_final(model.get_nrow(),
	model.get_ncol() + 1) ;
	size_t i_class = 0 ;
	for(size_t i=0; i<model_final.get_nrow(); i++)
	{ if((i != 0) and (i % 4 == 0))
	{ i_class++ ; }
	model_final(i,0) = class_prob[i_class] ;
	}
	// fill the remaining with the model parameters
	for(size_t i=0; i<model.get_nrow(); i++)
	{ for(size_t j=0; j<model.get_ncol(); j++)
	{ model_final(i,j+1) = model(i,j) ; }
	}
	return model_final ;
	}

	Matrix2D<double> test_aggregate(const std::string& file)
	{ Matrix2D<int> seq(file) ;
	size_t nrow = seq.get_nrow() ;
	size_t ncol = seq.get_ncol() ;

	Matrix2D<double> model_final(4, ncol+1, 1e-100) ;

	for(size_t i=0; i<4; i++)
	{ model_final(i,0) = 1. ; }

	std::vector<double> sum(ncol+1, 0.) ;

	for(size_t i=0; i<nrow; i++)
	{ for(size_t j=0; j<ncol; j++)
	{ int base = seq(i,j) ;
	if(base == dna::char_to_int('N'))
	{ continue ; }
	model_final(base, j+1) += 1. ;
	sum[j+1] += 1. ;
	}
	}

	for(size_t i=0; i<model_final.get_nrow(); i++)
	{ for(size_t j=1; j<model_final.get_ncol()+1; j++)
	{ model_final(i,j) /= sum[j] ; }
	}


	return model_final ;
	}

	+
	+std::unordered_map<std::string, int> get_kmer_count(const Matrix3D<double>& consensus_sequence,
	+ size_t length)
	+{ std::unordered_map<std::string, int> kmer_count ;
	+
	+ size_t n_row = consensus_sequence.get_dim()[0] ;
	+ size_t n_shift = consensus_sequence.get_dim()[1] - length + 1 ;
	+
	+ for(size_t i=0; i<n_row; i++)
	+ { for(size_t s=0; s<n_shift; s++)
	+ { // get kmer
	+ std::string kmer = dna::consensus_to_kmer(consensus_sequence,
	+ i,
	+ s,
	+ s+length) ;
	+ // insert
	+ auto iter = kmer_count.find(kmer) ;
	+ if(iter == kmer_count.end())
	+ { kmer_count.emplace(kmer, 1) ; }
	+ // update
	+ else
	+ { iter->second += 1 ; }
	+ }
	+ }
	+ return kmer_count ;
	+}
	+
	+Matrix2D<int> compute_mononucleotide(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const std::vector<int>& counts)
	+{
	+ Matrix2D<int> comp(k, 4, 0) ;
	+
	+ for(size_t i=0; i<kmers.size(); i++)
	+ {
	+ std::string kmer = kmers[i] ;
	+ int count = counts[i] ;
	+
	+ for (size_t j=0; j<k ; j++)
	+ { int base = int(kmer[j]) - int('0') ;
	+ comp(j, base) += count ;
	+ }
	+ }
	+ return(comp);
	+}
	+
	+Matrix3D<int> compute_dinucleotide(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const std::vector<int>& counts)
	+{
	+ Matrix3D<int> comp2(4, k-1, 4) ;
	+
	+ for(size_t i=0; i<kmers.size(); i++)
	+ { std::string kmer = kmers[i] ;
	+ int count = counts[i] ;
	+ for (size_t j=0; j<k-1; j++)
	+ { int base1 = int(kmer[j]) - int('0') ;
	+ int base2 = int(kmer[j+1]) - int('0') ;
	+ comp2(base1, j, base2) += count ;
	+ }
	+ }
	+ return(comp2);
	+}
	+
	+std::vector<double> compute_exp_values(size_t k,
	+ const std::vector<std::string>& kmers,
	+ const Matrix2D<int>& comp1,
	+ const Matrix3D<int>& comp2)
	+{ std::vector<double> expected(kmers.size()) ;
	+
	+ for(size_t i=0; i<kmers.size(); i++)
	+ { std::string kmer = kmers[i] ;
	+ double exp = comp1(0, int(kmer[0]) - int('0')) ;
	+
	+ for(size_t j=0; j<k-1; j++)
	+ { int base1 = int(kmer[j]) - int('0') ;
	+ int base2 = int(kmer[j+1]) - int('0') ;
	+ exp *= ((double)comp2(base1, j, base2) /
	+ (double)comp1(j+1, base2)) ;
	+ }
	+
	+ expected[i] = exp ;
	+ }
	+ return expected;
	+}
	+
	+std::vector<double> compute_pvalues(const std::vector<int>& counts,
	+ const std::vector<double>& expected)
	+{
	+ std::vector<double> pvalues(counts.size()) ;
	+
	+ for(size_t i=0; i<counts.size(); i++)
	+ {
	+ pvalues[i] = poisson_cdf((double)counts[i],
	+ expected[i],
	+ false) ;
	+ }
	+ return pvalues ;
	+}
	+
	+std::pair<std::vector<std::string>, std::vector<double>>
	+compute_kmer_pvalue(const Matrix3D<double>& consensus_sequences,
	+ size_t k)
	+{
	+
	+ // get kmer count
	+ std::unordered_map<std::string, int> kmer_count = get_kmer_count(consensus_sequences,
	+ k) ;
	+
	+ /*
	+ consensus_sequences.get_dim() ;
	+ std::unordered_map<std::string, int> kmer_count ;
	+ // ANN
	+ kmer_count["000"] = 1 ;
	+ kmer_count["001"] = 2 ;
	+ kmer_count["002"] = 3 ;
	+ kmer_count["003"] = 4 ;
	+ kmer_count["010"] = 5 ;
	+ kmer_count["011"] = 6 ;
	+ kmer_count["012"] = 7 ;
	+ kmer_count["013"] = 8 ;
	+ kmer_count["020"] = 9 ;
	+ kmer_count["021"] = 10 ;
	+ kmer_count["022"] = 11 ;
	+ kmer_count["023"] = 12 ;
	+ kmer_count["030"] = 13 ;
	+ kmer_count["031"] = 14 ;
	+ kmer_count["032"] = 15 ;
	+ kmer_count["033"] = 16 ;
	+ // CNN
	+ kmer_count["100"] = 1 ;
	+ kmer_count["101"] = 2 ;
	+ kmer_count["102"] = 3 ;
	+ kmer_count["103"] = 4 ;
	+ kmer_count["110"] = 5 ;
	+ kmer_count["111"] = 6 ;
	+ kmer_count["112"] = 7 ;
	+ kmer_count["113"] = 8 ;
	+ kmer_count["120"] = 9 ;
	+ kmer_count["121"] = 10 ;
	+ kmer_count["122"] = 11 ;
	+ kmer_count["123"] = 12 ;
	+ kmer_count["130"] = 13 ;
	+ kmer_count["131"] = 14 ;
	+ kmer_count["132"] = 15 ;
	+ kmer_count["133"] = 16 ;
	+ // GNN
	+ kmer_count["200"] = 1 ;
	+ kmer_count["201"] = 2 ;
	+ kmer_count["202"] = 3 ;
	+ kmer_count["203"] = 4 ;
	+ kmer_count["210"] = 5 ;
	+ kmer_count["211"] = 6 ;
	+ kmer_count["212"] = 7 ;
	+ kmer_count["213"] = 8 ;
	+ kmer_count["220"] = 9 ;
	+ kmer_count["221"] = 10 ;
	+ kmer_count["222"] = 11 ;
	+ kmer_count["223"] = 12 ;
	+ kmer_count["230"] = 13 ;
	+ kmer_count["231"] = 14 ;
	+ kmer_count["232"] = 15 ;
	+ kmer_count["233"] = 16 ;
	+ // TNN
	+ kmer_count["300"] = 1 ;
	+ kmer_count["301"] = 2 ;
	+ kmer_count["302"] = 3 ;
	+ kmer_count["303"] = 4 ;
	+ kmer_count["310"] = 5 ;
	+ kmer_count["311"] = 6 ;
	+ kmer_count["312"] = 7 ;
	+ kmer_count["313"] = 8 ;
	+ kmer_count["320"] = 9 ;
	+ kmer_count["321"] = 10 ;
	+ kmer_count["322"] = 11 ;
	+ kmer_count["323"] = 12 ;
	+ kmer_count["330"] = 13 ;
	+ kmer_count["331"] = 14 ;
	+ kmer_count["332"] = 15 ;
	+ kmer_count["333"] = 16 ;
	+ */
	+
	+ // turn to vectors
	+ std::vector<std::string> kmers(kmer_count.size()) ;
	+ size_t i=0 ;
	+ for(const auto& iter : kmer_count)
	+ { kmers[i] = iter.first ;
	+ i++ ;
	+ }
	+ std::sort(kmers.begin(), kmers.end()) ;
	+ std::vector<int> counts(kmer_count.size()) ;
	+ i=0 ;
	+ for(const auto& kmer : kmers)
	+ { counts[i] = kmer_count[kmer] ;
	+ i++ ;
	+ }
	+ kmer_count.clear() ;
	+
	+ // get mononucleotide composition in kmer
	+ // this is a probability matrix modeling the kmer
	+ Matrix2D<int> comp1 = compute_mononucleotide(k, kmers, counts) ;
	+
	+ // get dinucleotide composition in kmer
	+ // this is a probability matrix modeling the kmer
	+ Matrix3D<int> comp2 = compute_dinucleotide(k, kmers, counts) ;
	+
	+ // get expected number of occurence for each kmer
	+ std::vector<double> expected = compute_exp_values(k,
	+ kmers,
	+ comp1,
	+ comp2) ;
	+
	+ // compute the pvalue associated to each kmer
	+ std::vector<double> pvalues = compute_pvalues(counts,
	+ expected) ;
	+
	+ return std::make_pair(kmers, pvalues) ;
	+}
	+
	int main()
	{
	+ // Matrix3D<double> mat(1,5,4,0.) ;
	+ // mat(0,0,0) = 1. ; mat(0,1,1) = 1. ; mat(0,2,2) = 1. ; mat(0,3,3) = 1. ; mat(0,4,2) = 1. ;
	+
	+
	+
	+ std::string file = "/local/groux/scATAC-seq/data/"
	+ "10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
	+ Matrix3D<double> mat = sequence_to_consensus(Matrix2D<int>(file)) ;
	+
	+
	+ size_t k = 12 ;
	+ auto kmers_pvalues = kmers::compute_kmer_pvalue(mat, k) ;
	+
	+ // for(size_t i=0; i<kmers_pvalues.first.size(); i++)
	+ // { std::cerr << kmers_pvalues.first[i] << " " << kmers_pvalues.second[i] << std::endl ; }
	+
	+ std::vector<size_t> index = order(kmers_pvalues.second, true) ;
	+
	+ for(size_t i=0 ; i<10; i++)
	+ { size_t idx = index[i] ;
	+ std::cout << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ;
	+ }
	+
	+ /*
	size_t n_class = 2 ;
	- size_t n_iter = 50 ;
	+ size_t n_iter = 50 ;sequence_to_consensus
	size_t n_shift = 1 ;
	bool flip = true ;
	bool bckg_class = false ;
	std::string seed = "" ;
	size_t n_thread = 1 ;


	std::string file = "/local/groux/scATAC-seq/data/"
	"10xgenomics_PBMC_5k_motifs/sp1_motifs_10e-7_sequences.mat" ;
	+ */

	// std::string file = "/local/groux/scATAC-seq/test.mat" ;
	// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_noflip.mat" ;
	// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_1class_flip.mat" ;
	// std::string file = "/local/groux/scATAC-seq/data/toy_data/simulated_sequences_2class_flip.mat" ;

	/*
	Matrix2D<double> model_final = test_consensus(file,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;

	*/
	+ /*
	Matrix2D<double> model_final = test_sequence(file,
	n_class,
	n_iter,
	n_shift,
	flip,
	bckg_class,
	seed,
	n_thread) ;
	+ */


	/*
	Matrix2D<double> model_final = test_aggregate(file) ;
	*/

	- std::cout << model_final << std::endl ;
	+ // std::cout << model_final << std::endl ;


	/*
	double p0 = 0 ;
	// double l0 = log(p0) ;

	double p1 = 1. - (3.*p0) ;
	// double l1 = log(p1) ;

	// Matrix3D<double> cseq(1, 4, 4) ;
	// cseq(0, 0, 0) = p1 ; cseq(0, 0, 1) = p0 ; cseq(0, 0, 2) = p0 ; cseq(0, 0, 3) = p0 ;
	// cseq(0, 1, 0) = p0 ; cseq(0, 1, 1) = p1 ; cseq(0, 1, 2) = p0 ; cseq(0, 1, 3) = p0 ;
	// cseq(0, 2, 0) = p0 ; cseq(0, 2, 1) = p0 ; cseq(0, 2, 2) = p1 ; cseq(0, 2, 3) = p0 ;
	// cseq(0, 3, 0) = p0 ; cseq(0, 3, 1) = p0 ; cseq(0, 3, 2) = p0 ; cseq(0, 3, 3) = p1 ;

	Matrix2D<int> seq(1, 4, -1) ;
	seq(0, 0) = 0 ; seq(0, 1) = 1 ; seq(0, 2) = 2 ; seq(0, 3) = 3 ;
	Matrix3D<double> cseq = sequence_to_consensus(seq) ;

	std::cout << cseq << std::endl << std::endl ;

	Matrix2D<double> mod(4, 4, -1) ;
	mod(0,0) = p1 ; mod(0,1) = p0 ; mod(0,2) = p0 ; mod(0,3) = p0 ;
	mod(1,0) = p0 ; mod(1,1) = p1 ; mod(1,2) = p0 ; mod(1,3) = p0 ;
	mod(2,0) = p0 ; mod(2,1) = p0 ; mod(2,2) = p1 ; mod(2,3) = p0 ;
	mod(3,0) = p0 ; mod(3,1) = p0 ; mod(3,2) = p0 ; mod(3,3) = p1 ;

	std::cout << mod << std::endl ;

	std::cout << score_consensus_subseq(cseq, 0,0,mod)
	<< std::endl ;
	*/
	+
	return EXIT_SUCCESS ;
	}

No OneTemporaryActions

File Metadata

View Options

Event Timeline

No OneTemporary
Actions