diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R index 9a0ddf8..2dee435 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ctcf_motif.R @@ -1,172 +1,171 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## open chromatin patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } ################## nucleosomes chromatin patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_open_read_atac_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ctcf_motifs_10e-6_classification_1nucl_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R index 41a905f..9bd60e0 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_ebf1_motif.R @@ -1,96 +1,95 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ebf1 motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("ebf1_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R index 57ae656..a91de12 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_myc_motif.R @@ -1,96 +1,95 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around myc motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("myc_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R index 53fede4..5971095 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_1/classification_sp1_motif.R @@ -1,96 +1,95 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_1", sprintf("sp1_motifs_10e-7_classification_open_bin1bp_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) x.at = (x.lab + ncol(ref.open)) / 2 axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal x.at = seq(0, 1, 0.5) axis(2, at=x.at, labels=x.at) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = 380:420 plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = 1:length(idx) axis(1, at=x.at, labels=x.at) # yaxis x.at = seq(0, 2, by=1) axis(2, at=x.at, labels=x.at) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R index 5343c07..357b7cc 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.R @@ -1,98 +1,97 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ctcf_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R index 564f066..44359b8 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.R @@ -1,98 +1,97 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ebf1 motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("ebf1_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R index c7c2f80..03d9418 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.R @@ -1,98 +1,97 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around myc motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("myc_motifs_10e-6_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R index 9cb993f..bd9a5ec 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_sp1_motif.R @@ -1,98 +1,97 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) -library(seqLogo) # functions source(file.path("scripts", "functions.R")) # the minimum number of classes searched k.min = 1 # the maximum number of classes searched k.max = 10 # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around sp1 motifs ################## for(k in k.min:k.max) { # open chromatin data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) model.open = data$models model.prob = data$prob data = NULL # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_%dclass_model.mat", k)))$models # sequence model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=17, height=10) png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_2", sprintf("sp1_motifs_10e-7_classification_%dclass.png", k)), units="in", res=720, width=18, height=12) m = matrix(1:10, nrow=5, ncol=2, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F] ref.nucl = model.nucl[ord,, drop=F] ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter for(i in 1:nrow(ref.open)) { # plot logo center right = 0.5*col_n - 0.01 left = right - 0.2 bottom = 1-(row_n*(0.2))+0.05 top = bottom + 0.15 par(fig=c(left, right, bottom, top), new=T) idx = (391-1-20):(391+1+20) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis x.at = seq(1, length(idx), length.out = 3) x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% 5 == 0) { col_n = col_n + 1 row_n = 1 } } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R similarity index 50% copy from scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R copy to scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R index e369c8d..5b0389a 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R @@ -1,103 +1,96 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # the number of classes searched -n.classes = c(23) +n.classes = c(10, 20, 30) # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in n.classes) { # sequence - data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", - sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k))) + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_0", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", k))) model.seq = data$models model.prob = data$prob data = NULL # open chromatin - model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", - sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models + model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_0", + sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model.mat", k)))$models # nucleosomes - model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", - sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_0", + sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") - X11(width=26, height=12) - # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", - # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), - # units="in", res=720, width=18, height=12) - m = matrix(1:24, nrow=6, ncol=4, byrow=F) + # X11(width=26, height=12) + png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_0", + sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:30, nrow=6, ncol=5, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) - ref.open = model.open[ord,, drop=F][,] - ref.nucl = model.nucl[ord,, drop=F][,] - ref.seq = model.seq[,,ord, drop=F][,,] + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center - # inlets with center - row_n = 1 # row counter - col_n = 1 # column counter - row_h = 1/nrow(m) # height of row - col_w = 1/ncol(m) # width of column - row_cor = row_h / 2 - col_cor = col_w / 3 - for(i in 1:nrow(ref.open)) - { # plot logo center - left = (col_w*col_n) - col_w - right = left + col_w - left = right - col_cor - bottom = 1 - (row_h*row_n) - top = bottom + row_h - bottom = top - row_cor - - par(fig=c(left, right, bottom, top), new=T) - idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) - plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) - # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) - lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) - # xaxis - x.at = ceiling(length(idx)/2) - x.lab = 0 - axis(1, at=x.at, labels=x.lab) - # yaxis - axis(2, at=y.at, labels=y.lab) - row_n = row_n + 1 - if(i %% nrow(m) == 0) - { col_n = col_n + 1 - row_n = 1 - } - } + # row_n = 1 # row counter + # col_n = 1 # column counter + # for(i in 1:nrow(ref.open)) + # { # plot logo center + # right = 0.5*col_n - 0.01 + # left = right - 0.2 + # bottom = 1-(row_n*(0.2))+0.05 + # top = bottom + 0.15 + # par(fig=c(left, right, bottom, top), new=T) + # idx = (391-1-20):(391+1+20) + # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + # axis(1, at=x.at, labels=x.lab) + # # yaxis + # axis(2, at=y.at, labels=y.lab) + # row_n = row_n + 1 + # if(i %% 5 == 0) + # { col_n = col_n + 1 + # row_n = 1 + # } + # } dev.off() } + diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh similarity index 89% copy from scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh copy to scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh index 34da2c9..17b2445 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" -results_dir="results/10xgenomics_PBMC_5k_peaks_classification_1" +results_dir="results/10xgenomics_PBMC_5k_peaks_classification_0" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' -n_shift='981' +n_shift='971' n_core=8 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --bgclass --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R new file mode 100644 index 0000000..5d26e96 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R @@ -0,0 +1,83 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) +library(MotifDb) + +# functions +source(file.path("scripts", "functions.R")) + +get_pfm_list = function(motifs, prefix_name) +{ pfm_list = list() + + for(i in 1:dim(motifs)[3]) + { pfm_list[[i]] = new("pfm", + mat=motifs[,,i], + name=sprintf("%s %d", prefix_name, i)) + } + return(pfm_list) +} + + +# number of classes searched in the data +n_classes = c(10,30) + + +for(n_class in n_classes) +{ + # get MEME motifs + files_meme = list.files(file.path("results", + "10xgenomics_PBMC_5k_peaks_meme", + sprintf("%d_motifs", n_class)), + "mat", full.names=T) + motifs_meme = lapply(files_meme, read.table) + motifs_meme = lapply(motifs_meme, as.matrix) + motifs_meme = lapply(motifs_meme, t) + tmp = list() + for(i in 1:length(motifs_meme)) + { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") + tmp[[i]] = new("pfm", + mat=motifs_meme[[i]], + name=sprintf("MEME %d", i)) + } + motifs_meme = tmp + rm(tmp) + + # get EM discovered motifs + motifs_found = get_pfm_list(read.sequence.models(file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_0", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, + "EM") + + + # colors + red = brewer.pal(3, "Set1")[1] + blue = brewer.pal(3, "Set1")[2] + color = c(rep(blue, length(motifs_meme)), + rep(red, length(motifs_found))) + # draw tree + # X11(height=12, width=12) + png(filename=file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_0", + sprintf("%dclass_motif_tree.png", n_class)), + units="in", res=720, width=14, height=14) + motifStack(c(motifs_meme, + motifs_found), + layout="radialPhylog", + circle=0.3, + cleaves = 0.2, + clabel.leaves = 0.5, + col.bg=color, + col.bg.alpha=0.3, + col.leaves=color, + col.inner.label.circle=color, + inner.label.circle.width=0.05, + col.outer.label.circle=color, + outer.label.circle.width=0.02, + circle.motif=.5, + angle=350) + + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh new file mode 100644 index 0000000..5125c78 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/run_all.sh @@ -0,0 +1,5 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/analysis_test_sampled.R +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_0/motif_tree.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh index 34da2c9..a32bc7e 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_1" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' -n_core=8 +n_core=24 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --bgclass --iter $n_iter --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --bgclass --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core --bgclass 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core --bgclass 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core --bgclass 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R new file mode 100644 index 0000000..9d94afc --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R @@ -0,0 +1,83 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) +library(MotifDb) + +# functions +source(file.path("scripts", "functions.R")) + +get_pfm_list = function(motifs, prefix_name) +{ pfm_list = list() + + for(i in 1:dim(motifs)[3]) + { pfm_list[[i]] = new("pfm", + mat=motifs[,,i], + name=sprintf("%s %d", prefix_name, i)) + } + return(pfm_list) +} + + +# number of classes searched in the data +n_classes = c(10,30) + + +for(n_class in n_classes) +{ + # get MEME motifs + files_meme = list.files(file.path("results", + "10xgenomics_PBMC_5k_peaks_meme", + sprintf("%d_motifs", n_class)), + "mat", full.names=T) + motifs_meme = lapply(files_meme, read.table) + motifs_meme = lapply(motifs_meme, as.matrix) + motifs_meme = lapply(motifs_meme, t) + tmp = list() + for(i in 1:length(motifs_meme)) + { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") + tmp[[i]] = new("pfm", + mat=motifs_meme[[i]], + name=sprintf("MEME %d", i)) + } + motifs_meme = tmp + rm(tmp) + + # get EM discovered motifs + motifs_found = get_pfm_list(read.sequence.models(file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_1", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, + "EM") + + + # colors + red = brewer.pal(3, "Set1")[1] + blue = brewer.pal(3, "Set1")[2] + color = c(rep(blue, length(motifs_meme)), + rep(red, length(motifs_found))) + # draw tree + # X11(height=12, width=12) + png(filename=file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_1", + sprintf("%dclass_motif_tree.png", n_class)), + units="in", res=720, width=14, height=14) + motifStack(c(motifs_meme, + motifs_found), + layout="radialPhylog", + circle=0.3, + cleaves = 0.2, + clabel.leaves = 0.5, + col.bg=color, + col.bg.alpha=0.3, + col.leaves=color, + col.inner.label.circle=color, + inner.label.circle.width=0.05, + col.outer.label.circle=color, + outer.label.circle.width=0.02, + circle.motif=.5, + angle=350) + + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh new file mode 100644 index 0000000..a4e36e8 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/run_all.sh @@ -0,0 +1,5 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/analysis_test_sampled.R +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_1/motif_tree.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh index 03c84de..9a59ce4 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_2" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' n_core=24 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R new file mode 100644 index 0000000..f49336b --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R @@ -0,0 +1,83 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) +library(MotifDb) + +# functions +source(file.path("scripts", "functions.R")) + +get_pfm_list = function(motifs, prefix_name) +{ pfm_list = list() + + for(i in 1:dim(motifs)[3]) + { pfm_list[[i]] = new("pfm", + mat=motifs[,,i], + name=sprintf("%s %d", prefix_name, i)) + } + return(pfm_list) +} + + +# number of classes searched in the data +n_classes = c(10,30) + + +for(n_class in n_classes) +{ + # get MEME motifs + files_meme = list.files(file.path("results", + "10xgenomics_PBMC_5k_peaks_meme", + sprintf("%d_motifs", n_class)), + "mat", full.names=T) + motifs_meme = lapply(files_meme, read.table) + motifs_meme = lapply(motifs_meme, as.matrix) + motifs_meme = lapply(motifs_meme, t) + tmp = list() + for(i in 1:length(motifs_meme)) + { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") + tmp[[i]] = new("pfm", + mat=motifs_meme[[i]], + name=sprintf("MEME %d", i)) + } + motifs_meme = tmp + rm(tmp) + + # get EM discovered motifs + motifs_found = get_pfm_list(read.sequence.models(file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_2", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, + "EM") + + + # colors + red = brewer.pal(3, "Set1")[1] + blue = brewer.pal(3, "Set1")[2] + color = c(rep(blue, length(motifs_meme)), + rep(red, length(motifs_found))) + # draw tree + # X11(height=12, width=12) + png(filename=file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_2", + sprintf("%dclass_motif_tree.png", n_class)), + units="in", res=720, width=14, height=14) + motifStack(c(motifs_meme, + motifs_found), + layout="radialPhylog", + circle=0.3, + cleaves = 0.2, + clabel.leaves = 0.5, + col.bg=color, + col.bg.alpha=0.3, + col.leaves=color, + col.inner.label.circle=color, + inner.label.circle.width=0.05, + col.outer.label.circle=color, + outer.label.circle.width=0.02, + circle.motif=.5, + angle=350) + + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh new file mode 100644 index 0000000..a7385c4 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/run_all.sh @@ -0,0 +1,5 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/analysis_test_sampled.R +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_2/motif_tree.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh index a396e60..30bbc56 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_3" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' n_core=24 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_openchromatin-sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R new file mode 100644 index 0000000..27dcef6 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R @@ -0,0 +1,83 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) +library(MotifDb) + +# functions +source(file.path("scripts", "functions.R")) + +get_pfm_list = function(motifs, prefix_name) +{ pfm_list = list() + + for(i in 1:dim(motifs)[3]) + { pfm_list[[i]] = new("pfm", + mat=motifs[,,i], + name=sprintf("%s %d", prefix_name, i)) + } + return(pfm_list) +} + + +# number of classes searched in the data +n_classes = c(10,30) + + +for(n_class in n_classes) +{ + # get MEME motifs + files_meme = list.files(file.path("results", + "10xgenomics_PBMC_5k_peaks_meme", + sprintf("%d_motifs", n_class)), + "mat", full.names=T) + motifs_meme = lapply(files_meme, read.table) + motifs_meme = lapply(motifs_meme, as.matrix) + motifs_meme = lapply(motifs_meme, t) + tmp = list() + for(i in 1:length(motifs_meme)) + { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") + tmp[[i]] = new("pfm", + mat=motifs_meme[[i]], + name=sprintf("MEME %d", i)) + } + motifs_meme = tmp + rm(tmp) + + # get EM discovered motifs + motifs_found = get_pfm_list(read.sequence.models(file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_3", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, + "EM") + + + # colors + red = brewer.pal(3, "Set1")[1] + blue = brewer.pal(3, "Set1")[2] + color = c(rep(blue, length(motifs_meme)), + rep(red, length(motifs_found))) + # draw tree + # X11(height=12, width=12) + png(filename=file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_3", + sprintf("%dclass_motif_tree.png", n_class)), + units="in", res=720, width=14, height=14) + motifStack(c(motifs_meme, + motifs_found), + layout="radialPhylog", + circle=0.3, + cleaves = 0.2, + clabel.leaves = 0.5, + col.bg=color, + col.bg.alpha=0.3, + col.leaves=color, + col.inner.label.circle=color, + inner.label.circle.width=0.05, + col.outer.label.circle=color, + outer.label.circle.width=0.02, + circle.motif=.5, + angle=350) + + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh new file mode 100644 index 0000000..723d374 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/run_all.sh @@ -0,0 +1,5 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/analysis_test_sampled.R +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_3/motif_tree.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh index d87ff4e..383afb1 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh @@ -1,55 +1,55 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" pwm_dir="data/pwm/jaspar_2018_clustering/" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_4" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='971' -n_core=30 +n_core=24 ## PWM files jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" # classify -for k in 30 20 17 +for k in 17 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/test_dendrogram.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R similarity index 96% rename from scripts/test_dendrogram.R rename to scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R index 708c704..69ebdd8 100644 --- a/scripts/test_dendrogram.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R @@ -1,105 +1,106 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) library(motifStack) library(TFBSTools) library(MotifDb) # functions source(file.path("scripts", "functions.R")) get_pfm_list = function(motifs, prefix_name) { pfm_list = list() for(i in 1:dim(motifs)[3]) { pfm_list[[i]] = new("pfm", mat=motifs[,,i], name=sprintf("%s class %d", prefix_name, i)) } return(pfm_list) } # number of classes searched in the data n_classes = c(17, 20, 30) # load motifs from JASPAR clustering used to initialise the classes motifs_jaspar_paths = c("data/pwm/jaspar_2018_clustering/cluster_3_node_23_20_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_4_node_31_3_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_4_node_22_4_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_7_node_13_2_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_5_node_20_5_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_31_node_4_5_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_31_node_5_2_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_1_node_74_2_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_66_1_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_33_node_1_2_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_20_4_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_7_node_17_5_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_32_node_STAT6_1_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_3_node_24_4_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_4_node_30_2_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_39_node_1_2_motifs_prob.mat", "data/pwm/jaspar_2018_clustering/cluster_48_node_ctcf_1_motifs_prob.mat") motifs_jaspar = lapply(motifs_jaspar_paths, read.table) motifs_jaspar = lapply(motifs_jaspar, as.matrix) motifs_jaspar_names = c("jun", "HIF1A", "myc", "PU.1", "CEBPb", "Irf4", "Irf2", "LHX3", "Fox1H", "Sox3", "Mef2c", "Elf5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF") tmp = list() for(i in 1:length(motifs_jaspar)) { rownames(motifs_jaspar[[i]]) = c("A", "C", "G", "T") tmp[[i]] = new("pfm", mat=motifs_jaspar[[i]], name=motifs_jaspar_names[i]) } motifs_jaspar = tmp rm(tmp) for(n_class in n_classes) { # load classes found motifs_found = get_pfm_list(read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_4", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, "") # colors red = brewer.pal(3, "Set1")[1] blue = brewer.pal(3, "Set1")[2] color = c(rep(blue, length(motifs_jaspar)), rep(red, length(motifs_found))) # plot logo stack with radial style # X11(height=12, width=12) - png(filename=file.path(sprintf("test_%dclass.png", n_class)), + png(filename=file.path("10xgenomics_PBMC_5k_peaks_classification_4", + sprintf("%dclass_motif_tree.png", n_class)), units="in", res=720, width=14, height=14) motifStack(c(motifs_jaspar, motifs_found), layout="radialPhylog", circle=0.3, cleaves = 0.2, clabel.leaves = 0.5, col.bg=color, col.bg.alpha=0.3, col.leaves=color, col.inner.label.circle=color, inner.label.circle.width=0.05, col.outer.label.circle=color, outer.label.circle.width=0.02, circle.motif=.5, angle=350) dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh new file mode 100644 index 0000000..52cbdab --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/run_all.sh @@ -0,0 +1,5 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/analysis_test_sampled.R +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_4/motif_tree.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh index 5f54d1d..d44282f 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_5" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='991' n_core=24 # classify for k in 20 30 40 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R new file mode 100644 index 0000000..7a1904d --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R @@ -0,0 +1,83 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) +library(MotifDb) + +# functions +source(file.path("scripts", "functions.R")) + +get_pfm_list = function(motifs, prefix_name) +{ pfm_list = list() + + for(i in 1:dim(motifs)[3]) + { pfm_list[[i]] = new("pfm", + mat=motifs[,,i], + name=sprintf("%s %d", prefix_name, i)) + } + return(pfm_list) +} + + +# number of classes searched in the data +n_classes = c(30) + + +for(n_class in n_classes) +{ + # get MEME motifs + files_meme = list.files(file.path("results", + "10xgenomics_PBMC_5k_peaks_meme", + sprintf("%d_motifs", n_class)), + "mat", full.names=T) + motifs_meme = lapply(files_meme, read.table) + motifs_meme = lapply(motifs_meme, as.matrix) + motifs_meme = lapply(motifs_meme, t) + tmp = list() + for(i in 1:length(motifs_meme)) + { rownames(motifs_meme[[i]]) = c("A", "C", "G", "T") + tmp[[i]] = new("pfm", + mat=motifs_meme[[i]], + name=sprintf("MEME %d", i)) + } + motifs_meme = tmp + rm(tmp) + + # get EM discovered motifs + motifs_found = get_pfm_list(read.sequence.models(file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_5", + sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model.mat", n_class)))$models, + "EM") + + + # colors + red = brewer.pal(3, "Set1")[1] + blue = brewer.pal(3, "Set1")[2] + color = c(rep(blue, length(motifs_meme)), + rep(red, length(motifs_found))) + # draw tree + # X11(height=12, width=12) + png(filename=file.path("results", + "10xgenomics_PBMC_5k_peaks_classification_5", + sprintf("%dclass_motif_tree.png", n_class)), + units="in", res=720, width=14, height=14) + motifStack(c(motifs_meme, + motifs_found), + layout="radialPhylog", + circle=0.3, + cleaves = 0.2, + clabel.leaves = 0.5, + col.bg=color, + col.bg.alpha=0.3, + col.leaves=color, + col.inner.label.circle=color, + inner.label.circle.width=0.05, + col.outer.label.circle=color, + outer.label.circle.width=0.02, + circle.motif=.5, + angle=350) + + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh new file mode 100644 index 0000000..934f5ea --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/run_all.sh @@ -0,0 +1,5 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/analysis_test_sampled.R +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_5/motif_tree.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R index e369c8d..1fc02f9 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R @@ -1,103 +1,104 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # the number of classes searched n.classes = c(23) # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## sequence patterns around ctcf motifs ################## for(k in n.classes) { # sequence data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", sprintf("peaks_rmsk_sampled_sequences_1kb_%dclass_model_extended.mat", k))) model.seq = data$models model.prob = data$prob data = NULL # open chromatin model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", sprintf("peaks_rmsk_sampled_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", sprintf("peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") X11(width=26, height=12) # png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_6", # sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), - # units="in", res=720, width=18, height=12) + # units="in", res=720, width=26, height=12) m = matrix(1:24, nrow=6, ncol=4, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F][,] ref.nucl = model.nucl[ord,, drop=F][,] ref.seq = model.seq[,,ord, drop=F][,,] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] for(i in 1:nrow(ref.open)) { # plot logo plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("class %d (p=%.2f)", class[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center - # inlets with center row_n = 1 # row counter col_n = 1 # column counter row_h = 1/nrow(m) # height of row col_w = 1/ncol(m) # width of column row_cor = row_h / 2 col_cor = col_w / 3 for(i in 1:nrow(ref.open)) { # plot logo center left = (col_w*col_n) - col_w right = left + col_w left = right - col_cor bottom = 1 - (row_h*row_n) top = bottom + row_h bottom = top - row_cor par(fig=c(left, right, bottom, top), new=T) idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] x.at = ceiling(length(idx)/2) x.lab = 0 axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% nrow(m) == 0) { col_n = col_n + 1 - row_n = 1 + row_n = 1 } } - dev.off() + # dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh index 1efedd4..f10ec18 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh @@ -1,76 +1,76 @@ # paths ## dir data_dir_p="data/10xgenomics_PBMC_5k_peaks" data_dir="data/10xgenomics_PBMC_5k" pwm_dir="data/pwm/jaspar_2018_clustering/" hg19_dir="data/genomes" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_6" ## matrix files file_mat_open=$data_dir_p/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir_p/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir_p/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='1' n_shift='971' -n_core=8 +n_core=24 ## PWM files jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" klf="$pwm_dir/cluster_28_node_14_3_motifs_prob.mat" nr4a1="$pwm_dir/cluster_2_node_12_4_motifs_prob.mat" egr="$pwm_dir/cluster_28_node_13_4_motifs_prob.mat" gata="$pwm_dir/cluster_21_node_5_6_motifs_prob.mat" nfat="$pwm_dir/cluster_19_node_2_3_motifs_prob.mat" runx="$pwm_dir/cluster_38_node_3_3_motifs_prob.mat" # classify for k in 23 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob + bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf,$klf,$nr4a1,$egr,$gata,$nfat,$runx --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 # extend models file_mod1_ext=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model_extended.mat' file_mod2_ext=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model_extended.mat' file_mod3_ext=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model_extended.mat' file_bed=$data_dir/'atac_v1_pbmc_5k_peaks_rmsk_sampled.bed' file_fasta=$hg19_dir/'hg19.fasta' file_bam_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam' file_bai_open=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai' file_bam_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam' file_bai_nucl=$data_dir/'atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai' bin/ReadModelExtender --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'read_atac' --thread $n_core > $file_mod1_ext bin/ReadModelExtender --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --ext 1000 --binSize 1 --method 'fragment_center' --thread $n_core > $file_mod2_ext bin/SequenceModelExtender --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --ext 1000 --thread $n_core > $file_mod3_ext done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh new file mode 100644 index 0000000..1803c05 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_6/run_all.sh @@ -0,0 +1,4 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_6/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_6/analysis_test_sampled.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh new file mode 100644 index 0000000..2e4fecb --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/run_all.sh @@ -0,0 +1,4 @@ + +scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks_sampled.sh + +Rscript scripts/10xgenomics_PBMC_5k_peaks_classification_7/analysis_test_sampled.R diff --git a/src/Applications/ReadModelExtenderApplication.cpp b/src/Applications/ClassCorrelationMatrixCreatorApplication.cpp similarity index 50% copy from src/Applications/ReadModelExtenderApplication.cpp copy to src/Applications/ClassCorrelationMatrixCreatorApplication.cpp index bf1d9b6..ad233ae 100644 --- a/src/Applications/ReadModelExtenderApplication.cpp +++ b/src/Applications/ClassCorrelationMatrixCreatorApplication.cpp @@ -1,273 +1,296 @@ -#include + +#include +#include #include -#include #include -#include // std::move() -#include // std::invalid_argument, std::runtime_error +#include +#include +#include // std::invalid_argument -#include -#include #include -#include -#include namespace po = boost::program_options ; - // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; -ReadModelExtenderApplication::ReadModelExtenderApplication(int argn, char** argv) +ClassCorrelationMatrixCreatorApplication::ClassCorrelationMatrixCreatorApplication(int argn, char** argv) : file_bed(""), file_bam(""), file_bai(""), file_prob(""), - from(0), to(0), ext(0), bin_size(0), - method(CorrelationMatrixCreator::FRAGMENT), - n_threads(0), runnable(false) -{ this->parseOptions(argn, argv) ; } - -ReadModelExtenderApplication::~ReadModelExtenderApplication() -{} + from(0), to(0), bin_size(0), class_k(0), + method(CorrelationMatrixCreator::FRAGMENT), runnable(true) +{ + // parse command line options and set the fields + this->parseOptions(argn, argv) ; +} -int ReadModelExtenderApplication::run() +int ClassCorrelationMatrixCreatorApplication::run() { if(this->runnable) - { // extend limits - int ext_right = this->ext/2 ; - int ext_left = this->ext - ext_right ; - this->from -= ext_left ; - this->to += ext_right ; - - // create extended matrix - CorrelationMatrixCreator mc(this->file_bed, - this->file_bam, - this->file_bai, - this->from, - this->to, - this->bin_size, - this->method) ; - Matrix2D data = mc.create_matrix() ; - - // compute model - // Matrix4D prob(this->file_prob) ; - Matrix4D prob ; prob.load(this->file_prob) ; - if(prob.get_dim()[0] != data.get_nrow()) - { char msg[4096] ; - sprintf(msg, - "Error! data matrix and probability matrix have " - "unequal row numbers (%zu and %zu)", - prob.get_dim()[0], - data.get_nrow()) ; - throw std::invalid_argument(msg) ; - } + { + // load posterior prob + Matrix4D prob(this->file_prob) ; + // prob.load(this->file_prob) ; size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; - - ReadModelComputer model_cp(std::move(data), - prob, - this->n_threads) ; - Matrix2D model = model_cp.get_model() ; + bool flip = n_flip == 2 ; // compute class prob - vector_d class_prob(n_class, 0.) ; - double p_tot = 0. ; + std::vector prob_colsum(n_class, 0.) ; + double tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), - model.get_ncol() + 1) ; - // 1st column contain the class prob - for(size_t i=0; i 0-based) + this->class_k -= 1 ; + if(this->class_k > n_class) + { char msg[4096] ; + sprintf(msg, "Error! invalid class of interest (%zu and %zu found)", + this->class_k, n_class) ; + throw std::invalid_argument(msg) ; } - std::cout << model_final << std::endl ; - return 0 ; + + // extend limits + int ext = n_shift - 1 ; + int ext_r = ext / 2 ; + int ext_l = ext - ext_r ; + int from2 = this->from - ext_l ; + int to2 = this->to + ext_r ; + CorrelationMatrixCreator mc(this->file_bed, + this->file_bam, + this->file_bai, + from2, + to2, + this->bin_size, + this->method) ; + Matrix2D data2 = mc.create_matrix() ; + size_t n_col2 = data2.get_ncol() ; + + // realign matrix + size_t n_col3 = this->to - this->from + 1 ; + Matrix2D data3(n_row, + n_col3, + 0.) ; + for(size_t i=0; i= to_dat2_rev; + j_dat2_rev--, j_dat3_fw++) + { data3(i,j_dat3_fw) += + (prob(i,class_k,s,1) * + data2(i,j_dat2_rev)) / + prob_colsum[class_k] ; + } + } + } + } + // clean memory + prob = Matrix4D() ; + + // convert to integer + Matrix2D data4(data3.get_nrow(), + data3.get_ncol()) ; + for(size_t i=0; i< data4.get_nrow(); i++) + { for(size_t j=0; j(&(this->file_bed)), opt_bed_msg.c_str()) - ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) - ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) - ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + ("help,h", opt_help_msg.c_str()) - ("from", po::value(&(this->from)), opt_from_msg.c_str()) - ("to", po::value(&(this->to)), opt_to_msg.c_str()) - ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) - ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) - ("method", po::value(&(method)), opt_method_msg.c_str()) + ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str()) + ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) + ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) + ("prob", po::value(&(this->file_prob)), opt_bai_msg.c_str()) - ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; + ("from", po::value(&(this->from)), opt_from_msg.c_str()) + ("to", po::value(&(this->to)), opt_to_msg.c_str()) + ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) + ("k", po::value(&(this->class_k)), opt_classk_msg.c_str()) + ("method", po::value(&(method)), opt_method_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) - { std::string msg("Error! No posterior probability file was given (--prob)!") ; + { std::string msg("Error! No probability (partition) file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } - else if(ext <= 0 and (not help)) - { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; + else if(this->class_k == 0 and (not help)) + { std::string msg("Error! no class of interest was given (--k)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } + int main(int argn, char** argv) -{ ReadModelExtenderApplication app(argn, argv) ; +{ ClassCorrelationMatrixCreatorApplication app(argn, argv) ; return app.run() ; } + diff --git a/src/Applications/CorrelationMatrixCreatorApplication.hpp b/src/Applications/ClassCorrelationMatrixCreatorApplication.hpp similarity index 55% copy from src/Applications/CorrelationMatrixCreatorApplication.hpp copy to src/Applications/ClassCorrelationMatrixCreatorApplication.hpp index 31aed3d..edaea0b 100644 --- a/src/Applications/CorrelationMatrixCreatorApplication.hpp +++ b/src/Applications/ClassCorrelationMatrixCreatorApplication.hpp @@ -1,102 +1,99 @@ -#ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP -#define CORRELATIONMATRIXCREATORAPPLICATION_HPP +#ifndef CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP +#define CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP #include -#include // CorrelationMatrixCreator::methods #include -/*! - * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a - * CorrelationMatrixCreator instance creating an autonomous application to - * compute a count matrix from a BAM file by directly passing all the options - * and parameters from the command line. - */ -class CorrelationMatrixCreatorApplication: public ApplicationInterface +#include // CorrelationMatrixCreator::methods + +class ClassCorrelationMatrixCreatorApplication : public ApplicationInterface { public: - CorrelationMatrixCreatorApplication() = delete ; - CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ; + ClassCorrelationMatrixCreatorApplication() = delete ; + ClassCorrelationMatrixCreatorApplication( + const ClassCorrelationMatrixCreatorApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ - CorrelationMatrixCreatorApplication(int argn, char** argv) ; + ClassCorrelationMatrixCreatorApplication(int argn, char** argv) ; /*! - * \brief Runs the application. A matrix containing the - * read densities around the center of the bed regions, +/- - * the given offsets is created by searching the fasta file - * and printed on the stdout. - * The matrix is a 2D matrix with dimensions : - * 1) number of regions - * 2) length of region (to - from + 1) / bin_size. + * \brief TOFO * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the bam file. */ std::string file_bam ; /*! * \brief the path to the bam index file. */ std::string file_bai ; /*! - * \brief a relative coordinate indicating the - * most downstream position to consider around - * each region in the bed file. + * \brief the path to the posterior probability + * file (the partition). + */ + std::string file_prob ; + /*! + * \brief the coordinate of the most upstream + * position that was in the original matrix, in + * relative coordinate. */ int from ; /*! - * \brief a relative coordinate indicating the - * most upstream position to consider around - * each region in the bed file. + * \brief the coordinate of the most downstream + * position that was in the original matrix, in + * relative coordinate. */ int to ; /*! - * \brief the size of the bin that will be used - * to bin the signal in the regions [from,to] around - * each region in the bed file. + * \brief the size of the bin that was used for + * the original matrix. */ int bin_size ; + /*! + * \brief the class of interest (0-based). + */ + size_t class_k ; /*! * \brief How to consider the sequenced fragments when computing * the bin values. */ CorrelationMatrixCreator::methods method ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; - -#endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP +#endif // CLASSCORREALTIONMATRIXCREATORAPPLICATION_HPP diff --git a/src/Applications/CorrelationMatrixCreatorApplication.hpp b/src/Applications/CorrelationMatrixCreatorApplication.hpp index 31aed3d..b108055 100644 --- a/src/Applications/CorrelationMatrixCreatorApplication.hpp +++ b/src/Applications/CorrelationMatrixCreatorApplication.hpp @@ -1,102 +1,102 @@ #ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP #define CORRELATIONMATRIXCREATORAPPLICATION_HPP #include #include // CorrelationMatrixCreator::methods #include /*! * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a * CorrelationMatrixCreator instance creating an autonomous application to * compute a count matrix from a BAM file by directly passing all the options * and parameters from the command line. */ class CorrelationMatrixCreatorApplication: public ApplicationInterface { public: CorrelationMatrixCreatorApplication() = delete ; CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ CorrelationMatrixCreatorApplication(int argn, char** argv) ; /*! * \brief Runs the application. A matrix containing the * read densities around the center of the bed regions, +/- - * the given offsets is created by searching the fasta file + * the given offsets is created by searching the BAM file * and printed on the stdout. * The matrix is a 2D matrix with dimensions : * 1) number of regions * 2) length of region (to - from + 1) / bin_size. * \return an exit code EXIT_SUCCESS or EXIT_FAILURE * to return to the OS. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the bam file. */ std::string file_bam ; /*! * \brief the path to the bam index file. */ std::string file_bai ; /*! * \brief a relative coordinate indicating the * most downstream position to consider around * each region in the bed file. */ int from ; /*! * \brief a relative coordinate indicating the * most upstream position to consider around * each region in the bed file. */ int to ; /*! * \brief the size of the bin that will be used * to bin the signal in the regions [from,to] around * each region in the bed file. */ int bin_size ; /*! * \brief How to consider the sequenced fragments when computing * the bin values. */ CorrelationMatrixCreator::methods method ; /*! * \brief a flag indicating whether the core of run() can be * run or not. */ bool runnable ; } ; #endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP diff --git a/src/Applications/EMJointApplication.cpp b/src/Applications/EMJointApplication.cpp index 5252ef2..e0ba459 100644 --- a/src/Applications/EMJointApplication.cpp +++ b/src/Applications/EMJointApplication.cpp @@ -1,194 +1,202 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include namespace po = boost::program_options ; +template +std::ostream& operator << (std::ostream& stream, + const std::vector& v) +{ for(const auto& x : v) + { stream << x << " " ; } + return stream ; +} EMJointApplication::EMJointApplication(int argn, char** argv) : files_read(""), file_sequence(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMJointApplication::run() { if(this->runnable) { // read data std::vector read_paths ; boost::split(read_paths, this->files_read, [](char c){return c == ',';}) ; + std::vector> data_read ; for(const auto& path : read_paths) { if(path == "") { continue ; } data_read.push_back(Matrix2D(path)) ; } // sequence data EMJoint* em = nullptr ; if(this->file_sequence == "") { em = new EMJoint(std::move(data_read), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } else { Matrix2D data_seq(this->file_sequence) ; em = new EMJoint(std::move(data_read), std::move(data_seq), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } em->classify() ; em->get_post_prob().save(this->file_out) ; delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMJointApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMJoint is a probabilistic partitioning algorithm that \n" "sofetly assigns genomic regions to classes given 1) the shapes \n" "of the read densities over the regions and 2) the region sequence \n" "motif contents. \n " "The assignment probabilitiesare returned through stdout.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations, \n" "by default 0 (no parallelization)." ; std::string opt_read_msg = "A coma separated list of paths to the file containing the \n" "read density data. At least one path is needed." ; std::string opt_seq_msg = "The path to the file containing the sequence data. If no path is \n" "given, the classification is only cares about the read density \n" "shapes." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting " "freedom. By default, shifting is " "disabled (equivalent to --shift 1)." ; std::string opt_flip_msg = "Enables flipping."; std::string opt_bckg_msg = "Adds a class to model the sequence and the read signal background. This\n" "class contains sequence background probabilies (for the sequence model)\n" "and the mean number of reads (for the read count models) at each position\n" "and is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("read", po::value(&(this->files_read)), opt_read_msg.c_str()) ("seq", po::value(&(this->file_sequence)), opt_read_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->files_read == "" and this->file_sequence == "" and (not help)) { std::string msg("Error! No data were given (--read and --seq)!") ; throw std::invalid_argument(msg) ; } if(this->files_read == "" and (not help)) { std::string msg("Error! No read density data were given (--read)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { EMJointApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ProbToModelApplication.cpp b/src/Applications/ProbToModelApplication.cpp index 65334f9..b8227a4 100644 --- a/src/Applications/ProbToModelApplication.cpp +++ b/src/Applications/ProbToModelApplication.cpp @@ -1,214 +1,223 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include namespace po = boost::program_options ; typedef std::vector vector_d ; ProbToModelApplication::ProbToModelApplication(int argn, char** argv) - : file_read(""), file_seq(""), file_prob(""), + : file_read(""), file_seq(""), file_prob(""), bckg_class(false), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ProbToModelApplication::~ProbToModelApplication() {} int ProbToModelApplication::run() { if(this->runnable) { // load data std::string file_data ; bool read_data = false ; bool seq_data = false ; if(this->file_read != "") { file_data = this->file_read ; read_data = true ; seq_data = false ; } else if(this->file_seq != "") { file_data = this->file_seq ; read_data = false ; seq_data = true ; } else { std::string msg("Error! Could not determine the type of the data!") ; throw std::runtime_error(msg) ; } Matrix2D data(file_data) ; // Matrix4D prob(this->file_prob) ; Matrix4D prob ; prob.load(this->file_prob) ; if(data.get_nrow() != prob.get_dim()[0]) { char msg[4096] ; sprintf(msg, "Error! data and prob matrices have unequal " "row numbers (%zu / %zu)!", data.get_nrow(), prob.get_dim()[0]) ; throw std::runtime_error(msg) ; } else if(data.get_ncol() < prob.get_dim()[2]) { char msg[4096] ; sprintf(msg, "Error! too many shift states for the data!" "%zu shift states and %zu columns in data)!", prob.get_dim()[2], data.get_ncol()) ; throw std::runtime_error(msg) ; } // get the data model ModelComputer* ptr = nullptr ; if(read_data) { ptr = new ReadModelComputer(std::move(data), prob, + this->bckg_class, this->n_threads) ; } else if(seq_data) { ptr = new SequenceModelComputer(std::move(data), prob, + this->bckg_class, this->n_threads) ; } Matrix2D model = ptr->get_model() ; delete ptr ; ptr = nullptr ; // compute the class prob size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob if(read_data) { for(size_t i=0; i(&(this->file_read)), opt_read_msg.c_str()) ("seq,", po::value(&(this->file_seq)), opt_seq_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) + ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; - // checks unproper option settings if((this->file_read == "") and (this->file_seq == "") and (not help)) { std::string msg("Error! No data file was given (--read or --seq)!") ; throw std::invalid_argument(msg) ; } else if((this->file_read != "") and (this->file_seq != "") and (not help)) { std::string msg("Error! --read and --seq are mutually exclusive!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probabily file was given (--prob)!") ; throw std::invalid_argument(msg) ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } + // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ProbToModelApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ProbToModelApplication.hpp b/src/Applications/ProbToModelApplication.hpp index 66d2541..0ac126f 100644 --- a/src/Applications/ProbToModelApplication.hpp +++ b/src/Applications/ProbToModelApplication.hpp @@ -1,82 +1,88 @@ #ifndef PROBTOMODELAPPLICATION_HPP #define PROBTOMODELAPPLICATION_HPP #include #include #include /*! * \brief The ProbToModelApplication class is a wrapper around an ModelReferenceComputer * instance creating an autonomous application to compute data models given the * data and the results of the classification procedure (the posterior probability * matrix). */ class ProbToModelApplication : public ApplicationInterface { public: ProbToModelApplication() = delete ; ProbToModelApplication(const ProbToModelApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ ProbToModelApplication(int argn, char** argv) ; /*! * \brief Destructor. */ virtual ~ProbToModelApplication() override ; /*! * \brief Runs the application. The data model * is computed and displayed through the * stdout. * \return the exit code. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the file containing the * read count data. */ std::string file_read ; /*! * \brief the path to the file containing the * sequence data. */ std::string file_seq ; /*! * \brief the path to the file containing the * classification posterior probabilities. */ std::string file_prob ; + /*! + * \brief whether the last class of the + * classification (posterior probabilities) is a + * background class. + */ + bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief whether run() can be called. */ bool runnable ; } ; #endif // PROBTOMODELAPPLICATION_HPP diff --git a/src/Applications/ReadModelExtenderApplication.cpp b/src/Applications/ReadModelExtenderApplication.cpp index bf1d9b6..cc4f368 100644 --- a/src/Applications/ReadModelExtenderApplication.cpp +++ b/src/Applications/ReadModelExtenderApplication.cpp @@ -1,273 +1,282 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include namespace po = boost::program_options ; // the valid values for --method option std::string method_read = "read" ; std::string method_read_atac = "read_atac" ; std::string method_fragment = "fragment" ; std::string method_fragment_center = "fragment_center" ; ReadModelExtenderApplication::ReadModelExtenderApplication(int argn, char** argv) : file_bed(""), file_bam(""), file_bai(""), file_prob(""), from(0), to(0), ext(0), bin_size(0), - method(CorrelationMatrixCreator::FRAGMENT), + method(CorrelationMatrixCreator::FRAGMENT), bckg_class(false), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } ReadModelExtenderApplication::~ReadModelExtenderApplication() {} int ReadModelExtenderApplication::run() { if(this->runnable) { // extend limits int ext_right = this->ext/2 ; int ext_left = this->ext - ext_right ; this->from -= ext_left ; this->to += ext_right ; // create extended matrix CorrelationMatrixCreator mc(this->file_bed, this->file_bam, this->file_bai, this->from, this->to, this->bin_size, this->method) ; Matrix2D data = mc.create_matrix() ; // compute model // Matrix4D prob(this->file_prob) ; Matrix4D prob ; prob.load(this->file_prob) ; if(prob.get_dim()[0] != data.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! data matrix and probability matrix have " "unequal row numbers (%zu and %zu)", prob.get_dim()[0], data.get_nrow()) ; throw std::invalid_argument(msg) ; } size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; ReadModelComputer model_cp(std::move(data), prob, + this->bckg_class, this->n_threads) ; Matrix2D model = model_cp.get_model() ; // compute class prob vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob for(size_t i=0; i(&(this->file_bed)), opt_bed_msg.c_str()) ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str()) ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) + ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) ("binSize", po::value(&(this->bin_size)), opt_binsize_msg.c_str()) ("method", po::value(&(method)), opt_method_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bam == "" and (not help)) { std::string msg("Error! No BAM file was given (--bam)!") ; throw std::invalid_argument(msg) ; } else if(this->file_bai == "" and (not help)) { std::string msg("Error! No BAM index file was given (--bai)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probability file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(ext <= 0 and (not help)) { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; throw std::invalid_argument(msg) ; } else if(this->bin_size <= 0 and (not help)) { std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ; throw std::invalid_argument(msg) ; } else if(method != method_read and method != method_read_atac and method != method_fragment and method != method_fragment_center) { char msg[4096] ; sprintf(msg, "Error! method should be %s, %s, %s or %s (--method)", method_read.c_str(), method_read_atac.c_str(), method_fragment.c_str(), method_fragment_center.c_str()) ; throw std::invalid_argument(msg) ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } + // set method if(method == method_read) { this->method = CorrelationMatrixCreator::READ ; } else if(method == method_read_atac) { this->method = CorrelationMatrixCreator::READ_ATAC ; } else if(method == method_fragment) { this->method = CorrelationMatrixCreator::FRAGMENT ; } else if(method == method_fragment_center) { this->method = CorrelationMatrixCreator::FRAGMENT_CENTER ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { ReadModelExtenderApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/ReadModelExtenderApplication.hpp b/src/Applications/ReadModelExtenderApplication.hpp index 5fc8379..e27701c 100644 --- a/src/Applications/ReadModelExtenderApplication.hpp +++ b/src/Applications/ReadModelExtenderApplication.hpp @@ -1,122 +1,127 @@ #ifndef READMODELEXTENDERAPPLICATION_HPP #define READMODELEXTENDERAPPLICATION_HPP #include #include #include #include /*! * \brief The ReadModelExtenderApplication class is a class implementing an * application to extend a read model of length L' (L' = L - S + 1 * where L is the number of column of the data matrix and S the * shifting freedom allowed during the classification) to a new model * length L'' = L' + E (E is the number of columns to add to the * model) given the data matrix and the results of the classification * (posterior probability matrix). * To do this, the read count matrix from which the original model * was computed is extended (0.5*E columns on each side) and a model * is computed using the new matrix and the given posterior probabities. * The extended model is returned through the stdout. */ class ReadModelExtenderApplication : public ApplicationInterface { public: ReadModelExtenderApplication() = delete ; ReadModelExtenderApplication(const ReadModelExtenderApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ ReadModelExtenderApplication(int argn, char** argv) ; /*! * \brief Destructor. */ virtual ~ReadModelExtenderApplication() override ; /*! * \brief Runs the application. The data new model * is computed and displayed through the * stdout. * \return the exit code. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the bam file. */ std::string file_bam ; /*! * \brief the path to the bam index file. */ std::string file_bai ; /*! * \brief the path to the file containing the * classification posterior probabilities. */ std::string file_prob ; /*! * \brief a relative coordinate indicating the * most downstream position to consider around * each region in the bed file. */ int from ; /*! * \brief a relative coordinate indicating the * most upstream position to consider around * each region in the bed file. */ int to ; /*! * \brief the number of columns to add to the * matrix (half of this value on each side). */ int ext ; /*! * \brief the size of the bin that will be used * to bin the signal in the regions [from,to] around * each region in the bed file. */ int bin_size ; /*! * \brief How to consider the sequenced fragments when computing * the bin values. */ CorrelationMatrixCreator::methods method ; - + /*! + * \brief whether the last class of the + * classification (posterior probabilities) is a + * background class. + */ + bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief whether run() can be called. */ bool runnable ; } ; #endif // READMODELEXTENDERAPPLICATION_HPP diff --git a/src/Applications/SequenceModelExtenderApplication.cpp b/src/Applications/SequenceModelExtenderApplication.cpp index 1d50194..355d2e1 100644 --- a/src/Applications/SequenceModelExtenderApplication.cpp +++ b/src/Applications/SequenceModelExtenderApplication.cpp @@ -1,216 +1,225 @@ #include #include #include #include #include // std::move() #include // std::invalid_argument, std::runtime_error #include #include #include #include #include namespace po = boost::program_options ; SequenceModelExtenderApplication::SequenceModelExtenderApplication(int argn, char** argv) : file_bed(""), file_fasta(""), file_prob(""), - from(0), to(0), ext(0), + from(0), to(0), ext(0), bckg_class(false), n_threads(0), runnable(false) { this->parseOptions(argn, argv) ; } SequenceModelExtenderApplication::~SequenceModelExtenderApplication() {} int SequenceModelExtenderApplication::run() { if(this->runnable) { // extend limits int ext_right = this->ext/2 ; int ext_left = this->ext - ext_right ; this->from -= ext_left ; this->to += ext_right ; // create extended matrix SequenceMatrixCreator mc(this->file_bed, this->file_fasta, this->from, this->to) ; Matrix2D data = mc.create_matrix() ; // compute model // Matrix4D prob(this->file_prob) ; Matrix4D prob ; prob.load(this->file_prob) ; if(prob.get_dim()[0] != data.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! data matrix and probability matrix have " "unequal row numbers (%zu and %zu)", prob.get_dim()[0], data.get_nrow()) ; throw std::invalid_argument(msg) ; } size_t n_row = prob.get_dim()[0] ; size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; SequenceModelComputer model_cp(std::move(data), prob, + this->bckg_class, this->n_threads) ; Matrix2D model = model_cp.get_model() ; // compute class prob vector_d class_prob(n_class, 0.) ; double p_tot = 0. ; for(size_t i=0; i model_final(model.get_nrow(), model.get_ncol() + 1) ; // 1st column contain the class prob size_t i_class = 0 ; for(size_t i=0; i(&(this->file_bed)), opt_bed_msg.c_str()) ("fasta", po::value(&(this->file_fasta)), opt_fasta_msg.c_str()) ("prob,", po::value(&(this->file_prob)), opt_prob_msg.c_str()) + ("bgclass", opt_bckg_msg.c_str()) + ("from", po::value(&(this->from)), opt_from_msg.c_str()) ("to", po::value(&(this->to)), opt_to_msg.c_str()) ("ext", po::value(&(this->ext)), opt_ext_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_bed == "" and (not help)) { std::string msg("Error! No BED file was given (--bed)!") ; throw std::invalid_argument(msg) ; } else if(this->file_fasta == "" and (not help)) { std::string msg("Error! No fasta file was given (--fasta)!") ; throw std::invalid_argument(msg) ; } else if(this->file_prob == "" and (not help)) { std::string msg("Error! No posterior probability file was given (--prob)!") ; throw std::invalid_argument(msg) ; } else if(this->from == 0 and this->to == 0 and (not help)) { std::string msg("Error! No range given (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(this->from >= this->to and (not help)) { std::string msg("Error! from shoud be smaller than to (--from and --to)!") ; throw std::invalid_argument(msg) ; } else if(ext <= 0 and (not help)) { std::string msg("Error! the number of columns to add should be > 0 (--ext)!") ; throw std::invalid_argument(msg) ; } + // set background class + if(vm.count("bgclass")) + { this->bckg_class = true ; } + // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } int main(int argn, char** argv) { SequenceModelExtenderApplication app(argn, argv) ; return app.run() ; } diff --git a/src/Applications/SequenceModelExtenderApplication.hpp b/src/Applications/SequenceModelExtenderApplication.hpp index 6bc00d7..a97b25d 100644 --- a/src/Applications/SequenceModelExtenderApplication.hpp +++ b/src/Applications/SequenceModelExtenderApplication.hpp @@ -1,107 +1,113 @@ #ifndef SEQUENCEMODELEXTENDERAPPLICATION_HPP #define SEQUENCEMODELEXTENDERAPPLICATION_HPP #include #include #include #include /*! * \brief The SequenceModelExtenderApplication class is a class implementing an * application to extend a sequence model of length L' (L' = L - S + 1 * where L is the number of column of the sequence matrix and S the * shifting freedom allowed during the classification) to a new model * length L'' = L' + E (E is the number of columns to add to the * model) given the data matrix and the results of the classification * (posterior probability matrix). * To do this, the sequence count matrix from which the original model * was computed is extended (0.5*E columns on each side) and a model * is computed using the new matrix and the given posterior probabities. * The extended model is returned through the stdout. */ class SequenceModelExtenderApplication : public ApplicationInterface { public: SequenceModelExtenderApplication() = delete ; SequenceModelExtenderApplication(const SequenceModelExtenderApplication& app) = delete ; /*! * \brief Constructs an object from the command line * options. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. */ SequenceModelExtenderApplication(int argn, char** argv) ; /*! * \brief Destructor. */ virtual ~SequenceModelExtenderApplication() override ; /*! * \brief Runs the application. The data new model * is computed and displayed through the * stdout. * \return the exit code. */ virtual int run() override ; private: /*! * \brief Parses the program command line options and * sets the object field accordingly. * If the help option is detected, the "runnable" * field is set to false and subsequent calls to * run() will produce nothing. * \param argn the number of options passed to the * main() function. * \param argv the vector of options passed to the * main() function. * \throw std::invalid_argument if an error is found * in the program options. */ void parseOptions(int argn, char** argv) ; /*! * \brief the path to the bed file. */ std::string file_bed ; /*! * \brief the path to the fasta file * containing the sequences. */ std::string file_fasta ; /*! * \brief the path to the file containing the * classification posterior probabilities. */ std::string file_prob ; /*! * \brief a relative coordinate indicating the * most downstream position to consider around * each region in the bed file. */ int from ; /*! * \brief a relative coordinate indicating the * most upstream position to consider around * each region in the bed file. */ int to ; /*! * \brief the number of columns to add to the * matrix (half of this value on each side). */ int ext ; + /*! + * \brief whether the last class of the + * classification (posterior probabilities) is a + * background class. + */ + bool bckg_class ; /*! * \brief the number of threads. */ size_t n_threads ; /*! * \brief whether run() can be called. */ bool runnable ; } ; #endif // SEQUENCEMODELEXTENDERAPPLICATION_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 245382c..4b0d893 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,122 +1,128 @@ # compiler options add_compile_options(-std=c++14) add_compile_options(-O3) add_compile_options(-Wall) add_compile_options(-Wextra) add_compile_options(-Werror) add_compile_options(-Wfatal-errors) add_compile_options(-pedantic) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") add_definitions (${SEQAN_DEFINITIONS}) # include file location include_directories(${Boost_INCLUDE_DIRS}) include_directories(${SEQAN_INCLUDE_DIRS}) include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/Clustering") include_directories("${scATACseq_SOURCE_DIR}/src/Random") include_directories("${scATACseq_SOURCE_DIR}/src/Parallel") include_directories("${scATACseq_SOURCE_DIR}/src/Statistics") include_directories("${scATACseq_SOURCE_DIR}/src/GUI") include_directories("${scATACseq_SOURCE_DIR}/src/Applications") include_directories("${scATACseq_SOURCE_DIR}/src/Matrix") include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools") include_directories("${scATACseq_SOURCE_DIR}/src/Utility") # compile modules into static libraries ## set output directory set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib") ## build instructions add_library(Clustering "Clustering/DataLayer.cpp" "Clustering/ReadLayer.cpp" "Clustering/SequenceLayer.cpp" "Clustering/ModelComputer.cpp" "Clustering/ReadModelComputer.cpp" "Clustering/SequenceModelComputer.cpp" "Clustering/EMBase.cpp" "Clustering/EMRead.cpp" "Clustering/EMSequence.cpp" "Clustering/EMJoint.cpp") add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp") add_library(Parallel "Parallel/ThreadPool.cpp") add_library(Statistics "Statistics/Statistics.cpp") add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp") add_library(GenomicTools "GenomicTools/MatrixCreator.cpp" "GenomicTools/ReadMatrixCreator.cpp" "GenomicTools/CorrelationMatrixCreator.cpp" "GenomicTools/SequenceMatrixCreator.cpp" "GenomicTools/GenomeRegion.cpp") add_library(Utility "Utility/matrices.cpp" "Utility/dna_utility.cpp") ## resolve dependencies target_link_libraries(Utility ${SEQAN_LIBRARIES}) target_link_libraries(Clustering Utility Random Statistics GUI Parallel ${SEQAN_LIBRARIES}) target_link_libraries(Parallel Threads::Threads) target_link_libraries(GenomicTools Utility ${SEQAN_LIBRARIES}) # executables ## a toy for seqan set(EXE_MAIN_TEST "main_test") add_executable(${EXE_MAIN_TEST} "main_test.cpp") target_link_libraries(${EXE_MAIN_TEST} GenomicTools Clustering) set_target_properties(${EXE_MAIN_TEST} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## a toy for correlation matrix set(EXE_MAIN_CORMAT "main_cormat") add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp") target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} Utility GenomicTools Random) set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a matrix from BED and a BAM file set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator") add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an application to create a sequence matrix from BED and a fasta file set(EXE_MAIN_SEQMATRIX "SequenceMatrixCreator") add_executable(${EXE_MAIN_SEQMATRIX} "Applications/SequenceMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_MAIN_SEQMATRIX} GenomicTools Utility Boost::program_options) set_target_properties(${EXE_MAIN_SEQMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMRead standalone set(EXE_EMREAD "EMRead") add_executable(${EXE_EMREAD} "Applications/EMReadApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMREAD} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMREAD} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMSequence standalone set(EXE_EMSEQ "EMSequence") add_executable(${EXE_EMSEQ} "Applications/EMSequenceApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMSEQ} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMSEQ} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an EMJoint standalone set(EXE_EMJOINT "EMJoint") add_executable(${EXE_EMJOINT} "Applications/EMJointApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_EMJOINT} Clustering Utility Boost::program_options) set_target_properties(${EXE_EMJOINT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to compute data models from the data and the post prob of an EM classification set(EXE_PROB2REF "ProbToModel") add_executable(${EXE_PROB2REF} "Applications/ProbToModelApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_PROB2REF} Clustering Utility Boost::program_options) set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_READMODELEXTENDER "ReadModelExtender") add_executable(${EXE_READMODELEXTENDER} "Applications/ReadModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_READMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_READMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") ## an executable to extend read models from an EM classification set(EXE_SEQUENCEMODELEXTENDER "SequenceModelExtender") add_executable(${EXE_SEQUENCEMODELEXTENDER} "Applications/SequenceModelExtenderApplication.cpp" "Applications/ApplicationInterface.cpp") target_link_libraries(${EXE_SEQUENCEMODELEXTENDER} Clustering GenomicTools Utility Boost::program_options) set_target_properties(${EXE_SEQUENCEMODELEXTENDER} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") +## an executable to extend read models from an EM classification +set(EXE_CLASSCORRELATIONMATRIXCREATOR "ClassCorrelationMatrixCreator") +add_executable(${EXE_CLASSCORRELATIONMATRIXCREATOR} "Applications/ClassCorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp") +target_link_libraries(${EXE_CLASSCORRELATIONMATRIXCREATOR} Clustering GenomicTools Utility Boost::program_options) +set_target_properties(${EXE_CLASSCORRELATIONMATRIXCREATOR} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") + ## a test suite set(EXE_TESTS "unittests") add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp" "Unittests/unittests_genomictools.cpp") target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools) set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin") diff --git a/src/Clustering/ConsensusSequenceLayer.cpp b/src/Clustering/ConsensusSequenceLayer.cpp new file mode 100644 index 0000000..e69de29 diff --git a/src/Clustering/ConsensusSequenceLayer.hpp b/src/Clustering/ConsensusSequenceLayer.hpp new file mode 100644 index 0000000..f5edc48 --- /dev/null +++ b/src/Clustering/ConsensusSequenceLayer.hpp @@ -0,0 +1,93 @@ +#ifndef CONSENSUSSEQUENCELAYER_HPP +#define CONSENSUSSEQUENCELAYER_HPP + +#include + +class ConsensusSequenceLayer : public SequenceLayer +{ + public: + /*! + * \brief Constructs an object with the + * given data and an empty (0 values) + * model. + * \param data the data. + * \param n_class the number of classes + * of the model. + * \param n_shift the number of shift + * states of the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + ConsensusSequenceLayer(const Matrix3D& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) ; + + /*! + * \brief Constructs an object with the + * given data and an empty (0 values) + * model. + * \param data the data. + * \param n_class the number of classes + * of the model. + * \param n_shift the number of shift + * states of the model. + * \param flip whether flipping is allowed. + * \param bckg_class should the last class + * of the model is constant and models the + * background. + */ + ConsensusSequenceLayer(Matrix2D&& data, + size_t n_class, + size_t n_shift, + bool flip, + bool bckg_class) ; + + /*! + * \brief Destructor. + */ + virtual ~ConsensusSequenceLayer() ; + + /*! + * \brief Computes the log likelihood of the data + * given the current model parameters. + * \param logliklihood a matrix to store the + * results. It should have the following dimensions : + * 1st : same as the data number of row + * 2nd : same as the model number of classes + * 3rd : same as the number of shifts + * 4th : same as the number of flip states + * \param loglikelihood_max a vector containing the + * max value for each row of loglikelihood. + * Its length should be equal to the data row number. + * \throw std::invalid_argument if the dimensions are + * incorrect. + */ + virtual void compute_loglikelihoods(Matrix4D& loglikelihood, + vector_d& loglikelihood_max, + ThreadPool* threads=nullptr) const override ; + + /*! + * \brief Updates the model given the posterior + * probabilities (the probabilities of each row + * in the data to be assigned to each class, + * for each shift and flip state). + * \param posterior_prob the data assignment probabilities to + * the different classes. + */ + virtual void update_model(const Matrix4D& posterior_prob, + ThreadPool* threads=nullptr) override ; + + + protected: + + + + +} ; + + +#endif // CONSENSUSSEQUENCELAYER_HPP diff --git a/src/Clustering/DataLayer.cpp b/src/Clustering/DataLayer.cpp index e69ac60..973a695 100644 --- a/src/Clustering/DataLayer.cpp +++ b/src/Clustering/DataLayer.cpp @@ -1,215 +1,192 @@ #include #include // std::invalid_argument #include // log() #include #include #include #include DataLayer::DataLayer() {} DataLayer::DataLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) + bool bckg_class) :data(data), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(n_class), l_model(n_col - n_shift + 1), n_shift(n_shift), n_flip(flip + 1), - last_class_cst(last_class_cst) + bckg_class(bckg_class) { // models cannot be initialise here // as the number of categories depend // on the exact class } DataLayer::DataLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) + bool bckg_class) :data(data), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(n_class), l_model(n_col - n_shift + 1), n_shift(n_shift), n_flip(flip + 1), - last_class_cst(last_class_cst) + bckg_class(bckg_class) { // models cannot be initialise here // as the number of categories depend // on the exact class } DataLayer::DataLayer(const Matrix2D& data, const Matrix3D& model, bool flip, - bool last_class_cst) + bool bckg_class) : data(data), model(model), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(this->model.get_dim()[0]), l_model(this->model.get_dim()[1]), n_category(this->model.get_dim()[2]), n_shift(n_col - l_model + 1), n_flip(flip + 1), - last_class_cst(last_class_cst) + bckg_class(bckg_class) { // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } this->n_shift = this->n_col - this->l_model + 1 ; } DataLayer::DataLayer(Matrix2D&& data, Matrix3D&& model, bool flip, - bool last_class_cst) + bool bckg_class) : data(data), model(model), flip(flip), n_row(this->data.get_nrow()), n_col(this->data.get_ncol()), n_class(this->model.get_dim()[0]), l_model(this->model.get_dim()[1]), n_category(this->model.get_dim()[2]), n_shift(n_col - l_model + 1), n_flip(flip + 1), - last_class_cst(last_class_cst) + bckg_class(bckg_class) { // check if model is not too long if(this->n_col < this->l_model) { char msg[4096] ; sprintf(msg, "Error! model is longer than data : %zu / %zu", this->l_model, this->n_col) ; throw std::invalid_argument(msg) ; } this->n_shift = this->n_col - this->l_model + 1 ; } DataLayer::~DataLayer() {} -void DataLayer::set_class(size_t i, const Matrix2D& class_model) -{ // check dimensions - if(class_model.get_nrow() != this->n_category) - { char msg[4096] ; - sprintf(msg, "Error! the given class model is incompatible " - "with the model (%zu rows instead of %zu)", - class_model.get_nrow(), this->n_category) ; - throw std::invalid_argument(msg) ; - } - else if(class_model.get_ncol() != this->l_model) - { char msg[4096] ; - sprintf(msg, "Error! the given class model is incompatible " - "with the model (%zu columns instead of %zu)", - class_model.get_ncol(), this->l_model) ; - throw std::invalid_argument(msg) ; - } - - for(size_t j=0; jmodel(i,j,k) = class_model(k,j) ; } - } -} - Matrix3D DataLayer::get_model() const { return this->model ; } void DataLayer::check_loglikelihood_dim(const Matrix4D& loglikelihood) const { if(loglikelihood.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 1st dimension is not " "equal to data row number : %zu / %zu", loglikelihood.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 2nd dimension is not " "equal to model class number : %zu / %zu", loglikelihood.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", loglikelihood.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } else if(loglikelihood.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! loglikelihood matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", loglikelihood.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } void DataLayer::check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const { if(loglikelihood_max.size() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! loglikelihood_max length is not " "equal to data row number : %zu / %zu", loglikelihood_max.size(), this->n_flip) ; throw std::invalid_argument(msg) ; } } void DataLayer::check_posterior_prob_dim(const Matrix4D& posterior_prob) const { if(posterior_prob.get_dim()[0] != this->n_row) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 1st dimension is not " "equal to data row number : %zu / %zu", posterior_prob.get_dim()[0], this->n_row) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[1] != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 2nd dimension is not " "equal to model class number : %zu / %zu", posterior_prob.get_dim()[1], this->n_class) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[2] != this->n_shift) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 3rd dimension is not " "equal to model shift state number : %zu / %zu", posterior_prob.get_dim()[2], this->n_shift) ; throw std::invalid_argument(msg) ; } else if(posterior_prob.get_dim()[3] != this->n_flip) { char msg[4096] ; sprintf(msg, "Error! posterior_prob matrix 4th dimension is not " "equal to model flip state number : %zu / %zu", posterior_prob.get_dim()[3], this->n_flip) ; throw std::invalid_argument(msg) ; } } const double DataLayer::p_min = 1e-100 ; const double DataLayer::p_min_log = log(DataLayer::p_min) ; diff --git a/src/Clustering/DataLayer.hpp b/src/Clustering/DataLayer.hpp index 3fbc159..aae52db 100644 --- a/src/Clustering/DataLayer.hpp +++ b/src/Clustering/DataLayer.hpp @@ -1,298 +1,279 @@ #ifndef DATALAYER_HPP #define DATALAYER_HPP #include #include // std::promise, std::future #include #include #include #include typedef std::vector vector_d ; /*! * \brief The DataLayer class define the basic design * to handle probabilistic models together with * their data. * A DataLayer is made of two parts : * 1) a data matrix * 2) a model * The model contains the parameters of a probabilistic * model with one or more classes that fits the data. * The data likelihood given the model can be computed * and the model can be updated given a set of * posterior probabilities representing the data * assignments to the different classes. */ class DataLayer { public: /*! * \brief the smallest acceptable probability * for computations. */ static const double p_min ; /*! * \brief the log of the smallest probability. */ static const double p_min_log ; /*! * \brief The possible flip states. */ enum flip_states{FORWARD=0, REVERSE} ; /*! * \brief Default constructor. */ DataLayer() ; /*! * \brief Constructs an object with the * given data. * An empty model is not initialised yet * as the model number of categories * depends on the final class. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ DataLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Constructs an object with the * given data. * An empty model is not initialised yet * as the model number of categories * depends on the final class. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ DataLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Constructs an object with the * given data and model. * The model dimensions set the number of * classes and the shifting freedom. * \param data the data. * \param the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ DataLayer(const Matrix2D& data, const Matrix3D& model, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Constructs an object with the * given data and model. * The model dimensions set the number of * classes and the shifting freedom. * \param data the data. * \param the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ DataLayer(Matrix2D&& data, Matrix3D&& model, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Destructor. */ virtual ~DataLayer() ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const = 0 ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) = 0 ; - /*! - * \brief Modify the values of th given class - * with the given parameters. - * \param i the index of the class to modify, 0-based. - * \param class_model the class model parameters. - * Its dimensions should be: - * 1st : , 4 for sequence models (A,C,G,T), - * 1 for signal models. - * 2nd : the model length. - * \throw std::invalid_argument if the dimensions are not - * compatible with the current model classes. - */ - virtual void set_class(size_t i, - const Matrix2D& class_model) ; - /*! * \brief Returns a copy of the current model. * \return the current model. * 1st dim : the number of classes * 2nd dim : the model length * 3rd dim : the number of value categories. */ virtual Matrix3D get_model() const ; protected: /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_loglikelihood_dim(const Matrix4D& loglikelihood) const ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * It should have a length equal to the number of * the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_loglikelihood_max_dim(const vector_d& loglikelihood_max) const ; /*! * \brief Checks the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_prob a matrix to store the * results. It should have the following dimensions : * 1st : same as the data row number * 2nd : same as the model class number * 3rd : same as the shift state number * 4th : same as the flip state number * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_posterior_prob_dim(const Matrix4D& posterior_prob) const ; /*! * \brief the data. */ Matrix2D data ; /*! * \brief the data model. */ Matrix3D model ; /*! * \brief whether flip is enabled. */ bool flip ; /*! * \brief the number of row in the data. */ size_t n_row ; /*! * \brief the number of columns in the data. */ size_t n_col ; /*! * \brief the number of classes in the model. */ size_t n_class ; /*! * \brief the model length, its 2nd dimension. */ size_t l_model ; /*! * \brief the number of variable categories in * the data. This is also the model 3rd * dimension. * Read counts are quantitative values and * have a number of categories equal to one * whereas as DNA sequences are made of * A,C,G,T (at least) and have 4 different * categories. */ size_t n_category ; /*! * \brief the number of shift states. */ size_t n_shift ; /*! * \brief the number of flip states. */ size_t n_flip ; /*! * \brief A flag indicating that the last class of the model - * is constant and should not be updated when calling - * update_model(). + * is modelling the background. This class is considered constant + * and should not be updated when calling update_model(). */ - bool last_class_cst ; + bool bckg_class ; } ; #endif // DATALAYER_HPP diff --git a/src/Clustering/EMJoint.cpp b/src/Clustering/EMJoint.cpp index afef19b..d3efbbf 100644 --- a/src/Clustering/EMJoint.cpp +++ b/src/Clustering/EMJoint.cpp @@ -1,660 +1,623 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include #include #include #include #include #include #include // getRandomNumberGenerator() #include // ConsoleProgressBar -#include // EMRead::generate_bckg_motif() -#include // EMSequence::generate_bckg_motif() + EMJoint::EMJoint(const std::vector>& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; - - // compute background and - // overwrite last class as background class - if(bckg_class) - { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; - Matrix2D bckg_motif = EMRead::generate_bckg_motif(matrix, - motif_len) ; - this->read_layers.back()->set_class(this->n_class-1, - bckg_motif) ; - } } } EMJoint::EMJoint(std::vector>&& read_matrices, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()), loglikelihood_layer(n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions if(this->n_layer == 0) { throw std::invalid_argument("Error! No data layer given!") ; } for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable row numbers " "(%zu and %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! Read layers have variable column numbers " "(%zu and %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { - // compute background before giving data to - // ReadLayer - Matrix2D bckg_motif ; - if(bckg_class) - { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; - bckg_motif = EMRead::generate_bckg_motif(matrix, - motif_len) ; - } - // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; - - // overwrite last class as background class - if(bckg_class) - { this->read_layers.back()->set_class(this->n_class-1, - bckg_motif) ; - } } } EMJoint::EMJoint(const std::vector>& read_matrices, const Matrix2D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { // create the layer this->read_layers.push_back(new ReadLayer(matrix, this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; - - // compute background and - // overwrite last class as background class - if(bckg_class) - { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; - Matrix2D bckg_motif = EMRead::generate_bckg_motif(matrix, - motif_len) ; - this->read_layers.back()->set_class(this->n_class-1, - bckg_motif) ; - } } // create sequence layer and initialise the models from the post prob this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; this->seq_layer->update_model(this->post_prob, this->threads) ; - // compute background and - // overwrite last class as background class - if(bckg_class) - { size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ; - Matrix2D bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, - motif_len, - this->flip) ; - this->seq_layer->set_class(this->n_class-1, - bckg_motif) ; - } } EMJoint::EMJoint(std::vector>&& read_matrices, Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrices[0].get_nrow(), read_matrices[0].get_ncol(), n_class, n_iter, n_shift, flip, n_threads), n_layer(read_matrices.size()+1), loglikelihood_layer(this->n_layer, Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.)), loglikelihood_max(this->n_layer, vector_d(this->n_row, 0.)), read_layers(), seq_layer(nullptr) { // check data matrices and their dimensions for(const auto& matrix : read_matrices) { if(matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A read matrix row number is different than expected " "(%zu instead of %zu)!", matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A read matrix column number is different than expected " "(%zu instead of %zu)!", matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } } if(seq_matrix.get_nrow() != this->n_row) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix row number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_nrow(), this->n_row) ; throw std::invalid_argument(msg) ; } else if(seq_matrix.get_ncol() != this->n_col) { char msg[4096] ; sprintf(msg, "Error! A sequence matrix column number is different than expected " "(%zu instead of %zu)!", seq_matrix.get_ncol(), this->n_col) ; throw std::invalid_argument(msg) ; } // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models // create read layer and initialise the models from the post prob for(auto& matrix : read_matrices) { - // compute background before giving data to - // ReadLayer - Matrix2D bckg_motif ; - if(bckg_class) - { size_t motif_len = matrix.get_ncol() - this->n_shift + 1 ; - bckg_motif = EMRead::generate_bckg_motif(matrix, - motif_len) ; - } - // create the layer this->read_layers.push_back(new ReadLayer(std::move(matrix), this->n_class, this->n_shift, this->flip, bckg_class, this->threads)) ; this->read_layers.back()->update_model(this->post_prob, this->threads) ; - // overwrite last class as background class - if(bckg_class) - { this->read_layers.back()->set_class(this->n_class-1, - bckg_motif) ; - } } // create sequence layer and initialise the models from the post prob - // compute background before giving data to - // SequenceLayer - Matrix2D bckg_motif ; - if(bckg_class) - { size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ; - bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, - motif_len, - this->flip) ; - } this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; - // overwrite last class as background class - if(bckg_class) - { this->seq_layer->set_class(this->n_class-1, - bckg_motif) ; - } } EMJoint::~EMJoint() { // join the threads in case // deleted by EMBase destructor - this->threads->join() ; + if(this->threads != nullptr) + { this->threads->join() ; + delete this->threads ; + this->threads = nullptr ; + } // read data and models for(auto& ptr : this->read_layers) { if(ptr != nullptr) { delete ptr ; ptr = nullptr ; } } // sequence data and models if(seq_layer != nullptr) { delete seq_layer ; seq_layer = nullptr ; } } std::vector> EMJoint::get_read_models() const { std::vector> models ; for(const auto& ptr : this->read_layers) { models.push_back(ptr->get_model()) ; } return models ; } Matrix3D EMJoint::get_sequence_models() const { return this->seq_layer->get_model() ; } EMJoint::exit_codes EMJoint::classify() -{ - size_t bar_update_n = this->n_iter ; +{ size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; + return EMJoint::exit_codes::ITER_MAX ; } void EMJoint::compute_loglikelihood() { // compute the loglikelihood for each layer size_t i = 0 ; for(auto& ptr : this->read_layers) { ptr->compute_loglikelihoods(this->loglikelihood_layer[i], this->loglikelihood_max[i], this->threads) ; i++ ; } - this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], - this->loglikelihood_max[i], - this->threads) ; - i++ ; - /* - // sum the likelihood for each state, over all layers - // this is the "joint likelihood" - for(size_t i=0; in_row; i++) - { for(size_t j=0; jn_class; j++) - { for(size_t k=0; kn_shift; k++) - { for(size_t l=0; ln_flip; l++) - { - // reset - this->loglikelihood(i,j,k,l) = 0. ; - // sum - for(size_t m=0; mn_layer; m++) - { this->loglikelihood(i,j,k,l) += - (this->loglikelihood_layer[m](i,j,k,l) - - this->loglikelihood_max[m][i]) ; - } - } - } - } + if(this->seq_layer != nullptr) + { this->seq_layer->compute_loglikelihoods(this->loglikelihood_layer[i], + this->loglikelihood_max[i], + this->threads) ; + i++ ; } - */ + // sum the likelihood for each state, over all layers // and rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else - { size_t n_threads = this->threads->getNThread() ; + { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; - for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_loglikelihood_routine, this, slice.first, slice.second, - std::ref(promises[i])))) ; + std::ref(promises[j])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } +/* void EMJoint::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // limite value range for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { // reset this->loglikelihood(i,j,k,l) = 0. ; // sum for(size_t m=0; mn_layer; m++) { this->loglikelihood(i,j,k,l) += (this->loglikelihood_layer[m](i,j,k,l) - this->loglikelihood_max[m][i]) ; } } } } } done.set_value(true) ; } +*/ + +void EMJoint::compute_loglikelihood_routine(size_t from, + size_t to, + std::promise& done) +{ // the max likelihood found per row + std::vector rowmax(to-from, std::numeric_limits::lowest()) ; + + // sum over layers + for(size_t i=from, i_rowmax=0; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { + // reset + this->loglikelihood(i,j,k,l) = 0. ; + // sum + for(size_t m=0; mn_layer; m++) + { // add rescaled layer value + this->loglikelihood(i,j,k,l) += + (this->loglikelihood_layer[m](i,j,k,l) - + this->loglikelihood_max[m][i]) ; + } + // keep track of max + if(this->loglikelihood(i,j,k,l) > rowmax[i_rowmax]) + { rowmax[i_rowmax] = this->loglikelihood(i,j,k,l) ; } + } + } + } + } + + // rescale + for(size_t i=from, i_rowmax=0; in_class; j++) + { for(size_t k=0; kn_shift; k++) + { for(size_t l=0; ln_flip; l++) + { this->loglikelihood(i,j,k,l) -= rowmax[i_rowmax] ; } + } + } + } + done.set_value(true) ; +} + void EMJoint::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMJoint::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMJoint::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * - this->post_state_prob(n_class,n_shift,n_flip) ; + this->post_state_prob(n_class,n_shift,n_flip) ; + // double p = std::max(exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * + // this->post_state_prob(n_class,n_shift,n_flip), + // ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; + // double p = this->post_prob(i,n_class,n_shift,n_flip) / + // this->post_prob_rowsum[i] ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMJoint::update_models() { // read data and models for(auto& ptr : this->read_layers) { ptr->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } // sequence data and models - this->seq_layer->update_model(this->post_prob, - this->threads) ; + if(this->seq_layer != nullptr) + { this->seq_layer->update_model(this->post_prob, + this->threads) ; + } } diff --git a/src/Clustering/EMRead.cpp b/src/Clustering/EMRead.cpp index a5be742..edad5a3 100644 --- a/src/Clustering/EMRead.cpp +++ b/src/Clustering/EMRead.cpp @@ -1,350 +1,308 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include // ReadLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool -Matrix2D EMRead::generate_bckg_motif(const Matrix2D& read_matrix, - size_t motif_length) -{ - // sequence composition - double mean = 0. ; - double n = (double)read_matrix.get_nrow() * - (double)read_matrix.get_ncol() ; - for(size_t i=0; i bckg_motif(1,motif_length) ; - for(size_t j=0; j& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly this->set_post_prob_random(seed) ; // data and models this->read_layer = new ReadLayer(read_matrix, this->n_class, this->n_shift, flip, bckg_class, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; - // compute background and - // overwrite last class as background class - if(bckg_class) - { size_t motif_len = read_matrix.get_ncol() - this->n_shift + 1 ; - Matrix2D bckg_motif = EMRead::generate_bckg_motif(read_matrix, - motif_len) ; - this->read_layer->set_class(this->n_class-1, - bckg_motif) ; - } } EMRead::EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(read_matrix.get_nrow(), read_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), read_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly this->set_post_prob_random(seed) ; - // compute background before giving data to - // ReadLayer - Matrix2D bckg_motif ; - if(bckg_class) - { size_t motif_len = read_matrix.get_ncol() - this->n_shift + 1 ; - bckg_motif = EMRead::generate_bckg_motif(read_matrix, - motif_len) ; - } - // data and models this->read_layer = new ReadLayer(std::move(read_matrix), this->n_class, this->n_shift, flip, bckg_class, this->threads) ; // intialise the models with the post prob this->read_layer->update_model(this->post_prob, this->threads) ; - - // overwrite last class as background class - if(bckg_class) - { this->read_layer->set_class(this->n_class-1, - bckg_motif) ; - } } EMRead::~EMRead() { if(this->read_layer == nullptr) - { delete this->read_layer ; - this->read_layer = nullptr ; + { this->threads->join() ; + delete this->threads ; + this->threads = nullptr ; } } Matrix3D EMRead::get_read_models() const { return read_layer->get_model() ; } EMRead::exit_codes EMRead::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; // std::cerr << this->post_prob_rowsum << std::endl ; // std::cerr << this->post_prob_colsum << std::endl ; this->compute_post_prob() ; // M-step // std::cerr << this->post_prob_rowsum << std::endl ; // std::cerr << this->post_prob_colsum << std::endl ; this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMRead::exit_codes::ITER_MAX ; } void EMRead::compute_loglikelihood() { // compute the loglikelihood this->read_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; /* // rescale the values for(size_t i=0; in_row; i++) { for(size_t j=0; jn_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } */ // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMRead::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMRead::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMRead::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMRead::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { // avoid p=0. by rounding errors double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ReadLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMRead::update_models() { this->read_layer->update_model(this->post_prob, this->post_prob_colsum, this->threads) ; } diff --git a/src/Clustering/EMRead.hpp b/src/Clustering/EMRead.hpp index fc2d51e..ec3a896 100644 --- a/src/Clustering/EMRead.hpp +++ b/src/Clustering/EMRead.hpp @@ -1,183 +1,172 @@ #ifndef EMREAD_HPP #define EMREAD_HPP #include #include #include #include // std::promise #include #include typedef std::vector vector_d ; class EMRead : public EMBase -{ public: - /*! - * \brief Generates a model with mean number of counts - * in the data,at each position. - * \param data the count data. - * \return the motif. Its dimensions are: - * 1st 1, a one row matrix - * 2nd the motif length. - */ - static Matrix2D generate_bckg_motif(const Matrix2D& read_matrix, - size_t motif_length) ; - +{ public: /*! * \brief Constructs an object to partition the * region (rows) according to the shape of the signal * with the given shifting and flipping freedom. * \param read_matrix a matrix containing the read * densitiy (ChIP-seq or related signal) for the * regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the * background by setting all its parameters, at all * positions, to the mean number of counts. Since * the background is constant, this class will never * be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMRead(const Matrix2D& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * region (rows) according to the shape of the signal * with the given shifting and flipping freedom. * \param read_matrix a matrix containing the read * densitiy (ChIP-seq or related signal) for the * regions of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the * background by setting all its parameters, at all * positions, to the mean number of counts. Since * the background is constant, this class will never * be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMRead(Matrix2D&& read_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; EMRead(const EMRead& other) = delete ; /*! * \brief Destructor. */ virtual ~EMRead() override ; /*! * \brief Returns the class read signal model. * \return the class read signal model. */ Matrix3D get_read_models() const ; /*! * \brief Runs the read signal model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMRead::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. */ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. */ ReadLayer* read_layer ; } ; #endif // EMREAD_HPP diff --git a/src/Clustering/EMSequence.cpp b/src/Clustering/EMSequence.cpp index fe69d06..2a182e3 100644 --- a/src/Clustering/EMSequence.cpp +++ b/src/Clustering/EMSequence.cpp @@ -1,392 +1,351 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // exp() #include // SequenceLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool #include // dna::base_composition() -Matrix2D EMSequence::generate_bckg_motif(const Matrix2D& sequence_matrix, - size_t motif_length, - bool reverse_compl) -{ // sequence composition - std::vector base_comp = - dna::base_composition(sequence_matrix, - reverse_compl) ; - // create a motif - Matrix2D bckg_motif(4,motif_length) ; - for(size_t i=0; i& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->seq_layer = new SequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; - - // compute background and - // overwrite last class as background class - if(bckg_class) - { size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ; - Matrix2D bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, - motif_len, - this->flip) ; - this->seq_layer->set_class(this->n_class-1, - bckg_motif) ; - } } EMSequence::EMSequence(Matrix2D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; - // compute background before giving data to - // SequenceLayer - Matrix2D bckg_motif ; - if(bckg_class) - { size_t motif_len = seq_matrix.get_ncol() - this->n_shift + 1 ; - bckg_motif = EMSequence::generate_bckg_motif(seq_matrix, - motif_len, - this->flip) ; - } // data and models this->seq_layer = new SequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->seq_layer->update_model(this->post_prob, this->threads) ; - // overwrite last class as background class - if(bckg_class) - { this->seq_layer->set_class(this->n_class-1, - bckg_motif) ; - } } EMSequence::EMSequence(const Matrix2D& seq_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // data and models // background motif (if any) is the last of the given motifs this->seq_layer = new SequenceLayer(seq_matrix, motifs, this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::EMSequence(Matrix2D&& seq_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_nrow(), seq_matrix.get_ncol(), motifs.get_dim()[0], n_iter, seq_matrix.get_ncol() - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), seq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // data and models // background motif (if any) is the last of the given motifs this->seq_layer = new SequenceLayer(std::move(seq_matrix), std::move(motifs), this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMSequence::~EMSequence() { if(this->seq_layer == nullptr) - { delete this->seq_layer ; - this->seq_layer = nullptr ; + { this->threads->join() ; + delete this->threads ; + this->threads = nullptr ; } } Matrix3D EMSequence::get_sequence_models() const { return seq_layer->get_model() ; } EMSequence::exit_codes EMSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMSequence::exit_codes::ITER_MAX ; } void EMSequence::compute_loglikelihood() { // compute the loglikelihood this->seq_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_loglikelihood_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = (this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i]) ; } } } } done.set_value(true) ; } void EMSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMSequence::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], SequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMSequence::update_models() { this->seq_layer->update_model(this->post_prob, this->threads) ; } diff --git a/src/Clustering/EMSequence.hpp b/src/Clustering/EMSequence.hpp index 8c177ac..1cec297 100644 --- a/src/Clustering/EMSequence.hpp +++ b/src/Clustering/EMSequence.hpp @@ -1,256 +1,239 @@ #ifndef EMSEQUENCE_HPP #define EMSEQUENCE_HPP #include #include #include #include // std::promise #include #include typedef std::vector vector_d ; class EMSequence : public EMBase { - public: - /*! - * \brief Generates a motif with the background - * base probabilites at each position. The background - * probabilities are computed from the data. - * \param sequence_matrix the sequence matrix in integer - * format. - * \param reverse_compl whether the reverse complement of - * the sequences should also be considered. - * \return the motif. Its dimensions are: - * 1st 4 for A,C,G,T - * 2nd the motif length. - */ - static Matrix2D generate_bckg_motif(const Matrix2D& sequence_matrix, - size_t motif_length, - bool reverse_compl) ; - public: /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences models are initialised randomly. * \param sequence_matrix a matrix containing the sequences * of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the background * by setting all its parameters, at all positions, to the * background base probabilties. Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(const Matrix2D& sequence_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences models are initialised randomly. * \param sequence_matrix a matrix containing the sequences * of interest. * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param n_shift the number of shift states allowed. * \param flip whether flipping is allowed. * \param bckg_class the last class is used to model the background * by setting all its parameters, at all positions, to the * background base probabilties. Since the background is constant, * this class will never be updated. * \param seed a seed to initialise the random number * generator. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(Matrix2D&& sequence_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed="", size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences class models are initialised using * the given motifs. The class probabilities are * initialised uniformlly. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param sequence_matrix a matrix containing the sequences * of interest. * \param motifs a matrix containing the different initial * class models with the following dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 for A,C,G,T * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param flip whether flipping is allowed. * \param bckg_class indicates that the last class in the * given motifs is used to model the background and it * should never be updated. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(const Matrix2D& sequence_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads=0) ; /*! * \brief Constructs an object to partition the * given sequences (rows) according to their motif * content. * The sequences class models are initialised using * the given motifs. The class probabilities are * initialised uniformlly. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param sequence_matrix a matrix containing the sequences * of interest. * \param motifs a matrix containing the different initial * class models with the following dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 for A,C,G,T * \param n_class the number of region classes * to search. * \param n_iter the number of optimization iterations. * \param flip whether flipping is allowed. * \param bckg_class indicates that the last class in the * given motifs is used to model the background and it * should never be updated. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ EMSequence(Matrix2D&& sequence_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads=0) ; EMSequence(const EMSequence& other) = delete ; /*! * \brief Destructor. */ virtual ~EMSequence() override ; /*! * \brief Returns the class sequence model. * \return the class sequence model. */ Matrix3D get_sequence_models() const ; /*! * \brief Runs the sequence model optimization and * the data classification. * \return a code indicating how the optimization * ended. */ virtual EMSequence::exit_codes classify() override ; private: /*! * \brief Computes the data log likelihood given the * current models, for all layers and the joint * likelihood for each state (the sum of the layer * likelihoods for all layers, for a given state). */ virtual void compute_loglikelihood() override ; /*! * \brief This is a routine of compute_loglikelihood(). * This method rescales the loglikelihood values by * substacting to each value the maximum loglikelihood * value found in the same data row. * This method * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done a promise to fill when the method * is done. */ void compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Computes the data posterior probabilties. */ virtual void compute_post_prob() override ; /*! * \brief The routine that effectively computes * the posterior probabilties. * \param from the index of the first row * in the data to consider. * \param to the index of the past last row * in the data to consider. * \param done the partial column (over the classes) * sum of posterior probabilities. If several routines * are running together, the colsums are retrieved by * summing up the vectors together. */ void compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) ; /*! * \brief Update the data models for all layers, given * the current posterior and class probabilities. */ virtual void update_models() override ; /*! * \brief the max loglikelihood value for * each data row. */ std::vector loglikelihood_max ; /*! * \brief A pointer to the object managing * the data and their model. */ SequenceLayer* seq_layer ; } ; #endif // EMSEQUENCE_HPP diff --git a/src/Clustering/ReadLayer.cpp b/src/Clustering/ReadLayer.cpp index 9555cf2..79e61da 100644 --- a/src/Clustering/ReadLayer.cpp +++ b/src/Clustering/ReadLayer.cpp @@ -1,468 +1,492 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), exp(), pow() #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // beta_pmf(), poisson_pmf() #include // rand_real_uniform(), rand_int_uniform() #include #include #include #include #include typedef std::vector vector_d ; ReadLayer::ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads) - : DataLayer(data, n_class, n_shift, flip, last_class_cst), + : DataLayer(data, n_class, n_shift, flip, bckg_class), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; + // background class + if(this->bckg_class) + { this->create_bckg_class() ; } + // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads) - : DataLayer(std::move(data), n_class, n_shift, flip, last_class_cst), + : DataLayer(std::move(data), n_class, n_shift, flip, bckg_class), window_means(n_row, n_shift, 0.) { this->n_category = 1 ; // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0) ; + + // background class + if(this->bckg_class) + { this->create_bckg_class() ; } + // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads) - : DataLayer(data, model, flip, last_class_cst), + : DataLayer(data, model, flip, bckg_class), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads) - : DataLayer(std::move(data), std::move(model), flip, last_class_cst), + : DataLayer(std::move(data), std::move(model), flip, bckg_class), window_means(n_row, n_shift, 0.) { // check that the model only has one category if(this->n_category > 1) { char msg[4096] ; sprintf(msg, "Error! model is expected to have length 1 on " "3rd dimension, not %zu", this->n_category) ; throw std::invalid_argument(msg) ; } // compute window means this->compute_window_means(threads) ; } ReadLayer::~ReadLayer() {} void ReadLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, std::ref(loglikelihood), std::ref(loglikelihood_max), promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const { // normalize the models Matrix3D model_norm = this->model ; for(size_t i=0; in_class; i++) { double mean = 0. ; for(size_t j=0; jl_model; j++) { mean += model_norm(i,j,0) ; } mean /= this->l_model ; for(size_t j=0; jl_model; j++) { model_norm(i,j,0) /= mean ; } } // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s_fw=0, s_rev=this->n_shift-1; s_fwn_shift; s_fw++, s_rev--) { // slice is [from_fw,to) // from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw] // fw |---------->>>----------| // ----------------------------------> data // rev |----------<<<----------| [from_dat_rev, to_dat_rev] // to_dat_rev can be -1 -> int // to_dat_rev from_dat_rev // log likelihood double ll_fw = 0. ; double ll_rev = 0. ; // --------------- forward --------------- size_t from_dat_fw = s_fw ; size_t to_dat_fw = from_dat_fw + this->l_model - 1 ; // --------------- reverse --------------- size_t from_dat_rev = this->n_col - 1 - s_fw ; // size_t to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev; j_dat_fwdata(i,j_dat_fw), model_norm(j,j_ref_fw,0)* this->window_means(i,s_fw))) ; // ll_fw += ll ; // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf ll_fw += std::max(ll, ReadLayer::p_min_log) ; // --------------- reverse --------------- if(this->flip) { ll = log(poisson_pmf(this->data(i,j_dat_rev), model_norm(j,j_ref_fw,0)* this->window_means(i,s_rev))) ; // ll_rev += ll ; // p(A|B) may be really unlikely -> rounded to 0 -> log(0) = -inf ll_rev += std::max(ll, ReadLayer::p_min_log) ; } } loglikelihood(i,j,from_dat_fw,flip_states::FORWARD) = ll_fw ; // keep track of the max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } if(this->flip) { loglikelihood(i,j,from_dat_fw,flip_states::REVERSE) = ll_rev ; // keep track of the max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // computing sum over the columns (classes) size_t n_row = posterior_prob.get_dim()[0] ; size_t n_class = posterior_prob.get_dim()[1] ; size_t n_shift = posterior_prob.get_dim()[2] ; size_t n_flip = posterior_prob.get_dim()[3] ; vector_d colsum(n_class, 0.) ; for(size_t i=0; iupdate_model(posterior_prob, colsum, threads) ; } void ReadLayer::update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, posterior_prob_colsum, promise) ; // this->model = future.get() ; auto model = future.get() ; - size_t n_class_to_update = this->n_class - this->last_class_cst ; + size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(posterior_prob_colsum), std::ref(promises[i])))) ; } // reinitialise the model /* this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; */ - size_t n_class_to_update = this->n_class - this->last_class_cst ; + size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. ; } } } // wait until all threads are done working // and update the mode for(auto& future : futures) { Matrix3D model_part = future.get() ; // for(size_t i=0; in_class; i++) for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // avoid 0's in the model to ensure that pmf_poisson() never // return 0 for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), ReadLayer::p_min) ; } } } } void ReadLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; this->check_posterior_prob_colsum_dim(posterior_prob_colsum) ; // partial model Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) ; - size_t n_class_to_update = this->n_class - this->last_class_cst ; + size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t n_class=0; n_class < n_class_to_update; n_class++) { for(size_t i=from; in_shift; n_shift++) { // --------------- forward --------------- int from_dat_fw = n_shift ; int to_dat_fw = from_dat_fw + this->l_model - 1 ; for(int j_dat_fw=from_dat_fw, j_ref_fw=0; j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::FORWARD) * this->data(i,j_dat_fw)) / posterior_prob_colsum[n_class] ; } // --------------- reverse --------------- if(this->flip) { int from_dat_rev = this->n_col - 1 - n_shift ; int to_dat_rev = from_dat_rev - (this->l_model - 1) ; for(int j_dat_rev=from_dat_rev, j_ref_fw=0; j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++) { model(n_class,j_ref_fw,0) += (posterior_prob(i,n_class,n_shift,flip_states::REVERSE) * this->data(i,j_dat_rev)) / posterior_prob_colsum[n_class] ; } } } } } promise.set_value(model) ; } void ReadLayer::compute_window_means(ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_window_means_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&ReadLayer::compute_window_means_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void ReadLayer::compute_window_means_routine(size_t from, size_t to, std::promise& done) { double l_window = double(this->l_model) ; for(size_t i=from; in_shift; from++) { double sum = 0. ; // slice is [from,to) size_t to = from + this->l_model ; for(size_t j=from; jdata(i,j) ;} this->window_means(i,from) = sum / l_window ; } } done.set_value(true) ; } void ReadLayer::check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const { if(posterior_prob_colsum.size() != this->n_class) { char msg[4096] ; sprintf(msg, "Error! posterior_class_prob matrix size is not " "equal to model class number : %zu / %zu", posterior_prob_colsum.size(), this->n_class) ; throw std::invalid_argument(msg) ; } } +void ReadLayer::create_bckg_class() +{ + // mean count + double mean = 0. ; + double n = (double)this->data.get_nrow() * + (double)this->data.get_ncol() ; + for(size_t i=0; idata.get_nrow(); i++) + { for(size_t j=0; jdata.get_ncol(); j++) + { mean += ((double)this->data(i,j)) / n ; } + } + // set last class as background class + for(size_t j=0; jl_model; j++) + { this->model(this->n_class-1,j,0) = mean ; } +} + diff --git a/src/Clustering/ReadLayer.hpp b/src/Clustering/ReadLayer.hpp index d6deef6..aa1fedc 100644 --- a/src/Clustering/ReadLayer.hpp +++ b/src/Clustering/ReadLayer.hpp @@ -1,264 +1,266 @@ #ifndef READLAYER_HPP #define READLAYER_HPP #include #include #include #include #include typedef std::vector vector_d ; class ReadLayer : public DataLayer { public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads = nullptr) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(const Matrix2D& data, const Matrix3D& model, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads = nullptr) ; /*! * \brief Construct an object with the * given data and model. * \param data the data. * \param the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ ReadLayer(Matrix2D&& data, Matrix3D&& model, bool flip, - bool last_class_cst, + bool bckg_class, ThreadPool* threads = nullptr) ; /*! * Destructor */ virtual ~ReadLayer() override ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * During this process, a normalized version of the * models, having a sum of signal of 1 count in average, * is used (a copy of the models is normalized, meaning * that the original models can still be retrieved the * dedicated getter). * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * This method does the same as the virtual method it * overloads. The only difference is that, for run time * gain, it is given the sum over the columns of the * posterior_prob matrix which is computed by the virtual * method. * \param posterior_prob the data assignment probabilities to * the different classes. * \param posterior_prob_colsum the sum over the columns * (classes) of the posterior_prob matrix. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void update_model(const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, ThreadPool* threads=nullptr) ; protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param promise a promise containing the partial model * computed from the given data slice. If several routines * work together to update the model, the promise matrices * need to be summed up to get the final model. */ void update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, const vector_d& posterior_prob_colsum, std::promise>& promise) const ; /*! * \brief Computes the mean number of reads present in * each slice (of length l_model), in each row * of the data and store them in this->window_means. * \param threads a pointer to a thread pool to * parallelize the computations. If nullptr is given, * the computations are performed by the main thread. */ void compute_window_means(ThreadPool* threads) ; /*! * \brief The routine that effectively computes the * window means. * \param from the index of the first row of the * data to considered. * \param to the index of the past last row of the * data to considered. * \param done a promise to fill when the routine * is done running. */ void compute_window_means_routine(size_t from, size_t to, std::promise& done) ; /*! * \brief Checks that the argument has compatible * dimensions with the data and models. If this is * not the case, throw a std::invalid_argument with * a relevant message. * \param posterior_class_prob a vector containing the * class probabilities. * It should have a length equal to the number of * classes. * \throw std::invalid_argument if the dimensions are * incorrect. */ void check_posterior_prob_colsum_dim(const vector_d& posterior_prob_colsum) const ; + /*! + * \brief Sets the last class as background class with the mean + * number of counts in the data at each position. + */ + void create_bckg_class() ; + /*! * \brief contains the data means, for * each window of size l_model. */ Matrix2D window_means ; } ; #endif // READLAYER_HPP diff --git a/src/Clustering/ReadModelComputer.cpp b/src/Clustering/ReadModelComputer.cpp index 212af00..72dff2e 100644 --- a/src/Clustering/ReadModelComputer.cpp +++ b/src/Clustering/ReadModelComputer.cpp @@ -1,49 +1,50 @@ #include // std::move() #include #include #include #include #include #include ReadModelComputer::ReadModelComputer(Matrix2D&& data, const Matrix4D& post_prob, + bool bckg_class, size_t n_threads) : ModelComputer(), threads(nullptr) { // parameters size_t n_class = post_prob.get_dim()[1] ; size_t n_shift = post_prob.get_dim()[2] ; size_t n_flip = post_prob.get_dim()[3] ; bool flip = n_flip == 2 ; // the threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } // the data and the model this->data_layer = new ReadLayer(std::move(data), n_class, n_shift, flip, - false) ; + bckg_class) ; this->data_layer->update_model(post_prob, this->threads) ; } ReadModelComputer::~ReadModelComputer() { // threads if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } // data and model if(this->data_layer != nullptr) { delete this->data_layer ; this->data_layer = nullptr ; } } diff --git a/src/Clustering/ReadModelComputer.hpp b/src/Clustering/ReadModelComputer.hpp index f2d5aba..5a3b46e 100644 --- a/src/Clustering/ReadModelComputer.hpp +++ b/src/Clustering/ReadModelComputer.hpp @@ -1,42 +1,46 @@ #ifndef READMODELCOMPUTER_HPP #define READMODELCOMPUTER_HPP #include #include #include #include class ReadModelComputer : public ModelComputer { public: /*! * \brief Constructs an object to retrieve * the read model given the data and their * classification results. * \param data the data. * \param post_prob the data class assignment * probabilities. + * \param bckg_class whether the last class of the + * classification (posterior probabilities) is a + * background class. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ ReadModelComputer(Matrix2D&& data, const Matrix4D& post_prob, + bool bckg_class, size_t n_threads) ; /*! * \brief Destructor. */ virtual ~ReadModelComputer() override ; protected: /*! * \brief the threads. */ ThreadPool* threads ; } ; #endif // READMODELCOMPUTER_HPP diff --git a/src/Clustering/SequenceLayer.cpp b/src/Clustering/SequenceLayer.cpp index ae6aeaf..68834d0 100644 --- a/src/Clustering/SequenceLayer.cpp +++ b/src/Clustering/SequenceLayer.cpp @@ -1,411 +1,440 @@ #include #include // std::invalid_argument #include // numeric_limits #include // log(), pow() #include #include // std::max_element() #include // beta_pmf() #include // rand_real_uniform(), rand_int_uniform() #include #include #include #include double SequenceLayer::score_subseq(const Matrix2D& seq, size_t row, size_t start, const Matrix2D& model_log) { if(start > seq.get_ncol() - model_log.get_nrow()) { char msg[4096] ; sprintf(msg, "Error! given start (%zu) is too high. Max value is %zu", start, seq.get_ncol() - model_log.get_nrow()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_nrow() > seq.get_ncol()) { char msg[4096] ; sprintf(msg, "Error! given model is longer than sequences (%zu / %zu)", model_log.get_nrow(), seq.get_ncol()) ; throw std::invalid_argument(msg) ; } else if(model_log.get_ncol() != 4) { char msg[4096] ; sprintf(msg, "Error! given model 2nd dimension is not 4 (%zu)", model_log.get_ncol()) ; throw std::invalid_argument(msg) ; } size_t from = start ; size_t to = from + model_log.get_nrow() ; // will score [from, to) int n_code = dna::char_to_int('N') ; double ll = 0 ; for(size_t i=from, j=0; i get max score if(base == n_code) { std::vector row = model_log.get_row(j) ; ll += *(std::max_element(std::begin(row), std::end(row))) ; } // A,C,G,T -> get its score else { ll += model_log(j,base) ; } } return ll ; } SequenceLayer::SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) - : DataLayer(data, n_class, n_shift, flip,last_class_cst) -{ this->n_category = 4 ; + bool bckg_class) + : DataLayer(data, n_class, n_shift, flip,bckg_class) +{ + this->n_category = 4 ; + // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; + // background class + if(this->bckg_class) + { this->create_bckg_class() ; } } SequenceLayer::SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) - : DataLayer(std::move(data), n_class, n_shift, flip,last_class_cst) -{ this->n_category = 4 ; + bool bckg_class) + : DataLayer(std::move(data), n_class, n_shift, flip, bckg_class) +{ + this->n_category = 4 ; + // initialise the empty model this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; + + // background class + if(this->bckg_class) + { this->create_bckg_class() ; } } SequenceLayer::SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, - bool last_class_cst) - : DataLayer(data, model,flip,last_class_cst) + bool bckg_class) + : DataLayer(data, model,flip, bckg_class) {} SequenceLayer::SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, - bool last_class_cst) - : DataLayer(std::move(data), std::move(model),flip,last_class_cst) + bool bckg_class) + : DataLayer(std::move(data), std::move(model),flip, bckg_class) {} SequenceLayer::~SequenceLayer() {} void SequenceLayer::compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads) const { // dimension checks this->check_loglikelihood_dim(loglikelihood) ; this->check_loglikelihood_max_dim(loglikelihood_max) ; // compute the log prob model and the log prob reverse-complement model std::vector> model_log(this->n_class, Matrix2D(this->l_model, this->n_category, 0.)) ; std::vector> model_log_rev = model_log ; /* Matrix3D model_log(this->n_class, this->l_model, this->n_category, 0.) ; Matrix3D model_log_rev = model_log ; */ for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { // forward model_log[i](j,k) = log(this->model(i,j,k)) ; // reverse model_log_rev[i](this->l_model-j-1,this->n_category-k-1) = log(this->model(i,j,k)) ; } } } // don't parallelize if(threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihoods_routine(0, this->n_row, loglikelihood, loglikelihood_max, model_log, model_log_rev, promise) ; future.get() ; } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::compute_loglikelihoods_routine, this, slice.first, slice.second, std::ref(loglikelihood), std::ref(loglikelihood_max), std::ref(model_log), std::ref(model_log_rev), std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void SequenceLayer::compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const { // compute log likelihood for(size_t i=from; i::lowest() ; for(size_t j=0; jn_class; j++) { for(size_t s=0; sn_shift; s++) { // forward strand { double ll_fw = score_subseq(this->data, i, s, model_log[j]) ; - loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; + // loglikelihood(i,j,s,flip_states::FORWARD) = ll_fw ; + // apply lower bound here -> log(1e-100) + loglikelihood(i,j,s,flip_states::FORWARD) = std::max(ll_fw, + SequenceLayer::p_min_log) ; // keep track of max per row if(ll_fw > loglikelihood_max[i]) { loglikelihood_max[i] = ll_fw ; } } // reverse if(this->flip) { double ll_rev = score_subseq(this->data, i, s, model_log_rev[j]) ; - loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; + // loglikelihood(i,j,s,flip_states::REVERSE) = ll_rev ; + // apply lower bound here -> log(1e-100) + loglikelihood(i,j,s,flip_states::REVERSE) = std::max(ll_rev, + SequenceLayer::p_min_log) ; // keep track of max per row if(ll_rev > loglikelihood_max[i]) { loglikelihood_max[i] = ll_rev ; } } } } } done.set_value(true) ; } void SequenceLayer::update_model(const Matrix4D& posterior_prob, ThreadPool* threads) { // don't parallelize if(threads == nullptr) { std::promise> promise ; std::future> future = promise.get_future() ; this->update_model_routine(0, this->n_row, posterior_prob, promise) ; // this->model = future.get() ; auto model = future.get() ; - size_t n_class_to_update = this->n_class - this->last_class_cst ; + size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = model(i,j,k) ; } } } } // parallelize else { size_t n_threads = threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row, n_threads) ; // get promises and futures // the function run by the threads will simply fill the promise with // "true" to indicate that they are done std::vector>> promises(n_threads) ; std::vector>> futures(n_threads) ; for(size_t i=0; iaddJob(std::move( std::bind(&SequenceLayer::update_model_routine, this, slice.first, slice.second, std::ref(posterior_prob), std::ref(promises[i])))) ; } // reinitialise the model /* this->model = Matrix3D(this->n_class, this->l_model, this->n_category, 0.) ; */ - size_t n_class_to_update = this->n_class - this->last_class_cst ; + size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = 0. ; } } } // wait until all threads are done working // and update the model for(auto& future : futures) { Matrix3D model_part = future.get() ; // for(size_t i=0; in_class; i++) for(size_t i=0; il_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) += model_part(i,j,k) ; } } } } // -------------------------- threads stop --------------------------- } // make sure to have no 0 values for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { for(size_t k=0; kn_category; k++) { this->model(i,j,k) = std::max(this->model(i,j,k), SequenceLayer::p_min) ; } } } // normalize to get probs for(size_t i=0; in_class; i++) { for(size_t j=0; jl_model; j++) { double sum = 0. ; for(size_t k=0; kn_category; k++) { sum += this->model(i,j,k) ; } for(size_t k=0; kn_category; k++) { double p = this->model(i,j,k) / sum ; this->model(i,j,k) = p ; /* this->model(i,j,k) = std::max(p, SequenceLayer::p_min) ; */ } } } } void SequenceLayer::update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& promise) const { // dimension checks this->check_posterior_prob_dim(posterior_prob) ; Matrix3D model(this->n_class, this->l_model, this->n_category, 0.) ; // the int code of A, C, G, T, N static int a_code = dna::char_to_int('A') ; static int c_code = dna::char_to_int('C') ; static int g_code = dna::char_to_int('G') ; static int t_code = dna::char_to_int('T') ; static int n_code = dna::char_to_int('N') ; // the int code of the reverse complement of A, C, G, T static int a_code_r = dna::char_to_int('A', true) ; static int c_code_r = dna::char_to_int('C', true) ; static int g_code_r = dna::char_to_int('G', true) ; static int t_code_r = dna::char_to_int('T', true) ; - size_t n_class_to_update = this->n_class - this->last_class_cst ; + size_t n_class_to_update = this->n_class - this->bckg_class ; for(size_t k=0; k < n_class_to_update; k++) // for(size_t k=0; k < this->n_class; k++) { for(size_t s=0; sn_shift; s++) { for(size_t j=0; jl_model; j++) { // base prob on fw and rv strand vector_d base_prob_fw(this->n_category, 0.) ; vector_d base_prob_rv(this->n_category, 0.) ; for(size_t i=from; idata(i,s+j) ; int base_rev = this->n_category - base - 1 ; // N if(base == n_code) { // --------------- forward --------------- { base_prob_fw[a_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[c_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[g_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; base_prob_fw[t_code] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[a_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[c_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[g_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; base_prob_rv[t_code_r] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } // A, C, G, T else { { base_prob_fw[base] += posterior_prob(i,k,s,SequenceLayer::FORWARD) ; } // --------------- reverse --------------- if(this->flip) { base_prob_rv[base_rev] += posterior_prob(i,k,s,SequenceLayer::REVERSE) ; } } } // update this position of the model for(size_t i=0,i_rev=base_prob_fw.size()-1; iflip) { model(k,this->l_model-j-1,i) += base_prob_rv[i] ; } } } } } promise.set_value(model) ; } + +void SequenceLayer::create_bckg_class() +{ // sequence composition + std::vector base_comp = + dna::base_composition(this->data, + this->flip) ; + // set last class as background class + for(size_t i=0; in_category; i++) + { for(size_t j=0; jl_model; j++) + { this->model(this->n_class-1,j,i) = base_comp[i] ; } + } +} diff --git a/src/Clustering/SequenceLayer.hpp b/src/Clustering/SequenceLayer.hpp index 1a1532b..567c113 100644 --- a/src/Clustering/SequenceLayer.hpp +++ b/src/Clustering/SequenceLayer.hpp @@ -1,228 +1,230 @@ #ifndef SEQUENCELAYER_HPP #define SEQUENCELAYER_HPP #include #include #include #include // std::promise, std::future #include #include #include #include typedef std::vector vector_d ; class SequenceLayer : public DataLayer { public: /*! * \brief Computes the log-likelihood of the sub- * sequence - stored in a given row - and starting * at the offset in the given sequence matrix. * The subsequence length is determined by the model * lenght. * \param seq the sequences in integer format. * \param row the row containing the sequence of * interest. * \param col the index at which the sub-sequence * is starting (1st index inside the subsequence * of interest). * \param model_log a model containing the log * probability model. * \return the log-likelihood of the sub-sequence * given the model. * \throw std::invalid_argument if 1) the offset is * invalid, 2) the sequence and the model have * incompatible dimensions or 3) the model 2n dimension * is not 4 (A,C,G,T). */ static double score_subseq(const Matrix2D& seq, size_t row, size_t col, const Matrix2D& model_log) ; public: /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ SequenceLayer(const Matrix2D& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Constructs an object with the * given data and an empty (0 values) * model. * \param data the data. * \param n_class the number of classes * of the model. * \param n_shift the number of shift * states of the model. * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ SequenceLayer(Matrix2D&& data, size_t n_class, size_t n_shift, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Construct an object with the * given data and model. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. * \param model the model with the following * dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ SequenceLayer(const Matrix2D& data, const Matrix3D& model, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * \brief Construct an object with the * given data and model. * The shifting freedom is set to (data number * of columns) - (the model 2nd dimension) * + 1. * \param data the data. The sequences * should be stored as integer values : * A:0, C:1, G:2, T:3, else:5. * \param model the model with the following * dimensions : * dim1 the number of classes * dim2 the model length * dim3 4 (A,C,G,T) * \param flip whether flipping is allowed. - * \param last_class_cst indicates that the - * last class of the model is constant - * and should never be updated by calls to - * update_model(). + * \param bckg_class should the last class + * of the model is constant and models the + * background. */ SequenceLayer(Matrix2D&& data, Matrix3D&& model, bool flip, - bool last_class_cst) ; + bool bckg_class) ; /*! * Destructor */ virtual ~SequenceLayer() override ; /*! * \brief Computes the log likelihood of the data * given the current model parameters. * \param logliklihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of loglikelihood. * Its length should be equal to the data row number. * \throw std::invalid_argument if the dimensions are * incorrect. */ virtual void compute_loglikelihoods(Matrix4D& loglikelihood, vector_d& loglikelihood_max, ThreadPool* threads=nullptr) const override ; /*! * \brief Updates the model given the posterior * probabilities (the probabilities of each row * in the data to be assigned to each class, * for each shift and flip state). * \param posterior_prob the data assignment probabilities to * the different classes. */ virtual void update_model(const Matrix4D& posterior_prob, ThreadPool* threads=nullptr) override ; protected: /*! * \brief The routine that effectively performs the * loglikelihood computations. * \param from the index of the first row of the data * to considered. * \param to the index of the past last row of the data * to considered. * \param loglikelihood a matrix to store the * results. It should have the following dimensions : * 1st : same as the data number of row * 2nd : same as the model number of classes * 3rd : same as the number of shifts * 4th : same as the number of flip states * \param loglikelihood_max a vector containing the * max value for each row of log_likelihood. * Its length should be equal to the data row number. * \param model_log a vector containing the matrices with * the log values of the model for each class. * \param model_log_rev a vector containing the matrices with * the log values of the reverse strand model for each class * (the 1st position in the model becomes the last in the * reverse model and probabilities are swapped A<->T and C<->G). * \param done a promise to be filled when the routine * is done running. */ void compute_loglikelihoods_routine(size_t from, size_t to, Matrix4D& loglikelihood, vector_d& loglikelihood_max, const std::vector>& model_log, const std::vector>& model_log_rev, std::promise& done) const ; /*! * \brief The routine that effectively update the model. * \param from the index of the first row of the * posterior probabilities to considered. * \param to the index of the past last row of the * posterior probabilities to considered. * \param posterior_prob the data assignment probabilities * to the different classes. * \param * \param done a promise containing the partial model * computed from the given data slice. If several routines * work together at updating the model, they need to be * summed and normalized (by the column sum) to get the * final model. */ void update_model_routine(size_t from, size_t to, const Matrix4D& posterior_prob, std::promise>& done) const ; + /*! + * \brief Sets the last class as background class with the + * mean base probababilities of the data at each position. + */ + void create_bckg_class() ; + } ; #endif // SEQUENCELAYER_HPP diff --git a/src/Clustering/SequenceModelComputer.cpp b/src/Clustering/SequenceModelComputer.cpp index 2c96b04..5ddc0dc 100644 --- a/src/Clustering/SequenceModelComputer.cpp +++ b/src/Clustering/SequenceModelComputer.cpp @@ -1,48 +1,49 @@ #include // std::move() #include #include #include #include SequenceModelComputer::SequenceModelComputer(Matrix2D&& data, const Matrix4D& post_prob, + bool bckg_class, size_t n_threads) : ModelComputer(), threads(nullptr) { // parameters size_t n_class = post_prob.get_dim()[1] ; size_t n_shift = post_prob.get_dim()[2] ; size_t n_flip = post_prob.get_dim()[3] ; bool flip = n_flip == 2 ; // the threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } // the data and the model this->data_layer = new SequenceLayer(std::move(data), n_class, n_shift, flip, - false) ; + bckg_class) ; this->data_layer->update_model(post_prob, this->threads) ; } SequenceModelComputer::~SequenceModelComputer() { // threads if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } // data and model if(this->data_layer != nullptr) { delete this->data_layer ; this->data_layer = nullptr ; } } diff --git a/src/Clustering/SequenceModelComputer.hpp b/src/Clustering/SequenceModelComputer.hpp index 63bd814..f185614 100644 --- a/src/Clustering/SequenceModelComputer.hpp +++ b/src/Clustering/SequenceModelComputer.hpp @@ -1,42 +1,46 @@ #ifndef SEQUENCEMODELCOMPUTER_HPP #define SEQUENCEMODELCOMPUTER_HPP #include #include #include #include class SequenceModelComputer : public ModelComputer { public: /*! * \brief Constructs an object to retrieve * the sequence model given the data and their * classification results. * \param data the data. * \param post_prob the data class assignment * probabilities. + * \param bckg_class whether the last class of the + * classification (posterior probabilities) is a + * background class. * \param n_threads the number of parallel threads * to run the computations. 0 means no parallel * computing, everything is run on the main thread. */ SequenceModelComputer(Matrix2D&& data, const Matrix4D& post_prob, + bool bckg_class, size_t n_threads) ; /*! * \brief Destructor. */ virtual ~SequenceModelComputer() override ; protected: /*! * \brief the threads. */ ThreadPool* threads ; } ; #endif // SEQUENCEMODELCOMPUTER_HPP diff --git a/src/main_test.cpp b/src/main_test.cpp index 7020fe0..2efa423 100644 --- a/src/main_test.cpp +++ b/src/main_test.cpp @@ -1,131 +1,182 @@ #include #include #include #include #include -# +#include +#include +#include using namespace std ; template std::ostream& operator << (std::ostream& stream, const std::vector& v) -{ for(const auto x : v) +{ for(const auto& x : v) { stream << x << " " ; } return stream ; } int main() -{ // path +{ + // path std::string bed_file = "/local/groux/scATAC-seq/data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_rmsk_sampled.bed" ; std::string bam_file = "/local/groux/scATAC-seq/data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" ; std::string bai_file = "/local/groux/scATAC-seq/data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" ; - std::string prob_file = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k_peaks_classification_6/peaks_rmsk_sampled_sequences_1kb_23class_prob.mat4d" ; + std::string seq_file = "/local/groux/scATAC-seq/data/genomes/hg19.fasta" ; + std::string prob_file = "/local/groux/scATAC-seq/results/10xgenomics_PBMC_5k_peaks_classification_6.old/peaks_rmsk_sampled_sequences_1kb_23class_prob.mat4d" ; // posterior prob std::cerr << "loading posterior prob" << std::endl ; Matrix4D prob(prob_file) ; size_t n_row = prob.get_dim()[0] ; - size_t n_class = prob.get_dim()[1] ; + // size_t n_class = prob.get_dim()[1] ; size_t n_shift = prob.get_dim()[2] ; size_t n_flip = prob.get_dim()[3] ; bool flip = n_flip == 2 ; std::cerr << "posterior prob " << prob.get_dim() << std::endl ; - // class prob - std::cerr << "computing class probabilities" << std::endl ; - std::vector prob_colsum(n_class, 0.) ; - double tot = 0. ; - for(size_t i=0; i data2 = mc.create_matrix() ; size_t n_col2 = data2.get_ncol() ; std::cerr << "data2 matrix " << data2.get_dim() << std::endl ; + + // realign matrix std::cerr << "computing data3 matrix" << std::endl ; size_t n_col3 = to1 - from1 + 1 ; - Matrix2D data3(n_row, + Matrix3D data3(n_row, n_col3, + 4, // A, C, G, T, N 0.) ; - std::cerr << "data3 matrix " << data3.get_dim() << std::endl ; + + // the int code of A, C, G, T, N + // static int a_code = dna::char_to_int('A') ; + // static int c_code = dna::char_to_int('C') ; + // static int g_code = dna::char_to_int('G') ; + // static int t_code = dna::char_to_int('T') ; + static int n_code = dna::char_to_int('N') ; + // the int code of the reverse complement of A, C, G, T + // static int a_code_r = dna::char_to_int('A', true) ; + // static int c_code_r = dna::char_to_int('C', true) ; + // static int g_code_r = dna::char_to_int('G', true) ; + // static int t_code_r = dna::char_to_int('T', true) ; + + std::cerr << "data3 matrix " << data3.get_dim() << std::endl ; for(size_t i=0; i= to_dat2_rev; j_dat2_rev--, j_dat3_fw++) - { data3(i,j_dat3_fw) += - (prob(i,class_k,s,1) * - data2(i,j_dat2_rev)) / - prob_colsum[class_k] ; + { int base = data2(i,j_dat2_rev) ; + int base_rev = 4 - base - 1 ; + // exclude N's + if(base_rev == n_code) + { continue ; } + double p = prob(i,class_k,s,1) ; + data3(i,j_dat3_fw, base_rev) += p ; + tot += p ; } } } + // normalize + for(size_t s=0; s() ; - // convert to integer - Matrix2D data4(data3.get_nrow(), - data3.get_ncol()) ; + for(size_t j=0; j<10; j++) + // for(size_t j=0; j data4(data3.get_dim()[0], + data3.get_dim()[1]) ; for(size_t i=0; i< data4.get_nrow(); i++) { for(size_t j=0; j::lowest() ; + size_t b_max = 0 ; + for(size_t base=0; base p_max) + { p_max = data3(i,j,base) ; + b_max = base ; + } + } + data4(i,j) = b_max ; + } } + std::cout << data4 << std::endl ; + */ + + // turn to prob matrix + /* + Matrix2D data4(4, data3.get_dim()[1], 0.) ; + for(size_t j=0; j tot(4,0.) ; + + for(size_t i=0; i