diff --git a/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R b/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R index 3c33c58..40d90ae 100644 --- a/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R +++ b/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R @@ -1,151 +1,239 @@ setwd(file.path("", "local", "groux", "scATAC-seq")) if(!file.exists(file.path("results", "10xgenomics_PBMC_5k"))) { dir.create(file.path("results", "10xgenomics_PBMC_5k")) } # library library("RColorBrewer") ############# data ############# data = read.table(file.path("data", "10xgenomics_PBMC_5k", "atac_v1_pbmc_5k_possorted_filtered_fragment_lengths.txt"), header=F) colnames(data) = c("nb", "size") ############# fit to gaussian mixture ############# set.seed(20190604) # d-day - 2 (les sanglots long de l'automne...) # fit data to gaussian mixture model size = data$size[1:1000] dens = data$nb[1:1000] / sum(data$nb[1:1000]) # model parameters, 1st guess by looking at plot m1 = 50 ; s1 = 10 ; a1 = 1 m2 = 200 ; s2 = 10 ; a2 = 1 m3 = 380 ; s3 = 30 ; a3 = 1 # fit init = c(m1=m1, s1=s1, a1=a1, m2=m2, s2=s2, a2=a2, m3=m3, s3=s3, a3=a3) f = nls(dens ~ a1 * exp(-((size-m1)**2)/(2*s1)) + a2 * exp(-((size-m2)**2)/(2*s2)) + a3 * exp(-((size-m3)**2)/(2*s3)), start=init) # parameter estimates param = matrix(nrow=3, ncol=3) colnames(param) = c("m", "s", "a") rownames(param) = c("class1", "class2", "class3") param[1,] = c(coef(f)["m1"], coef(f)["s1"], coef(f)["a1"]) param[2,] = c(coef(f)["m2"], coef(f)["s2"], coef(f)["a2"]) param[3,] = c(coef(f)["m3"], coef(f)["s3"], coef(f)["a3"]) # plot png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_classes.png"), width=10, height=8, units="in", res=720) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) plot(size, dens, type='l', lwd=2, main="Fragment lengths", xlab="length (bp)", ylab="density", cex.main=3, cex.axis=1.5, cex.lab=2.5) col = brewer.pal(4, "Set1") lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])), 
col=col[1], lwd=4, lty=2) lines(size, param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])), col=col[2], lwd=4, lty=2) lines(size, param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[3], lwd=4, lty=2) lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])) + param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])) + param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[4], lwd=4) legend("topright", legend=c("open chromatin", "mono-nucl.", "di-nucl.", "all"), col=col, lwd=c(4,4,4,4), lty=c(2,2,2,1), bty='n', cex=2) dev.off() # assign probabilities to fragment length prob = matrix(nrow=1000, ncol=3) rownames(prob) = size for(i in 1:nrow(prob)) { for(j in 1:ncol(prob)) { prob[i,j] = param[j,3] * exp(-((size[i]-param[j,1])**2)/(2*param[j,2])) } prob[i,] = prob[i,] / sum(prob[i,]) } # plot png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_class_prob.png"), width=10, height=8, units="in", res=720) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) plot(size, prob[,1], ylim=c(0, max(prob)), type='l', - main="Fragment classes", xlab="length (bp)", ylab="p(class)", + main="Fragment class probability", xlab="length (bp)", ylab="p(class)", cex.main=3, cex.axis=1.5, cex.lab=2.5, lwd=4, col=col[1]) lines(size, prob[,2], lwd=4, col=col[2]) lines(size, prob[,3], lwd=4, col=col[3]) # set limits at min 90 assignment to a class abline(v=30, lwd=2, lty=2) # class 1 lower limit (size limit) abline(v=84, lwd=2, lty=2) # class 1 upper limit abline(v=133, lwd=2, lty=2) # class 2 lower limit abline(v=266, lwd=2, lty=2) # class 2 upper limit abline(v=341, lwd=2, lty=2) # class 3 lower limit abline(v=500, lwd=2, lty=2) # class 3 upper limit (size limit) dev.off() ############# break dataset into classes ############# # size limits i_cl1_1 = which(size == 30) i_cl1_2 = which(size == 84) i_cl2_1 = which(size == 133) i_cl2_2 = which(size == 266) i_cl3_1 = which(size == 341) i_cl3_2 = which(size == 500) # nb of reads per class nb_all = 
sum(data$nb) nb_cl1 = sum(data$nb[i_cl1_1:i_cl1_2]) nb_cl2 = sum(data$nb[i_cl2_1:i_cl2_2]) nb_cl3 = sum(data$nb[i_cl3_1:i_cl3_2]) # nb of reads not assigned at the boundaries of classes nb_left1 = sum(data$nb[(i_cl1_2+1):(i_cl2_1-1)]) + sum(data$nb[(i_cl2_2+1):(i_cl3_1-1)]) # nb of reads > 500bp nb_left2 = sum(data$nb[(i_cl3_2+1):length(data$nb)]) nb_left = nb_left1 + nb_left2 # plot classes png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_groups.png"), width=10, height=8, units="in", res=720) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) plot(y=data$nb[1:1000], x=data$size[1:1000], type='l', lwd=4, - main="Fragment lengths", xlab="length (bp)", ylab="frequency", + main="Fragment classes", xlab="length (bp)", ylab="frequency", cex.main=3, cex.axis=1.5, cex.lab=2.5) # show limits abline(v=data$size[i_cl1_1], lwd=3, lty=2, col=col[1]) abline(v=data$size[i_cl1_2], lwd=3, lty=2, col=col[1]) abline(v=data$size[i_cl2_1], lwd=3, lty=2, col=col[2]) abline(v=data$size[i_cl2_2], lwd=3, lty=2, col=col[2]) abline(v=data$size[i_cl3_1], lwd=3, lty=2, col=col[3]) abline(v=data$size[i_cl3_2], lwd=3, lty=2, col=col[3]) # nb of reads in groups - text(x=550, y=0.85*max(data[,1]), labels=sprintf("%.2f mio reads", nb_all/1e6), cex=1.8, pos=4) - text(x=550, y=0.80*max(data[,1]), labels=sprintf("%.2f mio reads class 1", nb_cl1/1e6), cex=1.8, pos=4, col=col[1]) - text(x=550, y=0.75*max(data[,1]), labels=sprintf("%.2f mio reads class 2", nb_cl2/1e6), cex=1.8, pos=4, col=col[2]) - text(x=550, y=0.70*max(data[,1]), labels=sprintf("%.2f mio reads class 3", nb_cl3/1e6), cex=1.8, pos=4, col=col[3]) - text(x=550, y=0.65*max(data[,1]), labels=sprintf("%.2f mio reads left", nb_left/1e6), cex=1.8, pos=4) + text(x=550, y=0.85*max(data[,1]), labels=sprintf("%.2f mio reads", nb_all/1e6), cex=1.8, pos=4) + text(x=550, y=0.80*max(data[,1]), labels=sprintf("%.2f mio reads open", nb_cl1/1e6), cex=1.8, pos=4, col=col[1]) + text(x=550, y=0.75*max(data[,1]), labels=sprintf("%.2f mio reads mono 
nucl.", nb_cl2/1e6), cex=1.8, pos=4, col=col[2]) + text(x=550, y=0.70*max(data[,1]), labels=sprintf("%.2f mio reads di.nucl.", nb_cl3/1e6), cex=1.8, pos=4, col=col[3]) + text(x=550, y=0.65*max(data[,1]), labels=sprintf("%.2f mio reads left", nb_left/1e6), cex=1.8, pos=4) # shade the class areas # class 1 rect(size[i_cl1_1], 0, size[i_cl1_2], max(data$nb), col=rgb(red=1, green=0, blue=0, alpha=0.1), border="transparent") # class 2 rect(size[i_cl2_1], 0, size[i_cl2_2], max(data$nb), col=rgb(red=0, green=0, blue=1, alpha=0.1), border="transparent") # class 3 rect(size[i_cl3_1], 0, size[i_cl3_2], max(data$nb), col=rgb(red=0, green=1, blue=0, alpha=0.1), border="transparent") dev.off() + + + + + + + +# plot +# X11(width=20, height=6) +png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths.png"), width=20, height=6, units="in", res=720) + p = par(mar=c(5.1, 5.1, 5.1, 2.1), + mfrow=c(1,3)) + + # plot fragment sizes and gaussians + plot(size, dens, type='l', lwd=2, + main="Fragment lengths", xlab="length (bp)", ylab="density", + cex.main=3, cex.axis=1.5, cex.lab=2.5) + col = brewer.pal(4, "Set1") + lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])), col=col[1], lwd=4, lty=2) + lines(size, param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])), col=col[2], lwd=4, lty=2) + lines(size, param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[3], lwd=4, lty=2) + lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])) + + param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])) + + param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[4], lwd=4) + legend("topright", + legend=c("open chromatin", + "mono-nucl.", + "di-nucl.", + "all"), + col=col, lwd=c(4,4,4,4), lty=c(2,2,2,1), + bty='n', cex=2) + mtext('A', 3, 0, cex=4.5, at=-80) + + # plot class probability mass function + plot(size, prob[,1], ylim=c(0, max(prob)), type='l', + main="Fragment class probability", xlab="length (bp)", ylab="p(class)", + 
cex.main=3, cex.axis=1.5, cex.lab=2.5, lwd=4, col=col[1]) + lines(size, prob[,2], lwd=4, col=col[2]) + lines(size, prob[,3], lwd=4, col=col[3]) + + # set limits at min 90 assignment to a class + abline(v=30, lwd=2, lty=2) # class 1 lower limit (size limit) + abline(v=84, lwd=2, lty=2) # class 1 upper limit + abline(v=133, lwd=2, lty=2) # class 2 lower limit + abline(v=266, lwd=2, lty=2) # class 2 upper limit + abline(v=341, lwd=2, lty=2) # class 3 lower limit + abline(v=500, lwd=2, lty=2) # class 3 upper limit (size limit) + mtext('B', 3, 0, cex=4.5, at=-80) + + + # plot final categories + plot(y=data$nb[1:1000], x=data$size[1:1000], type='l', lwd=4, + main="Fragment classes", xlab="length (bp)", ylab="frequency", + cex.main=3, cex.axis=1.5, cex.lab=2.5) + # show limits + abline(v=data$size[i_cl1_1], lwd=3, lty=2, col=col[1]) + abline(v=data$size[i_cl1_2], lwd=3, lty=2, col=col[1]) + abline(v=data$size[i_cl2_1], lwd=3, lty=2, col=col[2]) + abline(v=data$size[i_cl2_2], lwd=3, lty=2, col=col[2]) + abline(v=data$size[i_cl3_1], lwd=3, lty=2, col=col[3]) + abline(v=data$size[i_cl3_2], lwd=3, lty=2, col=col[3]) + # nb of reads in groups + text(x=550, y=0.85*max(data[,1]), labels=sprintf("%.2f mio reads", nb_all/1e6), cex=1.8, pos=4) + text(x=550, y=0.80*max(data[,1]), labels=sprintf("%.2f mio reads open", nb_cl1/1e6), cex=1.8, pos=4, col=col[1]) + text(x=550, y=0.75*max(data[,1]), labels=sprintf("%.2f mio reads mono nucl.", nb_cl2/1e6), cex=1.8, pos=4, col=col[2]) + text(x=550, y=0.70*max(data[,1]), labels=sprintf("%.2f mio reads di.nucl.", nb_cl3/1e6), cex=1.8, pos=4, col=col[3]) + text(x=550, y=0.65*max(data[,1]), labels=sprintf("%.2f mio reads left", nb_left/1e6), cex=1.8, pos=4) + # shade the class areas + # class 1 + rect(size[i_cl1_1], + 0, + size[i_cl1_2], + max(data$nb), + col=rgb(red=1, green=0, blue=0, alpha=0.1), border="transparent") + # class 2 + rect(size[i_cl2_1], + 0, + size[i_cl2_2], + max(data$nb), + col=rgb(red=0, green=0, blue=1, alpha=0.1), 
border="transparent") + # class 3 + rect(size[i_cl3_1], + 0, + size[i_cl3_2], + max(data$nb), + col=rgb(red=0, green=1, blue=0, alpha=0.1), border="transparent") + mtext('C', 3, 0, cex=4.5, at=-80) +dev.off() diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.R index a67e09a..fca9588 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.R @@ -1,277 +1,277 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) ################## aggregations around CTCF motifs ################## # data # open chromatin -data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin1bp_fragment.mat"))) -data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin2bp_fragment.mat"))) -data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin10bp_fragment.mat"))) +data.open.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin1bp_fragment.mat"))) +data.open.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin2bp_fragment.mat"))) +data.open.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin10bp_fragment.mat"))) -data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin1bp_read.mat"))) -data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin2bp_read.mat"))) -data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin10bp_read.mat"))) +data.open.1.read = 
as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin1bp_read.mat"))) +data.open.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin2bp_read.mat"))) +data.open.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin10bp_read.mat"))) -data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"))) -data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin2bp_read_atac.mat"))) -data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin10bp_read_atac.mat"))) +data.open.1.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"))) +data.open.2.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin2bp_read_atac.mat"))) +data.open.10.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_open_bin10bp_read_atac.mat"))) # mono-nucleosomes -data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin1bp_fragment.mat"))) -data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin2bp_fragment.mat"))) -data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin10bp_fragment.mat"))) +data.1nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin1bp_fragment.mat"))) +data.1nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin2bp_fragment.mat"))) +data.1nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", 
"ctcf_motifs_10e-6_1nucl_bin10bp_fragment.mat"))) -data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin1bp_read.mat"))) -data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin2bp_read.mat"))) -data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin10bp_read.mat"))) +data.1nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin1bp_read.mat"))) +data.1nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin2bp_read.mat"))) +data.1nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin10bp_read.mat"))) -data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"))) -data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"))) -data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin10bp_fragment_center.mat"))) +data.1nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"))) +data.1nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"))) +data.1nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_1nucl_bin10bp_fragment_center.mat"))) # di-nucleosomes -data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin1bp_fragment.mat"))) -data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", 
"ctcf_motifs_10e-6_2nucl_bin2bp_fragment.mat"))) -data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin10bp_fragment.mat"))) +data.2nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin1bp_fragment.mat"))) +data.2nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin2bp_fragment.mat"))) +data.2nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin10bp_fragment.mat"))) -data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin1bp_read.mat"))) -data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin2bp_read.mat"))) -data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin10bp_read.mat"))) +data.2nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin1bp_read.mat"))) +data.2nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin2bp_read.mat"))) +data.2nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin10bp_read.mat"))) -data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin1bp_fragment_center.mat"))) -data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin2bp_fragment_center.mat"))) -data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin10bp_fragment_center.mat"))) +data.2nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin1bp_fragment_center.mat"))) +data.2nucl.2.cent 
= as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin2bp_fragment_center.mat"))) +data.2nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nucl_bin10bp_fragment_center.mat"))) # mono-nucleosomes from di-nucleosome data -data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat"))) -data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat"))) -data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat"))) +data.nucls.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat"))) +data.nucls.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat"))) +data.nucls.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat"))) -data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat"))) -data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat"))) -data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat"))) +data.nucls.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat"))) +data.nucls.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat"))) +data.nucls.10.read = as.matrix(read.table(file.path("data", 
"10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat"))) -data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat"))) -data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat"))) -data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat"))) +data.nucls.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat"))) +data.nucls.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat"))) +data.nucls.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat"))) # colors col = brewer.pal(4, "Set1") # x-axis axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5) axis.lab.1 = seq(-400, 400, by=200) axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5) axis.lab.2 = seq(-400, 400, by=200) axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5) axis.lab.10 = seq(-1000, 1000, by=500) # X11(width=12, height=12) png(filename=file.path("results/10xgenomics_PBMC_5k/ctcf_motifs_10e-6_aggregations.png"), units="in", res=720, width=12, height=9) m = matrix(nrow=4, ncol=4, data=c(16,13,14,15, 10, 1, 4, 7, 11, 2, 5, 8, 12, 3, 6, 9), byrow=T) l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) layout.show(l) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution ## entire fragments plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', cex.axis=2, cex.lab=2) lines(colMeans(data.open.1.frag), col=col[1], lwd=3) lines(colMeans(data.1nucl.1.frag), col=col[2], 
lwd=3) lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## entire reads plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3) lines(colMeans(data.nucls.1.read), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution ## entire fragments plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## entire reads plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3) lines(colMeans(data.nucls.2.read), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", 
xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 10bp resolution ## entire fragments plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## entire reads plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3) lines(colMeans(data.nucls.10.read), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) # some legends over the rows and columns p = par(mar=c(0,0,0,0)) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="FRAGMENTS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="READS", cex=2, srt=90) plot(0, 
0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-400bp by 1bp", cex=2) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-400bp by 2bp", cex=2) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-1kp by 10bp", cex=2) par(p) dev.off() # footprint # x-axis axis.lab.1 = seq(-200, 200, by=100) axis.at.1 = seq(0, 400, length.out=length(axis.lab.1)) axis.lab.2 = seq(-200, 200, by=100) axis.at.2 = seq(0, 200, length.out=length(axis.lab.2)) axis.lab.10 = seq(-200, 200, by=100) axis.at.10 = seq(0, 41, length.out=length(axis.lab.10)) # X11(width=10, height=12) png(filename=file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_footprint.png"), units="in", res=720, width=10, height=12) p = par(mfrow=c(3,1), mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution index = 200:600 x = 1:length(index) plot(x, colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])), type='l', lwd=3, col=col[1], main="CTCF motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])), lwd=3, col=col[4]) abline(v=191, lwd=3, lty=2) abline(v=211, lwd=3, lty=2) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution index = 100:300 x = 1:length(index) plot(x, colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])), type='l', lwd=3, col=col[1], main="CTCF motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])), lwd=3, col=col[2]) lines(x, 
colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])), lwd=3, col=col[4]) abline(v=96, lwd=3, lty=2) abline(v=106, lwd=3, lty=2) -axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) +axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 10bp resolution index = 80:120 x = 1:length(index) plot(x, colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])), type='l', lwd=3, col=col[1], main="CTCF motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])), lwd=3, col=col[4]) abline(v=20, lwd=3, lty=2) abline(v=22, lwd=3, lty=2) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) par(p) dev.off() diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh index 32860f2..b103ae9 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_motif.sh @@ -1,89 +1,96 @@ # some paths ## directories results_dir='data/10xgenomics_PBMC_5k_motifs' read_dir="data/10xgenomics_PBMC_5k" seq_dir="data/genomes" ## input1 file_bed=$read_dir'/ctcf_motifs_10e-6.bed' +file_bed_rmsk=$read_dir'/ctcf_motifs_10e-6_rmsk.bed' file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" file_bam_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam" file_bai_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai" file_bam_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam" file_bai_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai" file_bam_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam"
file_bai_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai" file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" file_hg19="$seq_dir/hg19.fasta" +file_rmsk="$seq_dir/hg19_rmsk.bed" mkdir -p $results_dir +# filter out motifs with >=30% repeated region inside +bin/bedtools/bedtools subtract -A -f 0.3 -a $file_bed -b $file_rmsk > $file_bed_rmsk + # matrix creation -## sequences +## sequences and sequenced repeat masked file_mat_seq="$results_dir/ctcf_motifs_10e-6_sequences.mat" -bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq +file_mat_seq_rmsk="$results_dir/ctcf_motifs_10e-6_sequences_rmsk.mat" +bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq +bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq_rmsk ## open chromatin around CTCF motif for method in 'read' 'read_atac' 'fragment' do file_mat_open_1="$results_dir/ctcf_motifs_10e-6_open_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1 file_mat_open_2="$results_dir/ctcf_motifs_10e-6_open_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2 file_mat_open_10="$results_dir/ctcf_motifs_10e-6_open_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10 done ## mono around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_1nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed 
--bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_1nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_1nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## di nucleosomes around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### di nucleosomes file_mat_2nucl_1="$results_dir/ctcf_motifs_10e-6_2nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1 file_mat_2nucl_2="$results_dir/ctcf_motifs_10e-6_2nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2 file_mat_2nucl_10="$results_dir/ctcf_motifs_10e-6_2nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10 done ## mono nucleosomes from processed di-nucleosome data around CTCF motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 
400 --binSize 2 --method $method > $file_mat_1nucl_2
     file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat"
     bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
 done
 ## all nucleosomes around CTCF motif
 for method in 'read' 'fragment' 'fragment_center'
 do
     ### mono nucleosomes
     file_mat_nucl_1="$results_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_$method.mat"
     bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_nucl_1
     file_mat_nucl_2="$results_dir/ctcf_motifs_10e-6_nucleosomes_bin2bp_$method.mat"
     bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_nucl_2
     file_mat_nucl_10="$results_dir/ctcf_motifs_10e-6_nucleosomes_bin10bp_$method.mat"
     bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_nucl_10
 done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_myc_motif.sh
new file mode 100755
index 0000000..f43146f
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_myc_motif.sh
@@ -0,0 +1,44 @@
+# some paths (all relative: run from the directory that contains data/ and bin/)
+## directories
+results_dir='data/10xgenomics_PBMC_5k_motifs'
+read_dir="data/10xgenomics_PBMC_5k"
+seq_dir="data/genomes"
+## input
+file_bed_ctcf=$read_dir'/ctcf_motifs_10e-6_rmsk.bed'
+file_bed_myc=$read_dir'/myc_motifs_10e-6_rmsk.bed'
+file_bed=$read_dir'/ctcf_motifs_10e-6_myc_motifs_10e-6_rmsk.bed'
+file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam"
+file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai"
+file_hg19="$seq_dir/hg19.fasta"
+file_rmsk="$seq_dir/hg19_rmsk.bed" # not referenced below
+
+mkdir -p "$results_dir"
+
+# merge 2000 CTCF peaks and 2000 myc peaks (random draws; the count is the head -n value below)
+: > "$file_bed" # start from an empty file so that a re-run does not append to the previous sample
+shuf "$file_bed_ctcf" | head -n 2000 >> "$file_bed"
+shuf "$file_bed_myc" | head -n 2000 >> "$file_bed"
+
+# matrix creation
+## sequences and sequenced repeat masked
+file_mat_seq="$results_dir/ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk.mat"
+bin/SequenceMatrixCreator --bed "$file_bed" --fasta "$file_hg19" --from -400 --to 400 > "$file_mat_seq"
+
+## open chromatin around motifs
+for method in 'read_atac'
+do
+    file_mat_open_1="${results_dir}/ctcf_motifs_10e-6_myc_motifs_10e-6_open_bin1bp_${method}_rmsk.mat"
+    bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_open" --bai "$file_bai_open" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_open_1"
+done
+
+## all nucleosomes around motifs
+for method in 'fragment_center'
+do
+    ### all nucleosomes (reads from the *_nucleosomes.bam file)
+    file_mat_nucl_1="${results_dir}/ctcf_motifs_10e-6_myc_motifs_10e-6_nucleosomes_bin1bp_${method}_rmsk.mat"
+    bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_nucl" --bai "$file_bai_nucl" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_nucl_1"
+done
+
+
diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_sp1_motif.sh
new file mode 100755
index 0000000..2c1aaed
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ctcf_sp1_motif.sh
@@ -0,0 +1,44 @@
+# some paths
+## directories
+results_dir='data/10xgenomics_PBMC_5k_motifs'
+read_dir="data/10xgenomics_PBMC_5k"
+seq_dir="data/genomes"
+## input
+file_bed_ctcf=$read_dir'/ctcf_motifs_10e-6_rmsk.bed'
+file_bed_sp1=$read_dir'/sp1_motifs_10e-7_rmsk.bed'
+file_bed=$read_dir'/ctcf_motifs_10e-6_sp1_motifs_10e-7_rmsk.bed'
+file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam"
+file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai"
+file_hg19="$seq_dir/hg19.fasta"
+file_rmsk="$seq_dir/hg19_rmsk.bed" # not referenced below
+
+mkdir -p "$results_dir"
+
+# merge 5000 CTCF peaks and 5000 SP1 peaks (random draws; the count is the head -n value below)
+: > "$file_bed" # start from an empty file so that a re-run does not append to the previous sample
+shuf "$file_bed_ctcf" | head -n 5000 >> "$file_bed"
+shuf "$file_bed_sp1" | head -n 5000 >> "$file_bed"
+
+# matrix creation
+## sequences and sequenced repeat masked
+file_mat_seq="$results_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk.mat"
+bin/SequenceMatrixCreator --bed "$file_bed" --fasta "$file_hg19" --from -400 --to 400 > "$file_mat_seq"
+
+## open chromatin around motifs
+for method in 'read_atac'
+do
+    file_mat_open_1="${results_dir}/ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_${method}_rmsk.mat"
+    bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_open" --bai "$file_bai_open" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_open_1"
+done
+
+## all nucleosomes around motifs
+for method in 'fragment_center'
+do
+    ### all nucleosomes (reads from the *_nucleosomes.bam file)
+    file_mat_nucl_1="${results_dir}/ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_${method}_rmsk.mat"
+    bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_nucl" --bai "$file_bai_nucl" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_nucl_1"
+done
+
+
diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.R
index 2504612..de580b7 100644
--- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.R
+++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_ebf1_motif.R
@@ -1,307 +1,307 @@
 setwd(file.path("/", "local", "groux", "scATAC-seq"))
 # libraries
 library(RColorBrewer)
 # functions
 source(file.path("scripts", "functions.R"))
 ################## aggregations around ebf1 motifs ##################
 # data
 # open chromatin
-data.open.1.frag = 
as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin1bp_fragment.mat"))) -data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin2bp_fragment.mat"))) -data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin10bp_fragment.mat"))) +data.open.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin1bp_fragment.mat"))) +data.open.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin2bp_fragment.mat"))) +data.open.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin10bp_fragment.mat"))) -data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin1bp_read.mat"))) -data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin2bp_read.mat"))) -data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin10bp_read.mat"))) +data.open.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin1bp_read.mat"))) +data.open.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin2bp_read.mat"))) +data.open.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin10bp_read.mat"))) -data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"))) -data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin2bp_read_atac.mat"))) -data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin10bp_read_atac.mat"))) 
+data.open.1.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"))) +data.open.2.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin2bp_read_atac.mat"))) +data.open.10.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_open_bin10bp_read_atac.mat"))) # mono-nucleosomes -data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin1bp_fragment.mat"))) -data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin2bp_fragment.mat"))) -data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin10bp_fragment.mat"))) +data.1nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin1bp_fragment.mat"))) +data.1nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin2bp_fragment.mat"))) +data.1nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin10bp_fragment.mat"))) -data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin1bp_read.mat"))) -data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin2bp_read.mat"))) -data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin10bp_read.mat"))) +data.1nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin1bp_read.mat"))) +data.1nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin2bp_read.mat"))) +data.1nucl.10.read = as.matrix(read.table(file.path("data", 
"10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin10bp_read.mat"))) -data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"))) -data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"))) -data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin10bp_fragment_center.mat"))) +data.1nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"))) +data.1nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"))) +data.1nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_1nucl_bin10bp_fragment_center.mat"))) # di-nucleosomes -data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin1bp_fragment.mat"))) -data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin2bp_fragment.mat"))) -data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin10bp_fragment.mat"))) +data.2nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin1bp_fragment.mat"))) +data.2nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin2bp_fragment.mat"))) +data.2nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin10bp_fragment.mat"))) -data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin1bp_read.mat"))) -data.2nucl.2.read = as.matrix(read.table(file.path("results", 
"10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin2bp_read.mat"))) -data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin10bp_read.mat"))) +data.2nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin1bp_read.mat"))) +data.2nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin2bp_read.mat"))) +data.2nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin10bp_read.mat"))) -data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin1bp_fragment_center.mat"))) -data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin2bp_fragment_center.mat"))) -data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin10bp_fragment_center.mat"))) +data.2nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin1bp_fragment_center.mat"))) +data.2nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin2bp_fragment_center.mat"))) +data.2nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nucl_bin10bp_fragment_center.mat"))) # mono-nucleosomes from di-nucleosome data -data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat"))) -data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat"))) -data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat"))) +data.nucls.1.frag = 
as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat"))) +data.nucls.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat"))) +data.nucls.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat"))) -data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat"))) -data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat"))) -data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat"))) +data.nucls.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat"))) +data.nucls.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat"))) +data.nucls.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat"))) -data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat"))) -data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat"))) -data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat"))) +data.nucls.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat"))) +data.nucls.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", 
"ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat"))) +data.nucls.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat"))) # colors col = brewer.pal(4, "Set1") # x-axis axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5) axis.lab.1 = seq(-400, 400, by=200) axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5) axis.lab.2 = seq(-400, 400, by=200) axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5) axis.lab.10 = seq(-1000, 1000, by=500) # X11(width=12, height=12) png(filename=file.path("results/10xgenomics_PBMC_5k/ebf1_motifs_10e-6_aggregations.png"), units="in", res=720, width=12, height=9) m = matrix(nrow=4, ncol=4, data=c(16,13,14,15, 10, 1, 4, 7, 11, 2, 5, 8, 12, 3, 6, 9), byrow=T) l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) layout.show(l) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.1.frag)), max(colMeans(data.open.1.frag)), max(colMeans(data.1nucl.1.frag)), max(colMeans(data.2nucl.1.frag)), max(colMeans(data.nucls.1.frag)))) plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.open.1.frag), col=col[1], lwd=3) lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.1.read)), max(colMeans(data.open.1.read)), max(colMeans(data.1nucl.1.read)), max(colMeans(data.2nucl.1.read)), max(colMeans(data.nucls.1.read)))) plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3) 
lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3) lines(colMeans(data.nucls.1.read), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.2.frag)), max(colMeans(data.open.2.frag)), max(colMeans(data.1nucl.2.frag)), max(colMeans(data.2nucl.2.frag)), max(colMeans(data.nucls.2.frag)))) plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.2.read)), max(colMeans(data.open.2.read)), max(colMeans(data.1nucl.2.read)), max(colMeans(data.2nucl.2.read)), max(colMeans(data.nucls.2.read)))) plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3) lines(colMeans(data.nucls.2.read), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", 
xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 10bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.10.frag)), max(colMeans(data.open.10.frag)), max(colMeans(data.1nucl.10.frag)), max(colMeans(data.2nucl.10.frag)), max(colMeans(data.nucls.10.frag)))) plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.10.read)), max(colMeans(data.open.10.read)), max(colMeans(data.1nucl.10.read)), max(colMeans(data.2nucl.10.read)), max(colMeans(data.nucls.10.read)))) plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3) lines(colMeans(data.nucls.10.read), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)), col=col[3], lwd=3) 
lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)), col=col[4], lwd=3)
 axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
 # some legends over the rows and columns
 p = par(mar=c(0,0,0,0))
 plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
 text(0, 0, labels="FRAGMENTS", cex=2, srt=90)
 plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
 text(0, 0, labels="READS", cex=2, srt=90)
 plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
 text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90)
 plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
 text(0, 0, labels="+/-400bp by 1bp", cex=2)
 plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
 text(0, 0, labels="+/-400bp by 2bp", cex=2)
 plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
-text(0, 0, labels="+/-1kp by 10bp", cex=2)
+text(0, 0, labels="+/-1kb by 10bp", cex=2)
 par(p)
 dev.off()
-# footprint
+# footprint: each panel plots a sub-window (index) of one matrix against x = 1..length(index); axis.at.N / axis.lab.N are the tick positions / labels of the N-bp panel
 # x-axis
 axis.lab.1 = seq(-200, 200, by=100)
 axis.at.1 = seq(0, 400, length.out=length(axis.lab.1))
 axis.lab.2 = seq(-200, 200, by=100)
 axis.at.2 = seq(0, 200, length.out=length(axis.lab.2))
 axis.lab.10 = seq(-200, 200, by=100)
 axis.at.10 = seq(0, 41, length.out=length(axis.lab.10))
 # X11(width=10, height=12)
 png(filename=file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_footprint.png"), units="in", res=720, width=10, height=12)
 p = par(mfrow=c(3,1), mar=c(5.1, 5.1, 4.1, 2.1))
 # 1bp resolution
 index = 200:600
 x = 1:length(index)
 plot(x, colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])), type='l', lwd=3, col=col[1], main="EBF1 motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2)
 lines(x, colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])), lwd=3, col=col[2])
 lines(x, colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])), lwd=3, col=col[4])
 abline(v=191, lwd=3, lty=2)
 abline(v=211, lwd=3, lty=2)
 axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
 # 2bp resolution
 index = 100:300
 x = 1:length(index)
 plot(x, colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])), type='l', lwd=3, col=col[1], main="EBF1 motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2)
 lines(x, colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])), lwd=3, col=col[2])
 lines(x, colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])), lwd=3, col=col[4])
 abline(v=96, lwd=3, lty=2)
 abline(v=106, lwd=3, lty=2)
-axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # x spans 201 bins here, so the ticks are axis.at.2 (0..200) and not axis.at.1 (0..400)
 # 10bp resolution
 index = 80:120
 x = 1:length(index)
 plot(x, colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])), type='l', lwd=3, col=col[1], main="EBF1 motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2)
 lines(x, colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])), lwd=3, col=col[2])
 lines(x, colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])), lwd=3, col=col[4])
 abline(v=20, lwd=3, lty=2)
 abline(v=22, lwd=3, lty=2)
 axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
 par(p)
 dev.off()
diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.R
index 6fcdefb..1145a83 100644
--- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.R
+++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.R
@@ -1,307 +1,307 @@
 setwd(file.path("/", "local", "groux", "scATAC-seq"))
 # libraries
 library(RColorBrewer)
 # functions
 source(file.path("scripts", "functions.R"))
 ################## aggregations around myc motifs ##################
 # data
 # open chromatin
-data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin1bp_fragment.mat")))
-data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", 
"myc_motifs_10e-6_open_bin2bp_fragment.mat"))) -data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin10bp_fragment.mat"))) +data.open.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin1bp_fragment.mat"))) +data.open.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin2bp_fragment.mat"))) +data.open.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin10bp_fragment.mat"))) -data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin1bp_read.mat"))) -data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin2bp_read.mat"))) -data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin10bp_read.mat"))) +data.open.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin1bp_read.mat"))) +data.open.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin2bp_read.mat"))) +data.open.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin10bp_read.mat"))) -data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin1bp_read_atac.mat"))) -data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin2bp_read_atac.mat"))) -data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin10bp_read_atac.mat"))) +data.open.1.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin1bp_read_atac.mat"))) +data.open.2.atac = as.matrix(read.table(file.path("data", 
"10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin2bp_read_atac.mat"))) +data.open.10.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_open_bin10bp_read_atac.mat"))) # mono-nucleosomes -data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin1bp_fragment.mat"))) -data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin2bp_fragment.mat"))) -data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin10bp_fragment.mat"))) +data.1nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin1bp_fragment.mat"))) +data.1nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin2bp_fragment.mat"))) +data.1nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin10bp_fragment.mat"))) -data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin1bp_read.mat"))) -data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin2bp_read.mat"))) -data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin10bp_read.mat"))) +data.1nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin1bp_read.mat"))) +data.1nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin2bp_read.mat"))) +data.1nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin10bp_read.mat"))) -data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"))) 
-data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"))) -data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin10bp_fragment_center.mat"))) +data.1nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"))) +data.1nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"))) +data.1nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_1nucl_bin10bp_fragment_center.mat"))) # di-nucleosomes -data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin1bp_fragment.mat"))) -data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin2bp_fragment.mat"))) -data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin10bp_fragment.mat"))) +data.2nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin1bp_fragment.mat"))) +data.2nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin2bp_fragment.mat"))) +data.2nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin10bp_fragment.mat"))) -data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin1bp_read.mat"))) -data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin2bp_read.mat"))) -data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin10bp_read.mat"))) +data.2nucl.1.read = 
as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin1bp_read.mat"))) +data.2nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin2bp_read.mat"))) +data.2nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin10bp_read.mat"))) -data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin1bp_fragment_center.mat"))) -data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin2bp_fragment_center.mat"))) -data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin10bp_fragment_center.mat"))) +data.2nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin1bp_fragment_center.mat"))) +data.2nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin2bp_fragment_center.mat"))) +data.2nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nucl_bin10bp_fragment_center.mat"))) # mono-nucleosomes from di-nucleosome data -data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat"))) -data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat"))) -data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat"))) +data.nucls.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat"))) +data.nucls.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", 
"myc_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat"))) +data.nucls.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat"))) -data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat"))) -data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat"))) -data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat"))) +data.nucls.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat"))) +data.nucls.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat"))) +data.nucls.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat"))) -data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat"))) -data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat"))) -data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat"))) +data.nucls.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat"))) +data.nucls.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat"))) +data.nucls.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat"))) # colors col 
= brewer.pal(4, "Set1") # x-axis axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5) axis.lab.1 = seq(-400, 400, by=200) axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5) axis.lab.2 = seq(-400, 400, by=200) axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5) axis.lab.10 = seq(-1000, 1000, by=500) # X11(width=12, height=12) png(filename=file.path("results/10xgenomics_PBMC_5k/myc_motifs_10e-6_aggregations.png"), units="in", res=720, width=12, height=9) m = matrix(nrow=4, ncol=4, data=c(16,13,14,15, 10, 1, 4, 7, 11, 2, 5, 8, 12, 3, 6, 9), byrow=T) l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) layout.show(l) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.1.frag)), max(colMeans(data.open.1.frag)), max(colMeans(data.1nucl.1.frag)), max(colMeans(data.2nucl.1.frag)), max(colMeans(data.nucls.1.frag)))) plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.open.1.frag), col=col[1], lwd=3) lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.1.read)), max(colMeans(data.open.1.read)), max(colMeans(data.1nucl.1.read)), max(colMeans(data.2nucl.1.read)), max(colMeans(data.nucls.1.read)))) plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3) lines(colMeans(data.nucls.1.read), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)), 
col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.2.frag)), max(colMeans(data.open.2.frag)), max(colMeans(data.1nucl.2.frag)), max(colMeans(data.2nucl.2.frag)), max(colMeans(data.nucls.2.frag)))) plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.2.read)), max(colMeans(data.open.2.read)), max(colMeans(data.1nucl.2.read)), max(colMeans(data.2nucl.2.read)), max(colMeans(data.nucls.2.read)))) plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3) lines(colMeans(data.nucls.2.read), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)), col=col[3], lwd=3) 
lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 10bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.10.frag)), max(colMeans(data.open.10.frag)), max(colMeans(data.1nucl.10.frag)), max(colMeans(data.2nucl.10.frag)), max(colMeans(data.nucls.10.frag)))) plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.10.read)), max(colMeans(data.open.10.read)), max(colMeans(data.1nucl.10.read)), max(colMeans(data.2nucl.10.read)), max(colMeans(data.nucls.10.read)))) plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3) lines(colMeans(data.nucls.10.read), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) # some legends over the rows and columns p = par(mar=c(0,0,0,0)) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') 
text(0, 0, labels="FRAGMENTS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="READS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-400bp by 1bp", cex=2) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-400bp by 2bp", cex=2) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-1kp by 10bp", cex=2) par(p) dev.off() # footprint # x-axis axis.lab.1 = seq(-200, 200, by=100) axis.at.1 = seq(0, 400, length.out=length(axis.lab.1)) axis.lab.2 = seq(-200, 200, by=100) axis.at.2 = seq(0, 200, length.out=length(axis.lab.2)) axis.lab.10 = seq(-200, 200, by=100) axis.at.10 = seq(0, 41, length.out=length(axis.lab.10)) # X11(width=10, height=12) png(filename=file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_footprint.png"), units="in", res=720, width=10, height=12) p = par(mfrow=c(3,1), mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution index = 200:600 x = 1:length(index) plot(x, colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])), type='l', lwd=3, col=col[1], main="myc motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])), lwd=3, col=col[4]) abline(v=191, lwd=3, lty=2) abline(v=211, lwd=3, lty=2) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution index = 100:300 x = 1:length(index) plot(x, colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])), type='l', lwd=3, col=col[1], main="myc motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, 
colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])), lwd=3, col=col[4]) abline(v=96, lwd=3, lty=2) abline(v=106, lwd=3, lty=2) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 10bp resolution index = 80:120 x = 1:length(index) plot(x, colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])), type='l', lwd=3, col=col[1], main="myc motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])), lwd=3, col=col[4]) abline(v=20, lwd=3, lty=2) abline(v=22, lwd=3, lty=2) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) par(p) dev.off() \ No newline at end of file diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.sh index d927a88..64a7ec2 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_myc_motif.sh @@ -1,179 +1,186 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k' data_dir='data' read_dir="$data_dir/10xgenomics_PBMC_5k" seq_dir="$data_dir/genomes" ## input1 file_bed=$read_dir'/myc_motifs_10e-6.bed' +file_bed_rmsk=$read_dir'/myc_motifs_10e-6_rmsk.bed' file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" file_bam_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam" file_bai_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai" file_bam_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam" file_bai_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai" 
file_bam_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam" file_bai_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai" file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" file_hg19="$seq_dir/hg19.fasta" +file_rmsk="$seq_dir/hg19_rmsk.bed" mkdir -p $results_dir +# filter out motifs with >=30% repeated region inside +bin/bedtools/bedtools subtract -A -f 0.3 -a $file_bed -b $file_rmsk > $file_bed_rmsk + # matrix creation ## sequences file_mat_seq="$results_dir/myc_motifs_10e-6_sequences.mat" -# bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq +file_mat_seq_rmsk="$results_dir/myc_motifs_10e-6_sequences_rmsk.mat" +bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq +bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq_rmsk ## open chromatin around myc motif for method in 'read' 'read_atac' 'fragment' do file_mat_open_1="$results_dir/myc_motifs_10e-6_open_bin1bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1 file_mat_open_2="$results_dir/myc_motifs_10e-6_open_bin2bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2 file_mat_open_10="$results_dir/myc_motifs_10e-6_open_bin10bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10 done ## mono around myc motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/myc_motifs_10e-6_1nucl_bin1bp_$method.mat" # 
bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/myc_motifs_10e-6_1nucl_bin2bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/myc_motifs_10e-6_1nucl_bin10bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## di nucleosomes around myc motif for method in 'read' 'fragment' 'fragment_center' do ### di nucleosomes file_mat_2nucl_1="$results_dir/myc_motifs_10e-6_2nucl_bin1bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1 file_mat_2nucl_2="$results_dir/myc_motifs_10e-6_2nucl_bin2bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2 file_mat_2nucl_10="$results_dir/myc_motifs_10e-6_2nucl_bin10bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10 done ## mono nucleosomes from processed di-nucleosome data around myc motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam 
$file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat" # bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## all nucleosomes around myc motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_nucl_1="$results_dir/myc_motifs_10e-6_nucleosomes_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_nucl_1 file_mat_nucl_2="$results_dir/myc_motifs_10e-6_nucleosomes_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_nucl_2 file_mat_nucl_10="$results_dir/myc_motifs_10e-6_nucleosomes_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_nucl_10 done # some paths ## directories results_dir='results/10xgenomics_PBMC_5k' data_dir='data' read_dir="$data_dir/10xgenomics_PBMC_5k" seq_dir="$data_dir/genomes" ## input1 file_bed=$read_dir'/myc_motifs_10e-6.bed' file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" file_bam_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam" file_bai_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai" file_bam_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam" file_bai_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai" file_bam_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam" 
file_bai_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai" file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" file_hg19="$seq_dir/hg19.fasta" mkdir -p $results_dir # matrix creation ## sequences file_mat_seq="$results_dir/myc_motifs_10e-6_sequences.mat" bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq ## open chromatin around myc motif for method in 'read' 'read_atac' 'fragment' do file_mat_open_1="$results_dir/myc_motifs_10e-6_open_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1 file_mat_open_2="$results_dir/myc_motifs_10e-6_open_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2 file_mat_open_10="$results_dir/myc_motifs_10e-6_open_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10 done ## mono around myc motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/myc_motifs_10e-6_1nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/myc_motifs_10e-6_1nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/myc_motifs_10e-6_1nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl 
--from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## di nucleosomes around myc motif for method in 'read' 'fragment' 'fragment_center' do ### di nucleosomes file_mat_2nucl_1="$results_dir/myc_motifs_10e-6_2nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1 file_mat_2nucl_2="$results_dir/myc_motifs_10e-6_2nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2 file_mat_2nucl_10="$results_dir/myc_motifs_10e-6_2nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10 done ## mono nucleosomes from processed di-nucleosome data around myc motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## all nucleosomes around myc motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_nucl_1="$results_dir/myc_motifs_10e-6_nucleosomes_bin1bp_$method.mat" 
bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_nucl_1 file_mat_nucl_2="$results_dir/myc_motifs_10e-6_nucleosomes_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_nucl_2 file_mat_nucl_10="$results_dir/myc_motifs_10e-6_nucleosomes_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_nucl_10 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.R index 5011e4b..267a246 100644 --- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.R +++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.R @@ -1,307 +1,307 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) ################## aggregations around myc motifs ################## # data # open chromatin -data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin1bp_fragment.mat"))) -data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin2bp_fragment.mat"))) -data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin10bp_fragment.mat"))) +data.open.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin1bp_fragment.mat"))) +data.open.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin2bp_fragment.mat"))) +data.open.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin10bp_fragment.mat"))) 
-data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin1bp_read.mat"))) -data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin2bp_read.mat"))) -data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin10bp_read.mat"))) +data.open.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin1bp_read.mat"))) +data.open.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin2bp_read.mat"))) +data.open.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin10bp_read.mat"))) -data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin1bp_read_atac.mat"))) -data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin2bp_read_atac.mat"))) -data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin10bp_read_atac.mat"))) +data.open.1.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin1bp_read_atac.mat"))) +data.open.2.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin2bp_read_atac.mat"))) +data.open.10.atac = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_open_bin10bp_read_atac.mat"))) # mono-nucleosomes -data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin1bp_fragment.mat"))) -data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin2bp_fragment.mat"))) -data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", 
"sp1_motifs_10e-7_1nucl_bin10bp_fragment.mat"))) +data.1nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin1bp_fragment.mat"))) +data.1nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin2bp_fragment.mat"))) +data.1nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin10bp_fragment.mat"))) -data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin1bp_read.mat"))) -data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin2bp_read.mat"))) -data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin10bp_read.mat"))) +data.1nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin1bp_read.mat"))) +data.1nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin2bp_read.mat"))) +data.1nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin10bp_read.mat"))) -data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"))) -data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin2bp_fragment_center.mat"))) -data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin10bp_fragment_center.mat"))) +data.1nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"))) +data.1nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin2bp_fragment_center.mat"))) +data.1nucl.10.cent = 
as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_1nucl_bin10bp_fragment_center.mat"))) # di-nucleosomes -data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin1bp_fragment.mat"))) -data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin2bp_fragment.mat"))) -data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin10bp_fragment.mat"))) +data.2nucl.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin1bp_fragment.mat"))) +data.2nucl.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin2bp_fragment.mat"))) +data.2nucl.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin10bp_fragment.mat"))) -data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin1bp_read.mat"))) -data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin2bp_read.mat"))) -data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin10bp_read.mat"))) +data.2nucl.1.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin1bp_read.mat"))) +data.2nucl.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin2bp_read.mat"))) +data.2nucl.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin10bp_read.mat"))) -data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin1bp_fragment_center.mat"))) -data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", 
"sp1_motifs_10e-7_2nucl_bin2bp_fragment_center.mat"))) -data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin10bp_fragment_center.mat"))) +data.2nucl.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin1bp_fragment_center.mat"))) +data.2nucl.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin2bp_fragment_center.mat"))) +data.2nucl.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nucl_bin10bp_fragment_center.mat"))) # mono-nucleosomes from di-nucleosome data -data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_fragment.mat"))) -data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_fragment.mat"))) -data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_fragment.mat"))) +data.nucls.1.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_fragment.mat"))) +data.nucls.2.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_fragment.mat"))) +data.nucls.10.frag = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_fragment.mat"))) -data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_read.mat"))) -data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_read.mat"))) -data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_read.mat"))) +data.nucls.1.read 
= as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_read.mat"))) +data.nucls.2.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_read.mat"))) +data.nucls.10.read = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_read.mat"))) -data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_fragment_center.mat"))) -data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_fragment_center.mat"))) -data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_fragment_center.mat"))) +data.nucls.1.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_fragment_center.mat"))) +data.nucls.2.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_fragment_center.mat"))) +data.nucls.10.cent = as.matrix(read.table(file.path("data", "10xgenomics_PBMC_5k_motifs", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_fragment_center.mat"))) # colors col = brewer.pal(4, "Set1") # x-axis axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5) axis.lab.1 = seq(-400, 400, by=200) axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5) axis.lab.2 = seq(-400, 400, by=200) axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5) axis.lab.10 = seq(-1000, 1000, by=500) # X11(width=12, height=12) png(filename=file.path("results/10xgenomics_PBMC_5k/sp1_motifs_10e-7_aggregations.png"), units="in", res=720, width=12, height=9) m = matrix(nrow=4, ncol=4, data=c(16,13,14,15, 10, 1, 4, 7, 11, 2, 5, 8, 12, 3, 6, 9), byrow=T) l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) 
layout.show(l) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.1.frag)), max(colMeans(data.open.1.frag)), max(colMeans(data.1nucl.1.frag)), max(colMeans(data.2nucl.1.frag)), max(colMeans(data.nucls.1.frag)))) plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.open.1.frag), col=col[1], lwd=3) lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.1.read)), max(colMeans(data.open.1.read)), max(colMeans(data.1nucl.1.read)), max(colMeans(data.2nucl.1.read)), max(colMeans(data.nucls.1.read)))) plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3) lines(colMeans(data.nucls.1.read), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.2.frag)), max(colMeans(data.open.2.frag)), max(colMeans(data.1nucl.2.frag)), max(colMeans(data.2nucl.2.frag)), 
max(colMeans(data.nucls.2.frag)))) plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3) lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.2.read)), max(colMeans(data.open.2.read)), max(colMeans(data.1nucl.2.read)), max(colMeans(data.2nucl.2.read)), max(colMeans(data.nucls.2.read)))) plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3) lines(colMeans(data.nucls.2.read), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 10bp resolution ## entire fragments ylim = c(0,max(max(colMeans(data.open.10.frag)), max(colMeans(data.open.10.frag)), max(colMeans(data.1nucl.10.frag)), max(colMeans(data.2nucl.10.frag)), max(colMeans(data.nucls.10.frag)))) plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3) 
lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## entire reads ylim = c(0,max(max(colMeans(data.open.10.read)), max(colMeans(data.open.10.read)), max(colMeans(data.1nucl.10.read)), max(colMeans(data.2nucl.10.read)), max(colMeans(data.nucls.10.read)))) plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3) lines(colMeans(data.nucls.10.read), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) ## atac reads and centers plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)), col=col[1], lwd=3, type='l', xaxt='n', main="", xlab="pos[bp]", ylab="Prop max signal", cex.axis=2, cex.lab=2) lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)), col=col[2], lwd=3) lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)), col=col[3], lwd=3) lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)), col=col[4], lwd=3) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) # some legends over the rows and columns p = par(mar=c(0,0,0,0)) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="FRAGMENTS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="READS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-400bp by 1bp", cex=2) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-400bp by 2bp", cex=2) plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n') text(0, 0, labels="+/-1kp by 10bp", cex=2) par(p) dev.off() # x-axis axis.lab.1 = 
seq(-200, 200, by=100) axis.at.1 = seq(0, 400, length.out=length(axis.lab.1)) axis.lab.2 = seq(-200, 200, by=100) axis.at.2 = seq(0, 200, length.out=length(axis.lab.2)) axis.lab.10 = seq(-200, 200, by=100) axis.at.10 = seq(0, 41, length.out=length(axis.lab.10)) # X11(width=10, height=12) png(filename=file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_footprint.png"), units="in", res=720, width=10, height=12) p = par(mfrow=c(3,1), mar=c(5.1, 5.1, 4.1, 2.1)) # 1bp resolution index = 200:600 x = 1:length(index) plot(x, colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])), type='l', lwd=3, col=col[1], main="SP1 motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])), lwd=3, col=col[4]) abline(v=191, lwd=3, lty=2) abline(v=211, lwd=3, lty=2) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 2bp resolution index = 100:300 x = 1:length(index) plot(x, colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])), type='l', lwd=3, col=col[1], main="SP1 motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])), lwd=3, col=col[4]) abline(v=96, lwd=3, lty=2) abline(v=106, lwd=3, lty=2) axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8) # 10bp resolution index = 80:120 x = 1:length(index) plot(x, colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])), type='l', lwd=3, col=col[1], main="SP1 motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n', cex.axis=2, cex.lab=2, cex.main=2) lines(x, 
colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])), lwd=3, col=col[2]) lines(x, colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])), lwd=3, col=col[4]) abline(v=20, lwd=3, lty=2) abline(v=22, lwd=3, lty=2) axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8) par(p) dev.off() diff --git a/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.sh index 7c2012e..8904a67 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs/analysis_sp1_motif.sh @@ -1,90 +1,96 @@ # some paths ## directories -results_dir='results/10xgenomics_PBMC_5k' -data_dir='data' -read_dir="$data_dir/10xgenomics_PBMC_5k" -seq_dir="$data_dir/genomes" +results_dir='data/10xgenomics_PBMC_5k_motifs' +read_dir="data/10xgenomics_PBMC_5k" +seq_dir="data/genomes" ## input1 file_bed=$read_dir'/sp1_motifs_10e-7.bed' +file_bed_rmsk=$read_dir'/sp1_motifs_10e-7_rmsk.bed' file_bam_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam" file_bai_open="$read_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai" file_bam_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam" file_bai_1nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai" file_bam_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam" file_bai_2nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai" file_bam_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam" file_bai_1nucl2="$read_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai" file_bam_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam" file_bai_nucl="$read_dir/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai" file_hg19="$seq_dir/hg19.fasta" +file_rmsk="$seq_dir/hg19_rmsk.bed" mkdir -p $results_dir +# filter out peaks with >=30% repeated region inside +bin/bedtools/bedtools subtract -A -f 
0.3 -a $file_bed -b $file_rmsk > $file_bed_rmsk + # matrix creation -## sequences +## sequences and sequenced repeat masked file_mat_seq="$results_dir/sp1_motifs_10e-7_sequences.mat" -bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq +file_mat_seq_rmsk="$results_dir/sp1_motifs_10e-7_sequences_rmsk.mat" +bin/SequenceMatrixCreator --bed $file_bed --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq +bin/SequenceMatrixCreator --bed $file_bed_rmsk --fasta $file_hg19 --from -400 --to 400 > $file_mat_seq_rmsk ## open chromatin around sp1 motif for method in 'read' 'read_atac' 'fragment' do file_mat_open_1="$results_dir/sp1_motifs_10e-7_open_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1 file_mat_open_2="$results_dir/sp1_motifs_10e-7_open_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2 file_mat_open_10="$results_dir/sp1_motifs_10e-7_open_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10 done ## mono around sp1 motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/sp1_motifs_10e-7_1nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/sp1_motifs_10e-7_1nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/sp1_motifs_10e-7_1nucl_bin10bp_$method.mat" 
bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## di nucleosomes around sp1 motif for method in 'read' 'fragment' 'fragment_center' do ### di nucleosomes file_mat_2nucl_1="$results_dir/sp1_motifs_10e-7_2nucl_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1 file_mat_2nucl_2="$results_dir/sp1_motifs_10e-7_2nucl_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2 file_mat_2nucl_10="$results_dir/sp1_motifs_10e-7_2nucl_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10 done ## mono nucleosomes from processed di-nucleosome data around sp1 motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes file_mat_1nucl_1="$results_dir/sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1 file_mat_1nucl_2="$results_dir/sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2 file_mat_1nucl_10="$results_dir/sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10 done ## all nucleosomes around sp1 motif for method in 'read' 'fragment' 'fragment_center' do ### mono nucleosomes 
file_mat_nucl_1="$results_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_nucl_1 file_mat_nucl_2="$results_dir/sp1_motifs_10e-7_nucleosomes_bin2bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_nucl_2 file_mat_nucl_10="$results_dir/sp1_motifs_10e-7_nucleosomes_bin10bp_$method.mat" bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_nucl_10 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ctcf_motif.R new file mode 100644 index 0000000..17992c1 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ctcf_motif.R @@ -0,0 +1,171 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the minimum number of classes searched +k.min = 1 +# the maximum number of classes searched +k.max = 6 + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## open chromatin patterns around ctcf motifs with flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + 
sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + 
lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} + + +################## open chromatin patterns around ctcf motifs without flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ctcf_motifs_10e-6_classification_open_bin1bp_%dclass_noflip.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 
2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ctcf_motif.sh new file mode 100755 index 0000000..0306079 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ctcf_motif.sh @@ -0,0 +1,49 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_0' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" +file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" +file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" + +## file with seeds +file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='20' +n_shift='1' +n_core=6 + +# open chromatin with flip 
+for k in 1 2 3 4 5 6  # one EM classification run per number of classes
+do
+    seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)  # random seed, length $1 (default 15); must be set: logged and passed to --seed below
+    file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
+    file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat'
+    file_mod2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_nucleosomes_fragment_center_model.mat'
+    file_mod3=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat'
+    echo "$file_prob $seed" >> $file_seed  # record which seed produced which probability file
+    bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob
+    bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
+    bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
+    bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
+done
+
+# open chromatin without flip
+for k in 1 2 3 4 5 6  # same runs as above, but EMRead is called without --flip
+do
+    seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)  # random seed, length $1 (default 15)
+    file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_prob.mat4d'
+    file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_model.mat'
+    file_mod2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_nucleosomes_fragment_center_model.mat'
+    file_mod3=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_sequences_model.mat'
+    echo "$file_prob $seed" >> $file_seed  # record which seed produced which probability file
+    bin/EMRead --read $file_mat_open --class $k --shift $n_shift --iter $n_iter --seed $seed --thread $n_core --out $file_prob
+    bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1
+    bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2
+    bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3
+done
diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ebf1_motif.R
b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ebf1_motif.R new file mode 100644 index 0000000..c15695e --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ebf1_motif.R @@ -0,0 +1,171 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the minimum number of classes searched +k.min = 1 +# the maximum number of classes searched +k.max = 6 + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## open chromatin patterns around ebf1 motifs with flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = 
model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} + + +################## open chromatin patterns around ebf1 motifs without flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + 
sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("ebf1_motifs_10e-6_classification_open_bin1bp_%dclass_noflip.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) 
+ lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ebf1_motif.sh new file mode 100755 index 0000000..2dbcbda --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_ebf1_motif.sh @@ -0,0 +1,49 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_0' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat" +file_mat_nucl="$data_dir/ebf1_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" +file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat" + +## file with seeds +file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='20' +n_shift='1' +n_core=6 + +# open chromatin with flip +for k in 1 2 3 4 5 6 +do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' + file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_nucleosomes_fragment_center_model.mat' + file_mod3=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob 
--thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done + +# open chromatin without flip +for k in 1 2 3 4 5 6 +do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_prob.mat4d' + file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_model.mat' + file_mod2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_nucleosomes_fragment_center_model.mat' + file_mod3=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_sequences_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_myc_motif.R new file mode 100644 index 0000000..a4d12ad --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_myc_motif.R @@ -0,0 +1,171 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the minimum number of classes searched +k.min = 1 +# the maximum number of classes searched +k.max = 6 + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## open chromatin patterns around myc motifs with flip ################## + +for(k in k.min:k.max) +{ + # open 
chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_classification_open_bin1bp_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 
0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} + + +################## open chromatin patterns around myc motifs without flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_open_bin1bp_read_atac_%dclass_noflip_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("myc_motifs_10e-6_classification_open_bin1bp_%dclass_noflip.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, 
drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_myc_motif.sh new file mode 100755 index 0000000..062819e --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_myc_motif.sh @@ -0,0 +1,49 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_0' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat" 
+file_mat_nucl="$data_dir/myc_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" +file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat" + +## file with seeds +file_seed=$results_dir'/myc_motifs_10e-6_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='20' +n_shift='1' +n_core=6 + +# open chromatin with flip +for k in 1 2 3 4 5 6 +do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d' + file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_nucleosomes_fragment_center_model.mat' + file_mod3=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_sequences_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done + +# open chromatin without flip +for k in 1 2 3 4 5 6 +do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_prob.mat4d' + file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_model.mat' + file_mod2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_nucleosomes_fragment_center_model.mat' + file_mod3=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_noflip_sequences_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read 
$file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_sp1_motif.R new file mode 100644 index 0000000..272cf1f --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_sp1_motif.R @@ -0,0 +1,171 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# the minimum number of classes searched +k.min = 1 +# the maximum number of classes searched +k.max = 6 + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## open chromatin patterns around sp1 motifs with flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + 
sprintf("sp1_motifs_10e-7_classification_open_bin1bp_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} + + +################## open chromatin patterns around sp1 motifs without flip ################## + +for(k in k.min:k.max) +{ + # open chromatin + data = read.read.models(file.path("results", 
"10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_noflip_model.mat", k))) + model.open = data$models + model.prob = data$prob + data = NULL + # nucleosomes + model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_noflip_nucleosomes_fragment_center_model.mat", k)))$models + # sequence + model.seq = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_open_bin1bp_read_atac_%dclass_noflip_sequences_model.mat", k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_0", + sprintf("sp1_motifs_10e-7_classification_open_bin1bp_%dclass_noflip.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F] + ref.nucl = model.nucl[ord,, drop=F] + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-ncol(ref.open), ncol(ref.open), length.out=3) + x.at = (x.lab + ncol(ref.open)) / 2 + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + row_n = 1 # row counter + col_n = 1 # column counter + for(i in 1:nrow(ref.open)) + { # plot logo center + right = 0.5*col_n - 0.01 + left = right - 0.2 + 
bottom = 1-(row_n*(0.2))+0.05 + top = bottom + 0.15 + par(fig=c(left, right, bottom, top), new=T) + idx = 380:420 + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + x.at = 1:length(idx) + axis(1, at=x.at, labels=x.at) + # yaxis + x.at = seq(0, 2, by=1) + axis(2, at=x.at, labels=x.at) + row_n = row_n + 1 + if(i %% 5 == 0) + { col_n = col_n + 1 + row_n = 1 + } + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_sp1_motif.sh new file mode 100755 index 0000000..7c74512 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/classification_sp1_motif.sh @@ -0,0 +1,49 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_0' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat" +file_mat_nucl="$data_dir/sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center.mat" +file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" + +## file with seeds +file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='20' +n_shift='1' +n_core=6 + +# open chromatin with flip +for k in 1 2 3 4 5 6 +do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_prob.mat4d' + file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_model.mat' + file_mod2=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_nucleosomes_fragment_center_model.mat' + file_mod3=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_sequences_model.mat' + echo "$file_prob 
$seed" >> $file_seed + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done + +# open chromatin without flip +for k in 1 2 3 4 5 6 +do + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_noflip_prob.mat4d' + file_mod1=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_noflip_model.mat' + file_mod2=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_noflip_nucleosomes_fragment_center_model.mat' + file_mod3=$results_dir/'sp1_motifs_10e-7_open_bin1bp_read_atac_'$k'class_noflip_sequences_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_open --class $k --shift $n_shift --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_0/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/run_all.sh new file mode 100755 index 0000000..11b7a41 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_0/run_all.sh @@ -0,0 +1,14 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_1' + +# classification +$dir/classification_ctcf_motif.sh +$dir/classification_myc_motif.sh +$dir/classification_ebf1_motif.sh +$dir/classification_sp1_motif.sh + +# analysis of classification results +Rscript $dir/classification_ctcf_motif.R +Rscript 
$dir/classification_myc_motif.R +Rscript $dir/classification_ebf1_motif.R +Rscript $dir/classification_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh index 013fa9f..7ae6bef 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ctcf_motif.sh @@ -1,36 +1,36 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=28 +n_core=32 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq 
$file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh index e938ca5..870ae2d 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_ebf1_motif.sh @@ -1,36 +1,36 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ebf1_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=28 +n_core=32 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ebf1_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread 
$n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh index 345c0cd..408aa18 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_2/classification_myc_motif.sh @@ -1,36 +1,36 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_motifs_classification_2' data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/myc_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/myc_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=28 +n_core=32 # open chromatin and nucleosomes for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d' file_mod1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'myc_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git 
a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh index bcaa4f4..8be1be7 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_3/classification_ctcf_motif.sh @@ -1,38 +1,38 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_motifs_classification_3' data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='21' -n_core=28 +n_core=32 # sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --read 
$file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh index f9a59cf..920f9d3 100755 --- a/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_4/classification_ctcf_motif.sh @@ -1,38 +1,38 @@ # some paths ## directories results_dir='results/10xgenomics_PBMC_5k_motifs_classification_4' data_dir='data/10xgenomics_PBMC_5k_motifs' ## input file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat" file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat" file_mat_nucl="$data_dir/ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center.mat" file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" ## file with seeds file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' mkdir -p $results_dir touch $file_seed # parameters n_iter='20' n_shift='1' -n_core=28 +n_core=32 # sequences for k in 1 2 3 4 5 6 7 8 9 10 do seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' file_mod1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'ctcf_motifs_10e-6_nucleosomes_bin1bp_fragment_center_'$k'class_model.mat' file_mod4=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt' echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob 
$file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_1nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod3 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod4 done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_ctcf_motif.R new file mode 100644 index 0000000..23f7561 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_ctcf_motif.R @@ -0,0 +1,100 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(seqLogo) + +# functions +source(file.path("scripts", "functions.R")) + +# the minimum number of classes searched +k.min = 1 +# the maximum number of classes searched +k.max = 10 + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around ctcf motifs ################## + +for(k in k.min:k.max) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("ctcf_motifs_10e-6_sequences_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + + data = NULL + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("ctcf_motifs_10e-6_classification_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:dim(ref.seq)[3])[ord] + for(i in 1:(dim(ref.seq)[3])) + { # plot 
logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-floor(ncol(ref.seq)/2), floor(ncol(ref.seq)/2), length.out=3) + x.at = seq(1, ncol(ref.seq), length.out=3) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + } + dev.off() +} + + +################## sequence patterns around ctcf motifs repeat masked ################## + +for(k in k.min:k.max) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("ctcf_motifs_10e-6_sequences_rmsk_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + + data = NULL + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("ctcf_motifs_10e-6_classification_sequences_rmsk_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:dim(ref.seq)[3])[ord] + for(i in 1:(dim(ref.seq)[3])) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-floor(ncol(ref.seq)/2), floor(ncol(ref.seq)/2), length.out=3) + x.at = seq(1, ncol(ref.seq), length.out=3) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_ctcf_motif.sh new file mode 100755 index 0000000..41efee5 --- /dev/null +++ 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_ctcf_motif.sh @@ -0,0 +1,40 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_5' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_seq="$data_dir/ctcf_motifs_10e-6_sequences.mat" +file_mat_seq_rmsk="$data_dir/ctcf_motifs_10e-6_sequences_rmsk.mat" + +## file with seeds +file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt' +file_seed_rmsk=$results_dir'/ctcf_motifs_10e-6_seed_rmsk.txt' + +mkdir -p $results_dir +touch $file_seed +touch $file_seed_rmsk + +# parameters +n_iter='20' +n_shift='771' +n_core=32 + +# sequences +for k in 1 2 3 4 5 6 7 8 9 10 +do + # all sequences + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_'$k'class_prob.mat4d' + file_mod=$results_dir/'ctcf_motifs_10e-6_sequences_'$k'class_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod + + # repeat masked sequences + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_sequences_rmsk_'$k'class_prob.mat4d' + file_mod=$results_dir/'ctcf_motifs_10e-6_sequences_rmsk_'$k'class_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq_rmsk --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --seq $file_mat_seq_rmsk --prob $file_prob --thread $n_core 1> $file_mod +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_sp1_motif.R new file mode 100644 index 0000000..d405144 --- /dev/null +++ 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_sp1_motif.R @@ -0,0 +1,100 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(seqLogo) + +# functions +source(file.path("scripts", "functions.R")) + +# the minimum number of classes searched +k.min = 1 +# the maximum number of classes searched +k.max = 10 + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +################## sequence patterns around sp1 motifs ################## + +for(k in k.min:k.max) +{ + # sequence + data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("sp1_motifs_10e-7_sequences_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + + data = NULL + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("sp1_motifs_10e-7_classification_sequences_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:dim(ref.seq)[3])[ord] + for(i in 1:(dim(ref.seq)[3])) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-floor(ncol(ref.seq)/2), floor(ncol(ref.seq)/2), length.out=3) + x.at = seq(1, ncol(ref.seq), length.out=3) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + } + dev.off() +} + + +################## sequence patterns around sp1 motifs repeat masked ################## + +for(k in k.min:k.max) +{ + # sequence + data = 
read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("sp1_motifs_10e-7_sequences_rmsk_%dclass_model.mat", k))) + model.seq = data$models + model.prob = data$prob + + data = NULL + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=17, height=10) + png(filename=file.path("results", "10xgenomics_PBMC_5k_motifs_classification_5", + sprintf("sp1_motifs_10e-7_classification_sequences_rmsk_%dclass.png", k)), + units="in", res=720, width=18, height=12) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.seq = model.seq[,,ord, drop=F] + prob = model.prob[ord] + class = c(1:dim(ref.seq)[3])[ord] + for(i in 1:(dim(ref.seq)[3])) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = seq(-floor(ncol(ref.seq)/2), floor(ncol(ref.seq)/2), length.out=3) + x.at = seq(1, ncol(ref.seq), length.out=3) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + x.at = seq(0, 1, 0.5) + axis(2, at=x.at, labels=x.at) + } + dev.off() +} diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_sp1_motif.sh new file mode 100755 index 0000000..8674f3f --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/classification_sp1_motif.sh @@ -0,0 +1,40 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_5' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_seq="$data_dir/sp1_motifs_10e-7_sequences.mat" +file_mat_seq_rmsk="$data_dir/sp1_motifs_10e-7_sequences_rmsk.mat" + +## file with seeds +file_seed=$results_dir'/sp1_motifs_10e-7_seed.txt' +file_seed_rmsk=$results_dir'/sp1_motifs_10e-7_seed_rmsk.txt' + +mkdir -p $results_dir +touch $file_seed +touch $file_seed_rmsk 
+ +# parameters +n_iter='20' +n_shift='771' +n_core=32 + +# sequences +for k in 1 2 3 4 5 6 7 8 9 10 +do + # all sequences + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_'$k'class_prob.mat4d' + file_mod=$results_dir/'sp1_motifs_10e-7_sequences_'$k'class_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod + + # repeat masked sequences + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'sp1_motifs_10e-7_open_bin1bp_sequences_rmsk_'$k'class_prob.mat4d' + file_mod=$results_dir/'sp1_motifs_10e-7_sequences_rmsk_'$k'class_model.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq_rmsk --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --seq $file_mat_seq_rmsk --prob $file_prob --thread $n_core 1> $file_mod +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_5/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/run_all.sh new file mode 100755 index 0000000..4b406e0 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_5/run_all.sh @@ -0,0 +1,10 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_4' + +# classification +$dir/classification_ctcf_motif.sh +$dir/classification_sp1_motif.sh + +# analysis of classification results +Rscript $dir/classification_ctcf_motif.R +Rscript $dir/classification_sp1_motif.R diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_6/analyse_motifs.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/analyse_motifs.R new file mode 100644 index 0000000..e8a6f3f --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/analyse_motifs.R 
@@ -0,0 +1,140 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(clues) + +# functions +source(file.path("scripts", "functions.R")) + +#' Performs a hard assigment. +#' Each region is assigned to the +#' class for which it has been classified +#' with the highest probability, over +#' all shift and flip states. +#' \param prob a 4D array containing +#' the posterior probabilities. It has +#' the following dimensions : +#' 1st number of regions +#' 2nd number of classes +#' 3rd number of shifts +#' 4th number of flips +#' \return a vector of labels, 1 per +#' region. +#' \author Romain Groux +hard.assign = function(prob) +{ + prob.per.region = apply(prob, c(1,2), sum) + cluster = apply(prob.per.region, 1, which.max) + return(cluster) +} + +#' Given a set of n labels corresponding to the cluster/class +#' assignment of n data point, this function creates a +#' co-occurence matrix where the element [i,j] indicates whether +#' the i-th and the j-th data points are in the same cluster +#' (=1) or not (=0). +#' @param clusters a vector of numerical indicating the cluster assignment +#' for a set of data points. +#' @return the co-occurence matrix. Only the lower triangle is filled. +#' 0 means that both points are not assigned the same label, whereas +#' 1 means that they are. +#' @author Romain Groux +construct.cooccurence.matrix = function(clusters) +{ + n = length(clusters) + # square matrix + matrix.cooc = matrix(data=0, nrow=n, ncol=n) + # only fill the lower triangle of the matrix + for(i in 1:n) + { j = 1 + while(j <= i) + { if( clusters[i] == clusters[j]) + { matrix.cooc[i,j] = 1 } + j = j + 1 + } + } + return(matrix.cooc) +} + +#' Computes the Hubert Gamma statistic of a clustering given two vectors of +#' cluster labels. The elements in one vector are expected to +#' correspond to the same in the other vector. 
+#' @param labels.true a vector of size containing cluster +#' labels corresponding to data cluster assignments. +#' @param labels.cand a vector of size containing cluster +#' labels corresponding to data cluster assignments. +#' @return the Hubert Gamma statistic. +#' @seealso construct.cooccurence.matrix() +#' @author Romain Groux +gamma.stat = function(labels.true, labels.cand) +{ labels.true.m = construct.cooccurence.matrix(labels.true) + labels.cand.m = construct.cooccurence.matrix(labels.cand) + return(cor(as.vector(labels.true.m), as.vector(labels.cand.m))) +} + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +# the true labels 5000 CTCF sites and 5000 SP1 sites +true.labels = c(rep(1,5000), + rep(2,5000)) + +# the expected dimensionality of the prob array to read +# -1 indicate values that will change (class and shift) +dim = c(10000, -1, -1, 2) + +# the parameters used to run the EM +n.shifts = c(1, 771) +n.classes = 2:6 + +#number of time a classification was repeated +n.repeat = 10 + +# where the results are +dir.results = file.path("results", + "10xgenomics_PBMC_5k_motifs_classification_6") + +# ari values +ari = array(dim=c(length(n.classes), + length(n.shifts), + n.repeat)) + +for(i in 1:length(n.shifts)) +{ + n.shift = n.shifts[i] + + for(j in 1:length(n.classes)) + { + n.class = n.classes[j] + + # update dimensions + dim[2] = n.class + dim[3] = n.shift + + # go over each repetition + for(k in 1:n.repeat) + { print(sprintf("%d shift %d class %d repead", n.shift, n.class, k)) + file.prob = file.path(dir.results, + sprintf("ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_%dclass_%dshift_prob_%d.txt", + n.class, n.shift, k)) + file.motif = file.path(dir.results, + sprintf("ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_%dclass_%dshift_model_%d.mat", + n.class, n.shift, k)) + + prob = read.arraytxt(file.prob, dim) + 
print(apply(prob, 2, sum)/sum(prob)) + cluster = hard.assign(prob) + ari[i,j,k] = adjustedRand(cluster, true.labels)["HA"] + + # motif = read.sequence.models(file.motif)$models + # plot.logo(motif[,390:410,1], path.a, path.c, path.g, path.t) + # plot.logo(motif[,390:410,2], path.a, path.c, path.g, path.t) + } + } +} + + diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_6/analyse_reads.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/analyse_reads.R new file mode 100644 index 0000000..e69de29 diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_6/classification_motifs.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/classification_motifs.sh new file mode 100755 index 0000000..f6fe2fb --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/classification_motifs.sh @@ -0,0 +1,45 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_6' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_seq="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk.mat" + +## file with seeds +file_seed=$results_dir'/ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='200' +n_shift1='1' +n_shift2='771' +n_core=32 + +# sequences +for i in {1..10} +do + for k in 2 3 4 5 6 + do + # without shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.txt' + file_mod=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift1 --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel 
--seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + + # with shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7__sequences_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.txt' + file_mod=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift2 --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + done +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_6/classification_reads.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/classification_reads.sh new file mode 100755 index 0000000..fe864dc --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/classification_reads.sh @@ -0,0 +1,55 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_6' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_seq="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk.mat" +file_mat_open="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk.mat" +file_mat_nucl="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_rmsk.mat" + +## file with seeds +file_seed=$results_dir'/ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='20' +n_shift1='1' +n_shift2='21' +n_core=32 + +# open chromatin +for i in {1..10} +do + for k in 2 3 4 5 6 + do + # 
without shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.txt' + file_mod1=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + file_mod2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + file_mod3=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_seq --class $k --shift $n_shift1 --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + + # with shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.txt' + file_mod1=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + file_mod2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + file_mod3=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + 
echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_seq --class $k --shift $n_shift2 --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + done +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_6/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/run_all.sh new file mode 100755 index 0000000..25f35b5 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_6/run_all.sh @@ -0,0 +1,10 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_6' + +# classification +$dir/classification_motifs.sh +$dir/classification_reads.sh + +# analysis of classification results +Rscript $dir/analysis_motifs.R +Rscript $dir/analysis_reads.R diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_7/analyse_motifs.R b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/analyse_motifs.R new file mode 100644 index 0000000..38fa96f --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/analyse_motifs.R @@ -0,0 +1,98 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(clues) + +# functions +source(file.path("scripts", "functions.R")) + +#' Performs a hard assigment. +#' Each region is assigned to the +#' class for which it has been classified +#' with the highest probability, over +#' all shift and flip states. +#' \param prob a 4D array containing +#' the posterior probabilities. It has +#' the following dimensions : +#' 1st number of regions +#' 2nd number of classes +#' 3rd number of shifts +#' 4th number of flips +#' \return a vector of labels, 1 per +#' region. 
+#' \author Romain Groux +hard.assign = function(prob) +{ + prob.per.region = apply(prob, c(1,2), sum) + cluster = apply(prob.per.region, 1, which.max) + return(cluster) +} + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +# the true labels 5000 CTCF sites and 5000 SP1 sites +true.labels = c(rep(1,1000), + rep(2,1000)) + +# the expected dimensionality of the prob array to read +# -1 indicate values that will change (class and shift) +dim = c(2000, -1, -1, 2) + +# the parameters used to run the EM +n.shifts = c(90) +n.classes = 1:2 + +#number of time a classification was repeated +n.repeat = 50 + +# where the results are +dir.results = file.path("results", + "10xgenomics_PBMC_5k_motifs_classification_7") + +# ari values +ari = array(dim=c(length(n.classes), + length(n.shifts), + n.repeat)) + +for(i in 1:length(n.classes)) +{ + n.class = n.classes[i] + + for(j in 1:length(n.shifts)) + { + n.shift = n.shifts[j] + + # update dimensions + dim[2] = n.class + dim[3] = n.shift + + + # go over each repetition + for(k in 1:n.repeat) + { file.prob = file.path(dir.results, + sprintf("ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_%dclass_%dshift_prob_%d.txt", + n.class, n.shift, k)) + file.motif = file.path(dir.results, + sprintf("ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_%dclass_%dshift_model_%d.mat", + n.class, n.shift, k)) + + # prob = read.arraytxt(file.prob, dim) + # print(apply(prob, 2, sum)/sum(prob)) + # cluster = hard.assign(prob) + # ari[i,j,k] = adjustedRand(cluster, true.labels)["HA"] + + X11(width=10, height=12) + par(mfrow=c(2,1)) + motif = read.sequence.models(file.motif)$models + plot.logo(motif[,,1], path.a, path.c, path.g, path.t) + plot.logo(motif[,,2], path.a, path.c, path.g, path.t) + } + } +} + + diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_7/classification_motifs.sh 
b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/classification_motifs.sh new file mode 100755 index 0000000..a0e5c69 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/classification_motifs.sh @@ -0,0 +1,54 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_7' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_seq="$data_dir/ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk.mat" + +## file with seeds +file_seed=$results_dir'/ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='200' +n_shift1='1' +n_shift2='771' +n_core=32 + +# sequences +for i in {1..10} +do + for k in 2 3 4 5 6 + do + # without shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.txt' + file_mod=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift1 --flip --iter $n_iter --seed $seed --thread 10 --out $file_prob + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread 10 1> $file_mod + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + + # with shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6__sequences_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.txt' + file_mod=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + echo "$file_prob $seed" >> 
$file_seed + bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift2 --flip --iter $n_iter --seed $seed --thread 32 --out $file_prob + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread 32 1> $file_mod + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + done +done + + +# with shift +file_prob=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6__sequences_rmsk_10class_771shift_prob_kmer.mat4d' +file_prob2=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_10class_771shift_prob_kmer.txt' +file_mod=$results_dir/'ctcf_motifs_10e-6_myc_motifs_10e-6_sequences_rmsk_10class_771shift_model_kmer.mat' +bin/EMSequence --seq $file_mat_seq --class 10 --shift 771 --flip --iter 100 --thread 32 --out $file_prob +bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread 32 1> $file_mod +bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_7/classification_reads.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/classification_reads.sh new file mode 100755 index 0000000..fe864dc --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/classification_reads.sh @@ -0,0 +1,55 @@ +# some paths +## directories +results_dir='results/10xgenomics_PBMC_5k_motifs_classification_6' +data_dir='data/10xgenomics_PBMC_5k_motifs' +## input +file_mat_seq="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk.mat" +file_mat_open="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk.mat" +file_mat_nucl="$data_dir/ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_rmsk.mat" + +## file with seeds +file_seed=$results_dir'/ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_seed.txt' + +mkdir -p $results_dir +touch $file_seed + +# parameters +n_iter='20' +n_shift1='1' +n_shift2='21' +n_core=32 + +# open chromatin +for i in {1..10} +do + for k in 2 3 4 5 6 + do + # without shift + seed=$(< 
/dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift1'shift_prob_'$i'.txt' + file_mod1=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + file_mod2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + file_mod3=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift1'shift_model_'$i'.mat' + echo "$file_prob $seed" >> $file_seed + bin/EMRead --read $file_mat_seq --class $k --shift $n_shift1 --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + + # with shift + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.mat4d' + file_prob2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift2'shift_prob_'$i'.txt' + file_mod1=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_open_bin1bp_read_atac_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + file_mod2=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_nucleosomes_bin1bp_fragment_center_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + file_mod3=$results_dir/'ctcf_motifs_10e-6_sp1_motifs_10e-7_sequences_rmsk_'$k'class_'$n_shift2'shift_model_'$i'.mat' + echo "$file_prob $seed" >> 
$file_seed + bin/EMRead --read $file_mat_seq --class $k --shift $n_shift2 --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 + bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 + done +done diff --git a/scripts/10xgenomics_PBMC_5k_motifs_classification_7/run_all.sh b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/run_all.sh new file mode 100755 index 0000000..0967903 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_motifs_classification_7/run_all.sh @@ -0,0 +1,10 @@ + +dir='scripts/10xgenomics_PBMC_5k_motifs_classification_7' + +# classification +$dir/classification_motifs.sh +$dir/classification_reads.sh + +# analysis of classification results +Rscript $dir/analysis_motifs.R +Rscript $dir/analysis_reads.R diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh index 17b2445..788e02a 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_0/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_0" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='971' -n_core=8 +n_core=32 # classify for k in 10 20 30 do ## 
results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh index a32bc7e..d317884 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_1/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_1" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' -n_core=24 +n_core=32 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' 
file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --bgclass --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core --bgclass 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core --bgclass 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core --bgclass 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh index 9a59ce4..66660d2 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_2/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_2" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' -n_core=24 +n_core=32 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' 
file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh index 30bbc56..56e6072 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_3/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_3" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='981' -n_core=24 +n_core=32 # classify for k in 10 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_openchromatin-sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' 
file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMJoint --read $file_mat_open --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh index 383afb1..0283689 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_4/classification_peaks_sampled.sh @@ -1,55 +1,55 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" pwm_dir="data/pwm/jaspar_2018_clustering/" results_dir="results/10xgenomics_PBMC_5k_peaks_classification_4" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='971' -n_core=24 +n_core=32 ## PWM files jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" 
lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" e2f2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" # classify for k in 17 20 30 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$stat6,$nfe2,$ahr,$e2f2,$ctcf --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh index d44282f..ba851a6 100755 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_5/classification_peaks_sampled.sh @@ -1,35 +1,35 @@ # paths ## dir data_dir="data/10xgenomics_PBMC_5k_peaks" 
results_dir="results/10xgenomics_PBMC_5k_peaks_classification_5" ## matrix files file_mat_open=$data_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac.mat' file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center.mat' file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' ## file with seeds file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' mkdir -p $results_dir touch $file_seed # EM param n_iter='100' n_shift='991' -n_core=24 +n_core=32 # classify for k in 20 30 40 do ## results files file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_prob.mat4d' file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_read_atac_'$k'class_model.mat' file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_fragment_center_'$k'class_model.mat' file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_'$k'class_model.mat' seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) echo "$file_prob $seed" >> $file_seed bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 done diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R index 43375ee..4edf115 100644 --- a/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_7/classification_peaks.R @@ -1,210 +1,216 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # libraries library(RColorBrewer) # functions source(file.path("scripts", "functions.R")) # the number of classes searched n.classes = c(23) -class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", 
"IRF2", "LHX3", "FOXH1", - "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF", - "NR4A1", "EGR", "GATA", "NFAT", "RUNX") +# class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1", +# "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF", +# "NR4A1", "EGR", "GATA", "NFAT", "RUNX") + +class.tf = c("AP1", "HIF1a", "myc", "PU.1", "CEBP", "IRF4", "IRF2", "LHX3", "FOXH1", + "SOX", "MEF2", "ELF", "STAT6", "NFE2", "AHR", "E2F", "CTCF", "KLF", + "NR4A1", "EGR", "GATA", "NFAT", "RUNX") # path to the images for the logo path.a = file.path("res/A.png") path.c = file.path("res/C.png") path.g = file.path("res/G.png") path.t = file.path("res/T.png") ################## plot architecture around TF motifs ################## for(k in n.classes) { # sequence data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k))) model.seq = data$models model.prob = data$prob data = NULL # open chromatin model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models # plot classes col = brewer.pal(3, "Set1") # X11(width=24, height=12) png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), - units="in", res=720, width=18, height=12) - m = matrix(1:24, nrow=6, ncol=4, byrow=F) + units="in", res=720, width=18, height=16) + m = matrix(1:24, nrow=8, ncol=3, byrow=F) layout(m) # order from most to least probable class ord = order(model.prob, decreasing=T) ref.open = model.open[ord,, drop=F][,] ref.nucl = model.nucl[ord,, drop=F][,] 
ref.seq = model.seq[,,ord, drop=F][,,] prob = model.prob[ord] class = c(1:nrow(ref.open))[ord] tf = class.tf[ord] for(i in 1:nrow(ref.open)) { # plot logo par(mar=c(2,2,2,0)) plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, main=sprintf("%s (p=%.2f)", tf[i], prob[i])) # x-axis x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) axis(1, at=x.at, labels=x.lab) # y-axis is [0,1] for min/max signal y.at = seq(0, 2, length.out=2) y.lab = c("min", "max") axis(2, at=y.at, labels=y.lab) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) } # inlets with center row_n = 1 # row counter col_n = 1 # column counter row_h = 1/nrow(m) # height of row col_w = 1/ncol(m) # width of column row_cor = row_h / 3 col_cor = col_w / 3 for(i in 1:nrow(ref.open)) { # plot logo center left = (col_w*col_n) - col_w right = left + col_w left = right - col_cor - bottom = 1 - (row_h*row_n) - top = bottom + row_h - bottom = top - row_cor + bottom = 1 - ((row_h*row_n)-(0.2*row_h)) + top = bottom + row_cor + # top = bottom + row_h + # bottom = top - row_cor p= par(fig=c(left, right, bottom, top), mar=c(0,0,0,0), new=T) idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) # plot signal (multiplies by 2 because the y-axis goes to 2 bits) lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) # xaxis # x.at = seq(1, length(idx), length.out = 3) # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] x.at = ceiling(length(idx)/2) x.lab = 0 axis(1, at=x.at, labels=x.lab) # yaxis axis(2, at=y.at, labels=y.lab) row_n = row_n + 1 if(i %% nrow(m) == 0) { col_n = col_n + 1 row_n = 1 } par(p) } dev.off() } ################## 
zoom in the center ################## for(k in n.classes) { idx = 516 + c(-100:+100) # sequence data = read.sequence.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_sequences_1kb_%dclass_model_extended.mat", k))) model.seq = data$models[,idx,] model.prob = data$prob data = NULL # open chromatin model.open = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_openchromatin_1kb_read_atac_%dclass_model_extended.mat", k)))$models[,idx] # nucleosomes model.nucl = read.read.models(file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_nucleosomes_1kb_fragment_center_%dclass_model_extended.mat", k)))$models[,idx] # plot classes col = brewer.pal(3, "Set1") # X11(width=24, height=12) png(filename=file.path("results", "10xgenomics_PBMC_5k_peaks_classification_7", sprintf("peaks_rmsk_sampled_sequences_%dclass_2.png", k)), - units="in", res=720, width=18, height=12) - m = matrix(1:24, nrow=6, ncol=4, byrow=F) - layout(m) - # order from most to least probable class - ord = order(model.prob, decreasing=T) - ref.open = model.open[ord,, drop=F][,] - ref.nucl = model.nucl[ord,, drop=F][,] - ref.seq = model.seq[,,ord, drop=F][,,] - prob = model.prob[ord] - class = c(1:nrow(ref.open))[ord] - tf = class.tf[ord] - for(i in 1:nrow(ref.open)) - { # plot logo - par(mar=c(2,2,2,0)) - plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, - main=sprintf("%s (p=%.2f)", tf[i], prob[i])) - # x-axis - x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) - x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) - axis(1, at=x.at, labels=x.lab) - # y-axis is [0,1] for min/max signal - y.at = seq(0, 2, length.out=2) - y.lab = c("min", "max") - axis(2, at=y.at, labels=y.lab) - # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) - lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), 
lwd=1, col=col[2]) - } + units="in", res=720, width=18, height=16) + m = matrix(1:24, nrow=8, ncol=3, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F][,] + ref.nucl = model.nucl[ord,, drop=F][,] + ref.seq = model.seq[,,ord, drop=F][,,] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + tf = class.tf[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + par(mar=c(2,2,2,0)) + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("%s (p=%.2f)", tf[i], prob[i])) + # x-axis + x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + + # inlets with center + row_n = 1 # row counter + col_n = 1 # column counter + row_h = 1/nrow(m) # height of row + col_w = 1/ncol(m) # width of column + row_cor = row_h / 3 + col_cor = col_w / 3 + for(i in 1:nrow(ref.open)) + { # plot logo center + left = (col_w*col_n) - col_w + right = left + col_w + left = right - col_cor + bottom = 1 - ((row_h*row_n)-(0.2*row_h)) + top = bottom + row_cor + # top = bottom + row_h + # bottom = top - row_cor - # inlets with center - row_n = 1 # row counter - col_n = 1 # column counter - row_h = 1/nrow(m) # height of row - col_w = 1/ncol(m) # width of column - row_cor = row_h / 3 - col_cor = col_w / 3 - for(i in 1:nrow(ref.open)) - { # plot logo center - left = (col_w*col_n) - col_w - right = left + col_w - left = right - col_cor - bottom = 1 - (row_h*row_n) - top = bottom + row_h - bottom = top - row_cor - - p= par(fig=c(left, right, bottom, top), - 
mar=c(0,0,0,0), - new=T) - idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) - plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) - # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) - lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) - # xaxis - # x.at = seq(1, length(idx), length.out = 3) - # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] - x.at = ceiling(length(idx)/2) - x.lab = 0 - axis(1, at=x.at, labels=x.lab) - # yaxis - axis(2, at=y.at, labels=y.lab) - row_n = row_n + 1 - if(i %% nrow(m) == 0) - { col_n = col_n + 1 - row_n = 1 - } - par(p) + p= par(fig=c(left, right, bottom, top), + mar=c(0,0,0,0), + new=T) + idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + x.at = ceiling(length(idx)/2) + x.lab = 0 + axis(1, at=x.at, labels=x.lab) + # yaxis + axis(2, at=y.at, labels=y.lab) + row_n = row_n + 1 + if(i %% nrow(m) == 0) + { col_n = col_n + 1 + row_n = 1 } + par(p) + } dev.off() } diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_8/analysis_test.R b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/analysis_test.R new file mode 100644 index 0000000..042553b --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/analysis_test.R @@ -0,0 +1,121 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) + +# functions +source(file.path("scripts", "functions.R")) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c 
= file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + + +# the TF names +class.tf = c("jun", "HIF1a", "myc", "PU.1", "CEBPb", "IRF4", "IRF2", "LHX3", "FOXH1", + "SOX3", "MEF2c", "ELF5", "STAT6", "NFE2", "AHR", "E2F2", "CTCF", "KLF", + "NR4A1", "EGR", "GATA", "NFAT", "RUNX") + +# the number of classes searched for each TF +# n.classes = 2:10 +n.classes = 10:10 + +# the methods used for the classification for each TF +# em.methods = c("read", "consensussequence", "read_consensussequence") +em.methods = c("consensussequence_kmer") + +# make a loop here for final analysis +for(tf in class.tf) +{ + # make a loop here for final analysis + for(method in em.methods) + { + dir.tf = file.path("results", "10xgenomics_PBMC_5k_peaks_classification_8", tf, method) + + for(k in n.classes) + { + # sequence + data = read.sequence.models(file.path(dir.tf, sprintf("data_class%s_%dclass_model_sequence.mat2d", tf, k))) + model.seq = data$models + model.prob = data$prob + data = NULL + # open chromatin + model.open = read.read.models(file.path(dir.tf, sprintf("data_class%s_%dclass_model_open.mat2d", tf, k)))$models + # nucleosomes + model.nucl = read.read.models(file.path(dir.tf, sprintf("data_class%s_%dclass_model_nucl.mat2d", tf, k)))$models + + # plot classes + col = brewer.pal(3, "Set1") + # X11(width=20, height=10) + png(filename=file.path(dir.tf, sprintf("data_class%s_%dclass.png", tf, k)), + units="in", res=720, width=20, height=10) + m = matrix(1:10, nrow=5, ncol=2, byrow=F) + layout(m) + # order from most to least probable class + ord = order(model.prob, decreasing=T) + ref.open = model.open[ord,, drop=F][,] + ref.nucl = model.nucl[ord,, drop=F][,] + ref.seq = model.seq[,,ord, drop=F][,,] + prob = model.prob[ord] + class = c(1:nrow(ref.open))[ord] + for(i in 1:nrow(ref.open)) + { # plot logo + plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, + main=sprintf("class %d (p=%.2f)", class[i], prob[i])) + # x-axis + x.lab = 
seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) + x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) + axis(1, at=x.at, labels=x.lab) + # y-axis is [0,1] for min/max signal + y.at = seq(0, 2, length.out=2) + y.lab = c("min", "max") + axis(2, at=y.at, labels=y.lab) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) + } + # inlets with center + row_n = 1 # row counter + col_n = 1 # column counter + row_h = 1/nrow(m) # height of row + col_w = 1/ncol(m) # width of column + row_cor = row_h / 3 + col_cor = col_w / 3 + for(i in 1:nrow(ref.open)) + { # plot logo center + left = (col_w*col_n) - col_w + right = left + col_w + left = right - col_cor + bottom = 1 - (row_h*row_n) + top = bottom + row_h + bottom = top - row_cor + + p= par(fig=c(left, right, bottom, top), + mar=c(0,0,0,0), + new=T) + idx = (ceiling(dim(ref.seq)[2]/2)-1-10):(ceiling(dim(ref.seq)[2]/2)-1+10) + plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) + # plot signal (multiplies by 2 because the y-axis goes to 2 bits) + lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) + lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) + # xaxis + # x.at = seq(1, length(idx), length.out = 3) + # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] + x.at = ceiling(length(idx)/2) + x.lab = 0 + axis(1, at=x.at, labels=x.lab) + # yaxis + axis(2, at=y.at, labels=y.lab) + row_n = row_n + 1 + if(i %% nrow(m) == 0) + { col_n = col_n + 1 + row_n = 1 + } + par(p) + } + dev.off() + } + } +} diff --git a/scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.sh b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.sh new file mode 100755 index 0000000..8c29086 --- /dev/null +++ b/scripts/10xgenomics_PBMC_5k_peaks_classification_8/classification_peaks.sh @@ -0,0 +1,119 @@ + 
+file_bed='data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_rmsk_sampled.bed' +file_bam_open='data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam' +file_bai_open='data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai' +file_bam_nucl='data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam' +file_bai_nucl='data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_nucleosomes.bam.bai' +file_fasta='data/genomes/hg19.fasta' +file_prob='results/10xgenomics_PBMC_5k_peaks_classification_6/peaks_rmsk_sampled_sequences_1kb_23class_prob.mat4d' + + +# the TF associated to each of the 23 classes (in the good order) +# tfs[0] = class 1 +tfs=('jun' 'HIF1a' 'myc' 'PU.1' 'CEBPb' 'IRF4' 'IRF2' + 'LHX3' 'FOXH1' 'SOX3' 'MEF2c' 'ELF5' 'STAT6' 'NFE2' + 'AHR' 'E2F2' 'CTCF' 'KLF' 'NR4A1' 'EGR' 'GATA' + 'NFAT' 'RUNX') + + +# EM parameters +n_class_min=1 # min nb of classes to search +n_class_max=10 # max nb of classes to search +n_iter=20 # nb of iter for read pattern search +n_iter2=100 # nb of iter for sequence pattern search +n_shift=1 # shift freedom to find diff. footprint on motif of TF of interest +n_shift2=21 # shift freedom to find diff. other footprint/motif +n_shift3=981 # shift freedom to find other motifs (=20bp motif) +n_core=32 + + +# get each class +for class in $(seq 1 ${#tfs[*]}) +do + # TF for that class + tf=${tfs[$(($class-1))]} + echo "extracting class $tf..." 
+ + # create directories for each type of partitioning for this class + dir_class="results/10xgenomics_PBMC_5k_peaks_classification_8/$tf" + dir_class_read=$dir_class'/read' + dir_class_cons=$dir_class'/consensussequence' + dir_class_cons=$dir_class'/consensussequence_kmer' + dir_class_joint=$dir_class'/read_consensussequence' + mkdir -p $dir_class + mkdir -p $dir_class_read + mkdir -p $dir_class_cons + mkdir -p $dir_class_joint + + # extract class + file_class_open=$dir_class'/data_class'$tf'_open.mat2d' + file_class_nucl=$dir_class'/data_class'$tf'_nucl.mat2d' + file_class_consseq=$dir_class'/data_class'$tf'_consensus_sequence.mat3d' + bin/ClassReadDataCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --prob $file_prob --from -500 --to 500 --binSize 1 --k $class --method "read_atac" > $file_class_open + bin/ClassReadDataCreator --bed $file_bed --bam $file_bam_nucl --bai $file_bai_nucl --prob $file_prob --from -500 --to 500 --binSize 1 --k $class --method "fragment_center" > $file_class_nucl + bin/ClassSequenceDataCreator --bed $file_bed --fasta $file_fasta --prob $file_prob --from -500 --to 500 --k $class --out $file_class_consseq + + # list 0 signal rows in open chromatin + # regions will be classifier according to open chromatin + # profiles -> allow to ignore these regions during + # classification process + filter_file=$dir_class'/data_class'$tf'_open_emptyrows.mat2d' + bin/WhichNullRows --mat $file_class_open > $filter_file + + # seed file + file_seed_open=$dir_class_read'/data_class'$tf'_seed.txt' + file_seed_cseq=$dir_class_cons'/data_class'$tf'_seed.txt' + file_seed_joint=$dir_class_joint'/data_class'$tf'_seed.txt' + touch $file_seed_open + touch $file_seed_joint + touch $file_seed_cseq + + # partition data + echo "classifying class $tf..." 
+ for k in $(seq $n_class_min $n_class_max) + do + # open chromatin only + # find different types of footprint for this TF + # no flip, motifs are already aligned/oriented in the same orientation + file_prob=$dir_class_read'/data_class'$tf'_'$k'class_prob.mat4d' + file_mod1=$dir_class_read'/data_class'$tf'_'$k'class_model_open.mat2d' + file_mod2=$dir_class_read'/data_class'$tf'_'$k'class_model_nucl.mat2d' + file_mod3=$dir_class_read'/data_class'$tf'_'$k'class_model_sequence.mat2d' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed_open + bin/EMRead --read $file_class_open --iter $n_iter --class $k --shift $n_shift --filter $filter_file --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_class_open --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_class_nucl --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod2 + bin/ProbToModel --consseq $file_class_consseq --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod3 + + # open chromatin and sequence + # find different footprints for this TF and other TF if there are + # use flip because only this TF motifs are aligned/oriented in same orientation + file_prob=$dir_class_joint'/data_class'$tf'_'$k'class_prob.mat4d' + file_mod1=$dir_class_joint'/data_class'$tf'_'$k'class_model_open.mat2d' + file_mod2=$dir_class_joint'/data_class'$tf'_'$k'class_model_nucl.mat2d' + file_mod3=$dir_class_joint'/data_class'$tf'_'$k'class_model_sequence.mat2d' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed_joint + bin/EMJoint --read $file_class_open --consseq $file_class_consseq --iter $n_iter --class $k --shift $n_shift2 --flip --filter $filter_file --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --read $file_class_open --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod1 + 
bin/ProbToModel --read $file_class_nucl --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod2 + bin/ProbToModel --consseq $file_class_consseq --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod3 + + # sequence + # find other motifs + # use flip because only this TF motifs are aligned/oriented in same orientation + file_prob=$dir_class_cons'/data_class'$tf'_'$k'class_prob_seq.mat4d' + file_mod1=$dir_class_cons'/data_class'$tf'_'$k'class_model_open.mat2d' + file_mod2=$dir_class_cons'/data_class'$tf'_'$k'class_model_nucl.mat2d' + file_mod3=$dir_class_cons'/data_class'$tf'_'$k'class_model_sequence.mat2d' + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + echo "$file_prob $seed" >> $file_seed_cseq + bin/EMConsensusSequence --consseq $file_class_consseq --iter $n_iter2 --class $k --shift $n_shift3 --flip --filter $filter_file --seed $seed --thread $n_core --out $file_prob + bin/EMConsensusSequence --consseq $file_class_consseq --iter $n_iter2 --class $k --shift $n_shift3 --flip --filter $filter_file --thread $n_core --out $file_prob + bin/ProbToModel --read $file_class_open --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod1 + bin/ProbToModel --read $file_class_nucl --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod2 + bin/ProbToModel --consseq $file_class_consseq --prob $file_prob --filter $filter_file --thread $n_core 1> $file_mod3 + done +done + diff --git a/scripts/functions.R b/scripts/functions.R index 0345e93..11fdcb4 100644 --- a/scripts/functions.R +++ b/scripts/functions.R @@ -1,422 +1,490 @@ +#' Reads the text version of a mat4d file +#' and loads it into an array. +#' \param path the path to the file to read +#' \param dim a vector containing the dimensions +#' of the matrix stored, for instance c(10,3,1,2). 
+#' \return a 4D array +#' \author Romain Groux +read.arraytxt = function(path, dim) +{ # to use str_count + require(stringr) + con = file(path, "r") + + dimensionality = length(dim) + array = array(dim=dim) + dim_current = rep(0, dimensionality) + + n_line = 1 + reading = TRUE + while (reading) + { line = readLines(con, n = 1) + + # number of ',' in line + n_coma = str_count(line, ',') + + # eof + if(length(line) == 0) + { reading = FALSE + break + # 1st line : contains dimensions of the matrix + # } else if(n_line == 1) { + # fields = unlist(strsplit(line, split=" ")) + # if(fields[1] != "dim") + # { stop("Error! Could not find dimensions in 1st line!") } + # dim_read = as.numeric(fields[2:length(fields)]) + # dimensionality = length(dim_read) + # dim_current = c(rep(1, 2), rep(0, dimensionality-2)) + # dim_current = rep(0, dimensionality) + # array = array(dim=dim_read) + # line contains ',' : indicates a coordinate + } else if(n_coma != 0) { + # sets the given coordinates according to what is found in file (correct 0 to 1 based) + x = as.numeric(unlist(strsplit(line, ','))[n_coma+1]) + 1 + dim_current[n_coma+1] = x + # every coordinate before in the vector should be reset + for(i in n_coma:1) + { dim_current[i] = 1 } + # line contains data + } else { + # fill a row + # na.omit because some extra ' ' in may create NA values in + fields = na.omit(as.numeric(unlist(strsplit(line, split=" ")))) + for(j in 1:length(fields)) + { array[t(dim_current)] = fields[j] + # column + 1 + dim_current[2] = dim_current[2] + 1 + } + # reset column + dim_current[2] = 1 + # row + 1 + dim_current[1] = dim_current[1] + 1 + } + n_line = n_line + 1 + } + close(con) + return(array) +} + + #' Reads a read density model file and returns a list #' with the class models and the associated #' class probabilities. #' \param file the path to the file of interest. 
#' \return a list of two elements : "models" #' a matrix with the class models on each row #' and "prob" the associated class probabilities. #' read.read.models = function(file) { mod = as.matrix(read.table(file), drop=F) prob = mod[,1] mod = mod[,-1, drop=F] rownames(mod) = paste("class", 1:nrow(mod)) colnames(mod) = 1:ncol(mod) return(list(models=mod, prob=prob)) } #' Reads a sequence model file and returns a list #' with the class models and the associated #' class probabilities. #' \param file the path to the file of interest. #' \return a list of two elements : "models" #' an array containing the models as probability #' matrices with the following dimensions : #' 1) 4 for A,C,G,T #' 2) the model length #' 3) the numbler of classes #' and "prob" the associated class probabilities. #' read.sequence.models = function(file) { data = as.matrix(read.table(file.path(file))) # prob = unique(data[,1]) prob = data[,1][rep(c(T,F,F,F), rep=nrow(data)/4)] n_class = length(prob) l_model = ncol(data) - 1 n_row = 4 models = array(dim=c(n_row, l_model, n_class)) dimnames(models)[[1]] = c('A', 'C', 'G', 'T') dimnames(models)[[2]] = 1:l_model dimnames(models)[[3]] = paste("class" , 1:n_class) i_from = 1 i_to = i_from + n_row - 1 for(k in 1:n_class) { models[,,k] = data[i_from:i_to,-1] i_from = i_to + 1 i_to = i_from + n_row - 1 } return(list(models=models, prob=prob)) } #' Computes the reverse complement of a #' DNA motif. #' \param the motif of interest with #' A,C,G,T on the rows and the positions #' on the columns. #' \return the reverse complement motif. #' \author Romain Groux reverse.complement = function(motif) { n.row = nrow(motif) n.col = ncol(motif) motif.rev = matrix(nrow=n.row, ncol=n.col) for(i in 1:n.row) { for(j in 1:n.col) { i_rev = n.row - i + 1 j_rev = n.col - j + 1 motif.rev[i_rev,j_rev] = motif[i,j] } } return(motif.rev) } #' Computes the Kullback-Leibler #' divergence of a given distristribution #' x to its corresponding uniform #' counterpart. 
#' For instance c(0.7, 0.1, 0.1, 0.1) #' will be compared to #' c(0.25, 0.25, 0.25, 0.25) #' \param x a vector containing the #' probability mass function values of #' the distribution for all possible #' values. #' \return the Kullback-Leibler #' divergence kl.divergence = function(x) { kl = 0 p0 = 1 / length(x) for(i in x) { kl = kl + (i * log(i/p0)) } return(kl) } #' A function to plot a DNA logo of a letter probability #' matrix (pwm). In essence, it does exactly the same #' as seqLogo::seqLogo except that it does not need #' a new display device on its own. #' \param pwm the letter probability matrix. #' \param path.a the path to a file containing #' the image to display for the A character, #' in PNG format. #' \param path.c the path to a file containing #' the image to display for the C character, #' in PNG format. #' \param path.g the path to a file containing #' the image to display for the G character, #' in PNG format. #' \param path.t the path to a file containing #' the image to display for the T character, #' in PNG format. #' \param pseudocounts a pseudocounts to add to #' the probabilities to avoid 0's. #' \param ... additional plotting parameters for #' plot(). #' \author Romain Groux plot.logo = function(pwm, path.a, path.c, path.g, path.t, pseudocounts=10e-10, ...) { n.row = 4 n.col = ncol(pwm) if(nrow(pwm) != n.row) { stop("Error! pwm should have 4 rows!") } if(length(dim(pwm)) != 2) { stop("Error! 
pwm should be a matrix!") } # images for nucleotides require(png) image.a = readPNG(path.a) image.c = readPNG(path.c) image.g = readPNG(path.g) image.t = readPNG(path.t) # add pseudo-counts to avoid 0's pwm = pwm + pseudocounts for(j in 1:n.col) { pwm[,j] = pwm[,j] / sum(pwm[,j]) } # entropy h = rep(0, n.col) for(j in 1:n.col) { for(i in 1:n.row) { h[j] = h[j] - pwm[i,j] * log2(pwm[i,j]) } } # information content r = -h + log2(4) # height heights = matrix(nrow=n.row, ncol=n.col, data=0) for(i in 1:n.row) { for(j in 1:n.col) { heights[i,j] = pwm[i,j] * r[j] } } # compute coordinates x.coord = matrix(nrow=2, ncol=n.col, data=0) rownames(x.coord) = c("from", "to") for(i in 1:n.col) { x.coord[1,i] = i - 0.5 x.coord[2,i] = i + 0.5 } # plot x.lim = c(1,n.col) y.lim = c(0,2) x.at = 1:n.col plot(0, 0, col=0, xlim=x.lim, ylim=y.lim, bty='n', xaxt='n', yaxt='n', xlab="", ylab="", ...) # axis(1, at=x.at, labels=x.at) for(j in 1:n.col) { # highest at top ord = order(heights[,j], decreasing=F) x_left = x.coord[1,j] x_right = x.coord[2,j] y_curr = 0 for(i in ord) { height = heights[i,j] y_bottom = y_curr y_top = y_bottom + height if(i == 1) { rasterImage(image.a, x_left, y_bottom, x_right, y_top) } if(i == 2) { rasterImage(image.c, x_left, y_bottom, x_right, y_top) } if(i == 3) { rasterImage(image.g, x_left, y_bottom, x_right, y_top) } if(i == 4) { rasterImage(image.t, x_left, y_bottom, x_right, y_top) } y_curr = y_curr + height } } } #' Compute the euclidean distance between two models. #' It also check if a reference is in reverse orientation #' and returns the smallest distance value. #' \param ref1 a vector containing the first reference. #' \param ref2 a vector containing the second reference. #' \return the euclidean distance. eucl.dist.models = function(mod1, mod2) { return(min(sqrt(sum(((mod1 - mod2 ) ^ 2))), sqrt(sum(((mod1 - rev(mod2)) ^ 2))))) } #' Compute the correlation distance between two models. 
#' It also check if a reference is in reverse orientation #' and returns the smallest distance value. #' \param ref1 a vector containing the first reference. #' \param ref2 a vector containing the second reference. #' \return the euclidean distance. cor.dist.models= function(mod1, mod2) { return(1 - min(cor(mod1, mod2 ), cor(mod1, rev(mod2)))) } #' Computes the (eucliden) distance matrix for all the given #' the models As some models may be in reverse #' orientation compared to others, the distance in both #' orientation is computed, for each pair, and the best is #' returned. #' \param models a matrix with the models on each row. #' \return a matrix containing the distances between each reference. distance.model = function(models) { n = nrow(models) d = matrix(nrow=n, ncol=n, data=0) for(i in 1:n) { for(j in 1:i) { x = eucl.dist.models(models[i,], models[j,]) d[i,j] = x d[j,i] = x } } return(d) } get_matches = function(distances, run_value) { matches = matrix(nrow=0, ncol=4) # references of run i on the row -> y coord # references of run j on the col -> x coord # run labels run_i = 1 # run_j = 2 for(run_j in setdiff(unique(run_value), run_i)) { # number of references in each run n_i = length(which(run_value == run_i)) n_j = length(which(run_value == run_j)) index_i = which(run_value == run_i) # rows of run i index_j = which(run_value == run_j) # columns of run j i_taken = c() # classes of i already matched -> rows to ignore j_taken = c() # classes of j already matched -> columns to ignore # while not all classes in j have been assigned a best match row_n = 1 while(length(j_taken) < n_j) { if(length(i_taken) == 0 && length(j_taken) == 0) { distances_tmp = distances[index_i, index_j, drop=F] coord = which(distances_tmp == min(distances_tmp), arr.ind=T) coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) coord = c(coord_i, coord_j) } else { rows = setdiff(index_i, i_taken) cols = setdiff(index_j, j_taken) 
distances_tmp = distances[rows, cols, drop=F] coord = which(distances_tmp == min(distances_tmp), arr.ind=T) coord_i = as.numeric(rownames(distances_tmp)[coord[1]]) coord_j = as.numeric(colnames(distances_tmp)[coord[2]]) coord = c(coord_i, coord_j) } coord = c(coord, row_n, run_j) i_taken = c(i_taken, coord[1]) j_taken = c(j_taken, coord[2]) matches = rbind(matches, coord) row_n = row_n + 1 } } return(matches) } #'Creates a composite figure in which several class references from #'several partitions, with different numbers of classes, are plotted. #'The figure is composed of a matrix of rows and #'columns where is the highest number of classes in all #'partitions and the number of different partition. T #'The first column will contain the references of the #'partition with classes. The next columns will contain the #'references of the partition with the second biggest number of #'classes (and so on). In a given column, except the 1st one, #'the references are ordered (over the rows) such that the #'overall similarity (euclidean distance) with the 1st column #'references are maximized. #'\param file the file name where the image will be saved. #'\param references a matrix with the different references to draw on #'each row. #'\param references a vector containing the class probability (or weight) associated #'to each corresponding reference (row) in matrix. #'\param probabilities a vector of values that will be displayed atop of each #'column of plots. #'\param colors a vector of colors to draw the class profiles. There should #'be colors, they can be the same. #'\param distances a distance matrix containing the distance between all #'references. The row and column labels have to be the row and column #'number (1, 2, 3, ...)! #'\param n_run the total number of different partitions to which all #'references belong. #'\param run_value a vector indicating to which partition each reference #'(row of references) belong to. 
It should be a simple vector of integers, #'for instance 1,1,1,1,2,2,2,3,3 #'\param n_class_max, the highest number of classes searches in all partitions () plot.references = function(file, references, probabilities, colors, col.titles, distances, n_run, run_value, n_class_max, width=15, height=18) { # compute the best matches between all references to 1st run references matches = get_matches(distances, run_value) # make a matrix for layout with good plot numbers plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles) plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab) plots.lab[-1,1] = 1:n_class_max # for run with max number of classes z = n_class_max + 1 for(i in 1:nrow(matches)) { coord = matches[i,] # plots.lab[coord[3], coord[4]] = z plots.lab[coord[1]+1, coord[4]] = z z = z + 1 } # these will be the empty plots for(i in 1:nrow(plots.lab)) { for(j in 1:ncol(plots.lab)) { if(is.na(plots.lab[i,j])) { plots.lab[i,j] = z z = z + 1 } } } # plot if(!is.null(file)) { png(filename=file, width=width, height=height, unit="in", res=720) } else { X11(width=width, height=height) } # a grid m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) ) layout.show(m) x = 1:ncol(references) # plot references of partition with highest number of classes for(i in 1:n_class_max) { plot(x=x, y=references[i,], lwd=2, type='l', ylim=c(0, 1.2*max(references[i,])), col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads") # prob x_ = 0.85*length(references[i,]) y_ = max(references[i,]) lab = round(probabilities[i],3) text(x=x_, y=y_, labels=lab, cex=1.2) } # plot others for(i in 1:nrow(matches)) { ref_index = matches[i,2] col_index = matches[i,3] plot(x=x, y=references[ref_index,], lwd=2, type='l', ylim=c(0, 1.2*max(references[ref_index,])), col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads") # prob x_ = 0.85*length(references[ref_index,]) y_ = max(references[ref_index,]) lab = 
round(probabilities[ref_index],3) text(x=x_, y=y_, labels=lab, cex=1.2) } # empty plots for(i in (length(run_value)+1):(n_run*n_class_max)) { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") } # col titles p = par(mar=c(0,0,0,0)) for(i in 1:length(col.titles)) { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") text(1,1, labels=col.titles[i], cex=2) } par(p) if(!is.null(file)) { dev.off() } } diff --git a/scripts/test.R b/scripts/test.R deleted file mode 100644 index 68eb808..0000000 --- a/scripts/test.R +++ /dev/null @@ -1,284 +0,0 @@ -setwd(file.path("/", "local", "groux", "scATAC-seq")) - -# libraries -library(RColorBrewer) - -# functions -source(file.path("scripts", "functions.R")) - -#' Converts a sequence in character format -#' to integer format A->0, C->1, N->2, G->3 -#' T->4. -#' \param seq a vector containing the sequence -#' in character format. -#' \return a vector containing the sequence -#' in integer format. -#' \author Romain Groux -char.to.int = function(seq) -{ seq_int = vector(length=length(seq)) - for(i in 1:length(seq)) - { if(seq[i] == 'A') { seq_int[i] = 0 } - if(seq[i] == 'C') { seq_int[i] = 1 } - if(seq[i] == 'N') { seq_int[i] = 2 } - if(seq[i] == 'G') { seq_int[i] = 3 } - if(seq[i] == 'T') { seq_int[i] = 4 } - } - return(seq_int) -} - -#' Generates the reverse complement of a kmer. -#' \param kmer a vector containing the kmer in -#' integer format. -#' \return a vector containing the reverse -#' complement kmer -#' \author Romain Groux -get_rev_compl = function(kmer) -{ kmer_rv = vector(length=length(kmer), mode="numeric") - i_rv = length(kmer) - for(i in 1:length(kmer)) - { if(kmer[i] == 0) { kmer_rv[i_rv] = 4 } # A - if(kmer[i] == 1) { kmer_rv[i_rv] = 3 } # C - if(kmer[i] == 2) { kmer_rv[i_rv] = 2 } # N - if(kmer[i] == 3) { kmer_rv[i_rv] = 1 } # G - if(kmer[i] == 4) { kmer_rv[i_rv] = 0 } # T - } - return(kmer_rv) -} - -#' Generates a hash given a kmer. 
-#' Kmers with a same length are guaranteed -#' to have different hashes. -#' AA..AA will generate a hash of 1, -#' AA..AC will generate a hash of 2, -#' AA..AN will generate a hash of 3, -#' AA..AG will generate a hash of 4, -#' AA..AT will generate a hash of 5, -#' TT..TG will generate a hash of 5**k - 1, -#' TT..TT will generate a hash of 5**k -#' \param seq a vector containing the kmer -#' in integer format : A->0, C->1, N->2, G->3, -#' T->4. -#' \return the kmer hash -#' \author Romain Groux -hash = function(seq) -{ k = length(seq) ; z = 5 - h = 0 - for(i in 0:(length(seq)-1)) - { if(seq[i+1] == 0) { h = h + (0*(z**(k-i-1))) } # A - if(seq[i+1] == 1) { h = h + (1*(z**(k-i-1))) } # C - if(seq[i+1] == 2) { h = h + (2*(z**(k-i-1))) } # N - if(seq[i+1] == 3) { h = h + (3*(z**(k-i-1))) } # G - if(seq[i+1] == 4) { h = h + (4*(z**(k-i-1))) } # T - } - return(h+1) -} - -#' Computes the hash of a sequence and of -#' its reverse complement and returns the -#' smallest one. -#' \param seq a vector containing the -#' sequence in integer format : : A->0, -#' C->1, N->2, G->3, T->4. -#' \author Romain Groux -hash.min(seq) -{ seq_r = get_rev_compl(seq) - return(min(hash(seq), hash(seq_r))) -} - -#' Generates all kmers for a given value of K -#' and return them in lexicographic order. -#' \param k the kmer length. -#' \return a matrix with the different kmers -#' on the rows and k columns. The kmers are -#' in integer format : A->0, C->1, N->2, G->3, -#' T->4. 
-#' \author Romain Groux -generate_all_kmers = function(k) -{ kmers = matrix(nrow=5**k, ncol=k, data=-1) - n = k - currentWord = rep(1, n) - i = 1 - while(n > 0) - { kmers[i,] = currentWord - i = i + 1 - while(n>0 && currentWord[n+1-1] == 5) - { currentWord[n] = 1 - n = n - 1 - } - if(n > 0) - { currentWord[n] = currentWord[n] + 1 - n = k - } - } - return(kmers - 1) -} - - -data = as.matrix(read.table(file.path("data", - "10xgenomics_PBMC_5k_peaks", - "peaks_rmsk_sampled_sequences_1kb.mat"))) - -data = as.matrix(read.table(file.path("data/toy_data/simulated_sequences_2class_flip.mat"))) -data = apply(data, 1, char.to.int) - -k = 5 -n_kmer = 5**k -hmax = ceiling(n_kmer / 2) -n_shift = ncol(data) - k + 1 - -# transitions and counts -counts = vector(length=n_kmer, mode="numeric") -kmers = generate_all_kmers(k) -counts = vector(length=n_kmer, mode="numeric") -t_out = matrix(nrow=n_kmer, ncol=n_kmer, data=0) -t_in = t_out -t_all = t_out -for(i in 1:nrow(data)) -{ for(j in 1:n_shift) - { # no in transition (1st kmer) - if(j == 1) - { # kmer1 < kmer2 - from1 = j ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; - from2 = j+1 ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; - kmer1r = get_rev_compl(kmer2) ; kmer2r = get_rev_compl(kmer1) ; - idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; - idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; - # out transition kmer1 -> kmer2 - t_out[idx1,idx2] = t_out[idx1,idx2] + 1 - t_out[idx1r,idx2r] = t_out[idx1r,idx2r] + 1 - # number of edges - t_all[idx1,idx2] = t_all[idx1,idx2] + 1 - t_all[idx2,idx1] = t_all[idx2,idx1] + 1 - t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 - t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 - # counts - counts[idx1] = counts[idx1] + 1 - counts[idx1r] = counts[idx1r] + 1 - } - # no out transition (last kmer) - else if(j == n_shift) - { # kmer1 < kmer2 - from1 = j-1 ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; - from2 = j ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; - kmer1r = get_rev_compl(kmer2) ; kmer2r 
= get_rev_compl(kmer1) ; - idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; - idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; - # in transition kmer1 <- kmer2 - t_in[idx1,idx2] = t_in[idx1,idx2] + 1 - t_in[idx1r,idx2r] = t_in[idx1r,idx2r] + 1 - # number of edges - t_all[idx1,idx2] = t_all[idx1,idx2] + 1 - t_all[idx2,idx1] = t_all[idx2,idx1] + 1 - t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 - t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 - # counts - # no need, kmer2 was counted at last iteration as kmer2 - } - # both out and in transitions (middle) - else - { # kmer0 < kmer1 < kmer2 - from0 = j ; to0 = from0 + k - 1 ; kmer0 = data[i,from0:to0] ; - from1 = j ; to1 = from1 + k - 1 ; kmer1 = data[i,from1:to1] ; - from2 = j+1 ; to2 = from2 + k - 1 ; kmer2 = data[i,from2:to2] ; - kmer0r = get_rev_compl(kmer2) ; kmer1r = get_rev_compl(kmer1) ; kmer2r = get_rev_compl(kmer0) ; - idx0 = hash(kmer0) ; idx0r = hash(kmer0r) ; - idx1 = hash(kmer1) ; idx1r = hash(kmer1r) ; - idx2 = hash(kmer2) ; idx2r = hash(kmer2r) ; - # out transition kmer1 -> kmer2 - t_out[idx1,idx2] = t_out[idx1,idx2] + 1 - t_out[idx1r,idx2r] = t_out[idx1r,idx2r] + 1 - # in transition kmer0 -> kmer1 - t_in[idx1,idx0] = t_in[idx1,idx0] + 1 - t_in[idx1r,idx0r] = t_in[idx1r,idx0r] + 1 - # number of edges - t_all[idx0,idx1] = t_all[idx0,idx1] + 1 - t_all[idx1,idx0] = t_all[idx1,idx0] + 1 - t_all[idx1,idx2] = t_all[idx1,idx2] + 1 - t_all[idx2,idx1] = t_all[idx2,idx1] + 1 - t_all[idx0r,idx1r] = t_all[idx0r,idx1r] + 1 - t_all[idx1r,idx0r] = t_all[idx1r,idx0r] + 1 - t_all[idx1r,idx2r] = t_all[idx1r,idx2r] + 1 - t_all[idx2r,idx1r] = t_all[idx2r,idx1r] + 1 - # counts - counts[idx1] = counts[idx1] + 1 - counts[idx1r] = counts[idx1r] + 1 - } - } -} - -# spectral clustering -# t_all is the affinity matrix -# compute the degree matrix -d = diag(apply(t_in, 1, sum)) # sum rows -# unormalized laplacian -u = d - t_in -# get eigen values and vectors -evL = eigen(u, symmetric=TRUE) -# plot eigen values -plot(1:20, rev(evL$values)[1:20], 
type='b') -# partition -partitions = list() -for(n_clust in 2:20) -{ print(n_clust) - # get K biggest eigen values and vectors -> embedding space - z = evL$vectors[,(ncol(evL$vectors)-n_clust+1):ncol(evL$vectors)] - partitions[[n_clust]] = kmeans(z, centers=n_clust, iter.max=100, nstart=10) -} - - -plot(evL$vectors[,3124:3125]) - -# motif 1 is ACGTTGCA -kmers_motif1 = matrix(ncol=k, - data=c(0,1,2,3,3, - 1,2,3,3,2, - 2,3,3,2,1, - 3,3,2,1,0), - byrow=T) -# motif 2 is GCGAATTT -kmers_motif2 = matrix(ncol=k, - data=c(2,1,2,0,0, - 1,3,0,0,3, - 3,0,0,3,3, - 0,0,3,3,3), - byrow=T) -idx1 = apply(kmers_motif1, 1, hash) -idx2 = apply(kmers_motif2, 1, hash) - -partitions[[2]]$size - -partitions[[2]]$cluster[idx1] -partitions[[2]]$cluster[idx2] - - - - -c1 = which(partitions[[2]]$cluster == 1) -c2 = which(partitions[[2]]$cluster == 2) - -plot(evL$vectors[,3124:3125], col=partitions[[2]]$cluster+1, cex=0.1) - -points(evL$vectors[idx1,3124:3125], col=2) -points(evL$vectors[idx2,3124:3125], col=3) - -par(mfrow=c(3,1)) -plot(t_all[idx1[1],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) ; abline(v=idx1[1], col="blue") -plot(t_all[idx1[2],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) -plot(t_all[idx1[3],], type='l', ylim=c(0,50)) ; abline(v=idx1, col="red", lwd=0.2) - - -boxplot(counts, counts[idx1], counts[idx2], outline=F) - - - - -# reconstruct kmers -best.k = 2 -partition = partitions[[best.k]] -clusters = partition$cluster -c1 = which(clusters == 1) -c2 = which(clusters == 2) -best1 = which.max(counts[c1]) -best2 = which.max(counts[c2]) - - diff --git a/scripts/test/analysis_test_sampled.R b/scripts/test/analysis_test_sampled.R deleted file mode 100644 index afcf023..0000000 --- a/scripts/test/analysis_test_sampled.R +++ /dev/null @@ -1,97 +0,0 @@ -setwd(file.path("/", "local", "groux", "scATAC-seq")) - -# libraries -library(RColorBrewer) -library(seqLogo) - -# functions -source(file.path("scripts", "functions.R")) - -# the number of classes 
searched -n.classes = c(10, 20, 30) - -# path to the images for the logo -path.a = file.path("res/A.png") -path.c = file.path("res/C.png") -path.g = file.path("res/G.png") -path.t = file.path("res/T.png") - -################## sequence patterns around ctcf motifs ################## - -for(k in n.classes) -{ - # sequence - data = read.sequence.models(file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_sequences_%dclass_model.mat", k))) - model.seq = data$models - model.prob = data$prob - data = NULL - - # open chromatin - model.open = read.read.models(file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_openchromatin_%dclass_model.mat", k)))$models - # nucleosomes - model.nucl = read.read.models(file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_nucleosomes_%dclass_model.mat", k)))$models - - # plot classes - col = brewer.pal(3, "Set1") - # X11(width=26, height=12) - png(filename=file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), - units="in", res=720, width=18, height=12) - m = matrix(1:30, nrow=6, ncol=5, byrow=F) - layout(m) - # order from most to least probable class - ord = order(model.prob, decreasing=T) - ref.open = model.open[ord,, drop=F] - ref.nucl = model.nucl[ord,, drop=F] - ref.seq = model.seq[,,ord, drop=F] - prob = model.prob[ord] - class = c(1:nrow(ref.open))[ord] - for(i in 1:nrow(ref.open)) - { # plot logo - plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, - main=sprintf("class %d (p=%.2f)", class[i], prob[i])) - # x-axis - x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) - x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) - axis(1, at=x.at, labels=x.lab) - # y-axis is [0,1] for min/max signal - y.at = seq(0, 2, length.out=2) - y.lab = c("min", "max") - axis(2, at=y.at, labels=y.lab) - # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) - lines(2*(ref.nucl[i,] / 
max(ref.nucl[i,])), lwd=1, col=col[2]) - } - # inlets with center - # row_n = 1 # row counter - # col_n = 1 # column counter - # for(i in 1:nrow(ref.open)) - # { # plot logo center - # right = 0.5*col_n - 0.01 - # left = right - 0.2 - # bottom = 1-(row_n*(0.2))+0.05 - # top = bottom + 0.15 - # par(fig=c(left, right, bottom, top), new=T) - # idx = (391-1-20):(391+1+20) - # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) - # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) - # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) - # # xaxis - # x.at = seq(1, length(idx), length.out = 3) - # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] - # axis(1, at=x.at, labels=x.lab) - # # yaxis - # axis(2, at=y.at, labels=y.lab) - # row_n = row_n + 1 - # if(i %% 5 == 0) - # { col_n = col_n + 1 - # row_n = 1 - # } - # } - dev.off() -} - diff --git a/scripts/test/test_1kb.sh b/scripts/test/test_1kb.sh deleted file mode 100755 index 3bc3541..0000000 --- a/scripts/test/test_1kb.sh +++ /dev/null @@ -1,35 +0,0 @@ - -# paths -## dir -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_1kb" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_open_bin1bp_1kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_nucleosomes_bin1bp_1kb_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sequences_1kb.mat' -## file with seeds -file_seed=$results_dir'/peaks_rmsk_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# EM param -n_iter='100' -n_shift='951' -n_core=12 - -# classify -for k in 10 20 30 -do - ## results files - file_prob=$results_dir/'peaks_rmsk_sequences_'$k'class_prob.mat4d' - file_mod1=$results_dir/'peaks_rmsk_openchromatin_'$k'class_model.mat' - file_mod2=$results_dir/'peaks_rmsk_nucleosomes_'$k'class_model.mat' - file_mod3=$results_dir/'peaks_rmsk_sequences_'$k'class_model.mat' - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | 
head -c${1:-15};echo) - echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 -done diff --git a/scripts/test/test_1kb_pwms.sh b/scripts/test/test_1kb_pwms.sh deleted file mode 100755 index f7c51f8..0000000 --- a/scripts/test/test_1kb_pwms.sh +++ /dev/null @@ -1,49 +0,0 @@ - -# paths -## dir -pwm_dir="data/pwm/jaspar_2018_clustering/" -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_1kb_pwms" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_open_bin1bp_1kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_nucleosomes_1kb_bin1bp_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sequences_1kb.mat' -## PWM files -jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" -hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" -myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" -pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" -cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" -irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" -irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" -lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" -foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" -sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" -mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" -elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" -stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" -nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" -ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" -elf2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" -ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" - -mkdir -p $results_dir - -# EM param -n_iter='100' -n_shift='951' 
-n_core=12 - -# classify -## results files -file_prob=$results_dir/'peaks_rmsk_sequences_1kb_15class_prob.mat4d' -file_mod1=$results_dir/'peaks_rmsk_openchromatin_1kb_15class_model.mat' -file_mod2=$results_dir/'peaks_rmsk_nucleosomes_1kb_15class_model.mat' -file_mod3=$results_dir/'peaks_rmsk_sequences_1kb_15class_model.mat' - -bin/EMSequence --seq $file_mat_seq --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$nfe2,$ahr,$elf2 --shift $n_shift --flip --iter $n_iter --thread $n_core > $file_prob - -bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 -bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 -bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 - diff --git a/scripts/test/test_1kb_sampled.sh b/scripts/test/test_1kb_sampled.sh deleted file mode 100755 index edd22fe..0000000 --- a/scripts/test/test_1kb_sampled.sh +++ /dev/null @@ -1,35 +0,0 @@ - -# paths -## dir -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_1kb" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_1kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_1kb_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' -## file with seeds -file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# EM param -n_iter='100' -n_shift='951' -n_core=12 - -# classify -for k in 10 20 30 -do - ## results files - file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' - file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' - file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' - file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) - echo "$file_prob $seed" >> $file_seed - bin/EMSequence 
--seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 -done diff --git a/scripts/test/test_1kb_sampled_pwms.sh b/scripts/test/test_1kb_sampled_pwms.sh deleted file mode 100755 index 454cae9..0000000 --- a/scripts/test/test_1kb_sampled_pwms.sh +++ /dev/null @@ -1,49 +0,0 @@ - -# paths -## dir -pwm_dir="data/pwm/jaspar_2018_clustering/" -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_1kb_pwms" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_1kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_1kb_bin1bp_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' -## PWM files -jun="$pwm_dir/cluster_3_node_23_20_motifs_prob.mat" -hif1a="$pwm_dir/cluster_4_node_31_3_motifs_prob.mat" -myc="$pwm_dir/cluster_4_node_22_4_motifs_prob.mat" -pu1="$pwm_dir/cluster_7_node_13_2_motifs_prob.mat" -cebpb="$pwm_dir/cluster_5_node_20_5_motifs_prob.mat" -irf4="$pwm_dir/cluster_31_node_4_5_motifs_prob.mat" -irf2="$pwm_dir/cluster_31_node_5_2_motifs_prob.mat" -lhx3="$pwm_dir/cluster_1_node_74_2_motifs_prob.mat" -foxh1="$pwm_dir/cluster_66_1_motifs_prob.mat" -sox3="$pwm_dir/cluster_33_node_1_2_motifs_prob.mat" -mef2c="$pwm_dir/cluster_20_4_motifs_prob.mat" -elf5="$pwm_dir/cluster_7_node_17_5_motifs_prob.mat" -# stat6="$pwm_dir/cluster_32_node_STAT6_1_motifs_prob.mat" -nfe2="$pwm_dir/cluster_3_node_24_4_motifs_prob.mat" -ahr="$pwm_dir/cluster_4_node_30_2_motifs_prob.mat" -elf2="$pwm_dir/cluster_39_node_1_2_motifs_prob.mat" -# ctcf="$pwm_dir/cluster_48_node_ctcf_1_motifs_prob.mat" - -mkdir -p $results_dir - -# EM param -n_iter='100' -n_shift='951' -n_core=12 - -# classify -## 
results files -file_prob=$results_dir/'peaks_rmsk_sampled_sequences_1kb_15class_prob.mat4d' -file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_1kb_15class_model.mat' -file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_1kb_15class_model.mat' -file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_1kb_15class_model.mat' - -bin/EMSequence --seq $file_mat_seq --motifs $jun,$hif1a,$myc,$pu1,$cebpb,$irf4,$irf2,$lhx3,$foxh1,$sox3,$mef2c,$elf5,$nfe2,$ahr,$elf2 --shift $n_shift --flip --iter $n_iter --thread $n_core > $file_prob - -bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 -bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 -bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 - diff --git a/scripts/test/test_2kb.sh b/scripts/test/test_2kb.sh deleted file mode 100755 index dbabf08..0000000 --- a/scripts/test/test_2kb.sh +++ /dev/null @@ -1,36 +0,0 @@ - -# paths -## dir -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_2kb" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_open_bin1bp_2kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_nucleosomes_bin1bp_2kb_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sequences_2kb.mat' -## file with seeds -file_seed=$results_dir'/peaks_rmsk_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# EM param -n_iter='100' -n_shift='201' -n_core=12 - -# classify -for k in 10 20 30 -do - ## results files - file_prob=$results_dir/'peaks_rmsk_sequences_'$k'class_prob.mat4d' - file_mod1=$results_dir/'peaks_rmsk_openchromatin_'$k'class_model.mat' - file_mod2=$results_dir/'peaks_rmsk_nucleosomes_'$k'class_model.mat' - file_mod3=$results_dir/'peaks_rmsk_sequences_'$k'class_model.mat' - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) - echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed 
--thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 -done - diff --git a/scripts/test/test_2kb_sampled.sh b/scripts/test/test_2kb_sampled.sh deleted file mode 100755 index 2f3e309..0000000 --- a/scripts/test/test_2kb_sampled.sh +++ /dev/null @@ -1,36 +0,0 @@ - -# paths -## dir -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_2kb" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_2kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_2kb_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_2kb.mat' -## file with seeds -file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# EM param -n_iter='100' -n_shift='201' -n_core=12 - -# classify -for k in 10 20 30 -do - ## results files - file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' - file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' - file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' - file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) - echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 -done - diff --git a/scripts/test_2/analysis_test_2_sampled.R b/scripts/test_2/analysis_test_2_sampled.R deleted file 
mode 100644 index afcf023..0000000 --- a/scripts/test_2/analysis_test_2_sampled.R +++ /dev/null @@ -1,97 +0,0 @@ -setwd(file.path("/", "local", "groux", "scATAC-seq")) - -# libraries -library(RColorBrewer) -library(seqLogo) - -# functions -source(file.path("scripts", "functions.R")) - -# the number of classes searched -n.classes = c(10, 20, 30) - -# path to the images for the logo -path.a = file.path("res/A.png") -path.c = file.path("res/C.png") -path.g = file.path("res/G.png") -path.t = file.path("res/T.png") - -################## sequence patterns around ctcf motifs ################## - -for(k in n.classes) -{ - # sequence - data = read.sequence.models(file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_sequences_%dclass_model.mat", k))) - model.seq = data$models - model.prob = data$prob - data = NULL - - # open chromatin - model.open = read.read.models(file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_openchromatin_%dclass_model.mat", k)))$models - # nucleosomes - model.nucl = read.read.models(file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_nucleosomes_%dclass_model.mat", k)))$models - - # plot classes - col = brewer.pal(3, "Set1") - # X11(width=26, height=12) - png(filename=file.path("results", "test_1kb", - sprintf("peaks_rmsk_sampled_sequences_%dclass.png", k)), - units="in", res=720, width=18, height=12) - m = matrix(1:30, nrow=6, ncol=5, byrow=F) - layout(m) - # order from most to least probable class - ord = order(model.prob, decreasing=T) - ref.open = model.open[ord,, drop=F] - ref.nucl = model.nucl[ord,, drop=F] - ref.seq = model.seq[,,ord, drop=F] - prob = model.prob[ord] - class = c(1:nrow(ref.open))[ord] - for(i in 1:nrow(ref.open)) - { # plot logo - plot.logo(ref.seq[,,i], path.a, path.c, path.g, path.t, - main=sprintf("class %d (p=%.2f)", class[i], prob[i])) - # x-axis - x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2, length.out=3) - x.at = seq(1, ncol(ref.open), length.out=length(x.lab)) - axis(1, 
at=x.at, labels=x.lab) - # y-axis is [0,1] for min/max signal - y.at = seq(0, 2, length.out=2) - y.lab = c("min", "max") - axis(2, at=y.at, labels=y.lab) - # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - lines(2*(ref.open[i,] / max(ref.open[i,])), lwd=1, col=col[1]) - lines(2*(ref.nucl[i,] / max(ref.nucl[i,])), lwd=1, col=col[2]) - } - # inlets with center - # row_n = 1 # row counter - # col_n = 1 # column counter - # for(i in 1:nrow(ref.open)) - # { # plot logo center - # right = 0.5*col_n - 0.01 - # left = right - 0.2 - # bottom = 1-(row_n*(0.2))+0.05 - # top = bottom + 0.15 - # par(fig=c(left, right, bottom, top), new=T) - # idx = (391-1-20):(391+1+20) - # plot.logo(ref.seq[,idx,i], path.a, path.c, path.g, path.t) - # # plot signal (multiplies by 2 because the y-axis goes to 2 bits) - # lines(2*(ref.open[i,idx] / max(ref.open[i,])), lwd=1, col=col[1]) - # lines(2*(ref.nucl[i,idx] / max(ref.nucl[i,])), lwd=1, col=col[2]) - # # xaxis - # x.at = seq(1, length(idx), length.out = 3) - # x.lab = seq(-(ncol(ref.open)-1)/2, (ncol(ref.open)-1)/2)[idx][x.at] - # axis(1, at=x.at, labels=x.lab) - # # yaxis - # axis(2, at=y.at, labels=y.lab) - # row_n = row_n + 1 - # if(i %% 5 == 0) - # { col_n = col_n + 1 - # row_n = 1 - # } - # } - dev.off() -} - diff --git a/scripts/test_2/test_1kb_sampled.sh b/scripts/test_2/test_1kb_sampled.sh deleted file mode 100755 index 2b9e096..0000000 --- a/scripts/test_2/test_1kb_sampled.sh +++ /dev/null @@ -1,35 +0,0 @@ - -# paths -## dir -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_1kb_2" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_1kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_1kb_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_1kb.mat' -## file with seeds -file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# EM param -n_iter='100' -n_shift='971' -n_core=14 - -# 
classify -for k in 10 20 30 -do - ## results files - file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' - file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' - file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' - file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) - echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 -done diff --git a/scripts/test_2/test_2kb_sampled.sh b/scripts/test_2/test_2kb_sampled.sh deleted file mode 100755 index 02af0c9..0000000 --- a/scripts/test_2/test_2kb_sampled.sh +++ /dev/null @@ -1,36 +0,0 @@ - -# paths -## dir -data_dir="results/10xgenomics_PBMC_5k" -results_dir="results/test_2kb_2" -## matrix files -file_mat_open=$data_dir/'peaks_rmsk_sampled_open_bin1bp_2kb_read_atac.mat' -file_mat_nucl=$data_dir/'peaks_rmsk_sampled_nucleosomes_bin1bp_2kb_fragment_center.mat' -file_mat_seq=$data_dir/'peaks_rmsk_sampled_sequences_2kb.mat' -## file with seeds -file_seed=$results_dir'/peaks_rmsk_sampled_seed.txt' - -mkdir -p $results_dir -touch $file_seed - -# EM param -n_iter='100' -n_shift='971' -n_core=14 - -# classify -for k in 10 20 30 -do - ## results files - file_prob=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_prob.mat4d' - file_mod1=$results_dir/'peaks_rmsk_sampled_openchromatin_'$k'class_model.mat' - file_mod2=$results_dir/'peaks_rmsk_sampled_nucleosomes_'$k'class_model.mat' - file_mod3=$results_dir/'peaks_rmsk_sampled_sequences_'$k'class_model.mat' - seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | 
head -c${1:-15};echo) - echo "$file_prob $seed" >> $file_seed - bin/EMSequence --seq $file_mat_seq --class $k --shift $n_shift --flip --iter $n_iter --seed $seed --thread $n_core > $file_prob - bin/ProbToModel --read $file_mat_open --prob $file_prob --thread $n_core 1> $file_mod1 - bin/ProbToModel --read $file_mat_nucl --prob $file_prob --thread $n_core 1> $file_mod2 - bin/ProbToModel --seq $file_mat_seq --prob $file_prob --thread $n_core 1> $file_mod3 -done - diff --git a/scripts/toy_data/analyse_data_sequence.R b/scripts/toy_data/analyse_data_sequence.R new file mode 100644 index 0000000..bccfc2b --- /dev/null +++ b/scripts/toy_data/analyse_data_sequence.R @@ -0,0 +1,360 @@ +setwd(file.path("/", "local", "groux", "scATAC-seq")) + +# libraries +library(RColorBrewer) +library(motifStack) +library(TFBSTools) + +# functions +source(file.path("scripts", "functions.R")) + + +#' Scores a sequence, in integer format, given +#' a probability matrix. +#' @pwm a probability matrix of dimensions : +#' 1st 4 for A, C, G, T +#' 2nd the number of positions +#' @param sequence a vector of integer encoding +#' the sequence A:1, C:2, G:3, T:4 +#' @param prior a vector containing the prior +#' probabilities about each nucleotide. It may +#' be the base composition. It should sum up +#' to 1. +#' @return the sequence cumulative score. +#' @author Romain Groux +score.seq = function(sequence, pwm, prior=rep(0.25,4)) +{ + score = 0 + l_seq = length(sequence) + l_pwm = ncol(pwm) + n_shifts = l_seq - l_pwm + 1 + + pwm = log(pwm) + prior = log(prior) + + for(shift in 1:n_shifts) + { score_ = 0 + for(j in 1:l_pwm) + { base = sequence[shift+j-1] + score_ = score_ + pwm[base, j] - prior[base] + } + score = score + exp(score_) + } + return(score) +} + +#' Given a set of sequences and two motifs, this function +#' assigns to each sequence a class label. +#' The class label is determined by scoring a sequence +#' with both motifs. The best scoring motif determine +#' the class label. 
+#' @param sequences a matrix containing the sequences +#' in integer format. It has the following dimensions : +#' 1st the number of sequences +#' 2nd the sequence length +#' @param motif.1 a probability matrix containing the +#' 1st motif. It has the following dimensions : +#' 1st 4 for A, C, G, T +#' 2nd the number of positions +#' @param motif.2 a probability matrix containing the +#' 2st motif. It has the following dimensions : +#' 1st 4 for A, C, G, T +#' 2nd the number of positions +#' @return a list with 2 items +#' scores : a matrix containing the sequence scores. It +#' has the following dimensions : +#' 1st the number of sequences +#' 2nd 2 for each motif +#' labels : a vector containing the sequence labels. +#' @author Romain Groux +predict.class = function(sequences, motif.1, motif.2) +{ scores.1 = apply(sequences, 1, score.seq, motif.1) + scores.2 = apply(sequences, 1, score.seq, motif.2) + + labels = rep(1, nrow(sequences)) + + labels[which(scores.1 < scores.2)] = 2 + + return(list("scores"=cbind(scores.1, scores.2), "labels"=labels)) + +} + +#' Computes the x and y coordinates of the ROC curve +#' given the sequence scores and their corresponding +#' true class labels. +#' @param scores a matrix containing the scores for +#' each sequence (on the rows), for each class (on +#' the columns) +#' @param labels the true class labels for each +#' sequence. +#' @return a matrix with 2 columns containing the +#' x and y coordinates of the curve. The x coordinates +#' correspond to the specificity values and the y +#' coordinates the sensitivity values. 
+#' @author Romain Groux +get.roc.coord = function(scores, labels) +{ # number of points + n = length(labels) + + # order by descending label for class 1 and then class2 + ord = order(scores[,1], scores[,2], decreasing=T) + labels = labels[ord] + + idx1 = which(labels == 1) + idx2 = which(labels == 2) + + # true positive and negative discovery rates + true = vector(length=length(labels)) ; n_true = 0 ; + false = vector(length=length(labels)) ; n_false = 0 ; + + for(i in 1:length(labels)) + { if(labels[i] == 1) + { n_true = n_true + 1 } + else + { n_false = n_false + 1 } + true[i] = n_true / (0.5*n) # 0.5 because as many class 1 as 2 + false[i] = n_false / (0.5*n) + } + # to draw plot + m = cbind(true, false) + colnames(m) = c("y", "x") + return(m) +} + + + +# seed +set.seed(20191007) + +# path to the images for the logo +path.a = file.path("res/A.png") +path.c = file.path("res/C.png") +path.g = file.path("res/G.png") +path.t = file.path("res/T.png") + +# 2000 sequences classified +n.seq = 2000 +#number of time a classification was repeated +n.runs = 50 +# number of classes searches +n.classes = 2 +# the shifting freedom allowed +n.shifts = 90 + +# the expected dimensionality of the prob array to read +dim = c(n.seq, n.classes, n.shifts, 2) # 2000 seq, 2 classes, 90 shifts, 2 flips + +# where the data are +dir.data = file.path("data", + "toy_data") +# where the results are +dir.results = file.path("results", + "toy_data") + + + +# sequences +sequences = as.matrix(read.table(file.path(dir.data, + "simulated_sequences_2class_flip.mat"))) + 1 + +# true motifs +motif.true.1 = as.matrix(read.table(file.path(dir.data, + "simulated_sequences_2class_flip_motif1.txt"))) +motif.true.2 = as.matrix(read.table(file.path(dir.data, + "simulated_sequences_2class_flip_motif2.txt"))) +# true class labels +labels.true = as.matrix(read.table(file.path(dir.data, + "simulated_sequences_2class_flip_classes.txt")))[,1] + +# AUC sequences with true motifs +scores.true = 
predict.class(sequences, motif.true.1, motif.true.2) +labels = scores.true$labels +auc.true = auc(labels.true, labels) +scores.true = scores.true$scores + +# sequence scores with found motifs +scores.found = array(dim=c(n.runs, n.seq, 2), data=0) + +# AUCs +auc.found = rep(0, n.runs) + +# go over each run +for(i in 1:n.runs) +{ # get motifs found + file.motif = file.path(dir.results, + sprintf("simulated_sequences_2class_flip_class_model_%d.mat", + i)) + motifs.found = read.sequence.models(file.motif)$models + + # display logo + # par(mfrow=c(2,2)) + # plot.logo(motifs.found[,,1], path.a, path.c, path.g, path.t, main=sprintf("run %d class1", i)) + # plot.logo(motifs.found[,,2], path.a, path.c, path.g, path.t, main=sprintf("run %d class2", i)) + # plot.logo(motif.true.1, path.a, path.c, path.g, path.t, main=sprintf("true class1", i)) + # plot.logo(motif.true.2, path.a, path.c, path.g, path.t, main=sprintf("true class2", i)) + + # get AUC + # class 1 -> label 1, class 2 -> label2 + scores1 = predict.class(sequences, motifs.found[,,1], motifs.found[,,2]) + labels1 = scores1$labels + auc1 = auc(labels.true, labels1) + auc.found[i] = auc1 + scores.found[i,,] = scores1$scores + + # class 1 -> label 2, class 2 -> label1 + # NO NEED ! AUC value is the same. 
This is simply an anti-classifier :) + # scores2 = predict.class(sequences, motifs.found[,,2], motifs.found[,,1]) + # labels2 = scores2$labels + # auc2 = auc(labels.true, labels2) + # auc.found[i,2] = auc2 + # scores.found[i,,2,] = scores2$scores + + # coord1 = get.roc.coord(scores1$scores, labels.true) + # coord2 = get.roc.coord(scores2$scores, labels.true) + # coord.true = get.roc.coord(scores.true, labels.true) + + # plot(coord1[,"x"], coord1[,"y"], lwd=2, type='l') + # lines(coord2[,"x"], coord2[,"y"], lwd=2, col="blue") + # lines(coord.true[,"x"], coord.true[,"y"], lwd=2, col="red") + # segments(0,0,1,1) +} + + + +# # plot results +# X11(width=12, height=8) +# +# m = matrix(nrow=3, ncol=2, +# data=1:6, byrow=T) +# +# layout(m, heights=c(2,1,1)) +# +# # boxplot AUCs +# boxplot(auc.found, ylim=c(0,1), main="AUCs", ylab="AUC", +# cex.lab=2, cex.main=2) +# abline(h=auc.true, col="red", lwd=2, lty=2) +# +# # plot all ROC +# coord.true = get.roc.coord(scores.true, labels.true) +# # empty plot +# plot(seq(0,1,by=0.2), seq(0,1,by=0.2), col=0, +# main="ROCs", xlab="specificity", ylab="sensitivity", +# cex.lab=2, cex.main=2) +# for(i in 1:n.runs) +# { # compute true positive and negative discovery rates +# coord.found = get.roc.coord(scores.found[i,,], labels.true) +# # plot found motifs ROC +# lines(coord.found[,"x"], coord.found[,"y"], lwd=0.5) +# } +# # plot diagonale line +# segments(0,0,1,1, lwd=2, lty=2) +# # plot true motifs ROC +# lines(coord.true[,"x"], coord.true[,"y"], lwd=3, col="red") +# +# # true motif class 1 +# plot.logo(motif.true.1, path.a, path.c, path.g, path.t, +# main="True motif class 1", cex.main=2) +# # x-axis +# axis(1, at=1:ncol(motif.true.1), labels=1:ncol(motif.true.1)) +# # x-axis +# axis(2, at=0:2, labels=0:2) +# +# # true motif class 2 +# plot.logo(motif.true.2, path.a, path.c, path.g, path.t, +# main="True motif class 2", cex.main=2) +# # x-axis +# axis(1, at=1:ncol(motif.true.2), labels=1:ncol(motif.true.2)) +# # x-axis +# axis(2, 
at=0:2, labels=0:2) +# +# # best motif found +# idx.best = which.max(auc.found) +# file.motif.best = file.path(dir.results, +# sprintf("simulated_sequences_2class_flip_class_model_%d.mat", +# idx.best)) +# motif.found.best = read.sequence.models(file.motif.best)$models +# # best found motif class 1 +# plot.logo(motif.found.best[,,1], path.a, path.c, path.g, path.t, +# main="Best found motif class 1", cex.main=2) +# # x-axis +# axis(1, at=1:ncol(motif.found.best[,,1]), labels=1:ncol(motif.found.best[,,1])) +# # x-axis +# axis(2, at=0:2, labels=0:2) +# # best found motif class 1 +# plot.logo(motif.found.best[,,2], path.a, path.c, path.g, path.t, +# main="Best found motif class 2", cex.main=2) +# # x-axis +# axis(1, at=1:ncol(motif.found.best[,,2]), labels=1:ncol(motif.found.best[,,2])) +# # x-axis +# axis(2, at=0:2, labels=0:2) +# +# dev.off() + + +# plot results +png(filename=file.path(dir.results, + "simulated_sequences_2class_flip_auc_roc.png"), + units="in", res=720, width=12, height=6) +# X11(width=12, height=6) + + par(mfrow=c(1,2), + mar=c(5.1,5.1,4.1,2.1)) + + + # boxplot AUCs + boxplot(auc.found, ylim=c(0,1), main="AUC values", ylab="AUC", + cex.lab=2, cex.main=2) + abline(h=auc.true, col="red", lwd=2, lty=2) + + # plot all ROC + coord.true = get.roc.coord(scores.true, labels.true) + # empty plot + plot(seq(0,1,by=0.2), seq(0,1,by=0.2), col=0, + main="ROC curves", xlab="specificity", ylab="sensitivity", + cex.lab=2, cex.main=2) + for(i in 1:n.runs) + { # compute true positive and negative discovery rates + coord.found = get.roc.coord(scores.found[i,,], labels.true) + # plot found motifs ROC + lines(coord.found[,"x"], coord.found[,"y"], lwd=0.5) + } + # plot diagonale line + segments(0,0,1,1, lwd=2, lty=2) + # plot true motifs ROC + lines(coord.true[,"x"], coord.true[,"y"], lwd=3, col="red") +dev.off() + +png(filename=file.path(dir.results, + "simulated_sequences_2class_flip_best_motifs.png"), + units="in", res=720, width=7, height=8) +# X11(width=7, 
height=10) + # best motif found + idx.best = which.max(auc.found) + file.motif.best = file.path(dir.results, + sprintf("simulated_sequences_2class_flip_class_model_%d.mat", + idx.best)) + motif.found.best = read.sequence.models(file.motif.best)$models + + pfm.found.best = get_pfm_list(motif.found.best, "Motif found") + rownames(motif.true.1) = rownames(motif.true.2) = c('A', 'C', 'G', 'T') + pfm.true.1 = new("pfm", + mat=motif.true.1, + name="True motif class 1") + pfm.true.2 = new("pfm", + mat=motif.true.2, + name="True motif class 2") + pfm.found.best.1 = new("pfm", + mat=motif.found.best[,,1], + name="Found motif class 1") + pfm.found.best.2 = new("pfm", + mat=motif.found.best[,,2], + name="Found motif class 2") + + motifStack(c(pfm.true.1, + pfm.true.2, + pfm.found.best.1, + pfm.found.best.2), + layout="treeview") +dev.off() + + diff --git a/scripts/toy_data/analysis_data_sequence.sh b/scripts/toy_data/analysis_data_sequence.sh new file mode 100755 index 0000000..063886b --- /dev/null +++ b/scripts/toy_data/analysis_data_sequence.sh @@ -0,0 +1,33 @@ +# some paths +## directories +results_dir='results/toy_data' +seq_dir="data/toy_data" + +file_seq_1=$seq_dir'/simulated_sequences_1class_flip.mat' +file_seq_2=$seq_dir'/simulated_sequences_2class_flip.mat' + +file_seed_1=$results_dir'/simulated_sequences_1class_flip_seed.mat' +file_seed_2=$results_dir'/simulated_sequences_2class_flip_seed.mat' + +mkdir $results_dir + +touch $file_seed_1 +touch $file_seed_2 + +# parameters +n_iter='200' +n_shift='90' +n_core=20 + +for i in {1..50} +do + # 2 classes + seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo) + file_prob=$results_dir/'simulated_sequences_2class_flip_'$k'class_prob_'$i'.mat4d' + file_prob2=$results_dir/'simulated_sequences_2class_flip_'$k'class_prob_'$i'.txt' + file_mod=$results_dir/'simulated_sequences_2class_flip_'$k'class_model_'$i'.mat' + echo "$file_prob $seed" >> $file_seed_2 + bin/EMSequence --seq $file_seq_2 --class 2 --shift $n_shift 
--flip --iter $n_iter --seed $seed --thread $n_core --out $file_prob + bin/ProbToModel --seq $file_seq_2 --prob $file_prob --thread $n_core 1> $file_mod + bin/MatrixBinToTxt --file $file_prob --type double --ndim 4 > $file_prob2 +done diff --git a/scripts/toy_data/generate_matrix_data_sequence.R b/scripts/toy_data/generate_matrix_data_sequence.R index d34cbd7..8f98f3a 100644 --- a/scripts/toy_data/generate_matrix_data_sequence.R +++ b/scripts/toy_data/generate_matrix_data_sequence.R @@ -1,255 +1,245 @@ setwd(file.path("/", "local", "groux", "scATAC-seq")) # required librairies and functions library(abind) # functions #' Converts a vector of characters containing a DNA sequence #' into a vector of integers : A->1, C->2, G->3, T->4. Any #' non ACGT character triggers an error. #' \param sequence the DNA sequence stored as a vector of #' characters. #' \return a vector of integers. #' \author Romain Groux dna.to.int = function(sequence) { seq.len = length(sequence) seq.int = vector(length=seq.len, mode="numeric") for(i in 1:seq.len) { if(sequence[i] == "A") { seq.int[i] = 1 } else if(sequence[i] == "C") { seq.int[i] = 2 } else if(sequence[i] == "G") { seq.int[i] = 3 } else if(sequence[i] == "T") { seq.int[i] = 4} else { stop(sprintf("Error! Unrecognized character in DNA sequence at position %d : %s", i, sequence[i])) } } return(seq.int) } #' The complementary function to dna.to.int(). #' \param sequence the DNA stored as a vector of int : -#' A->1, C->2, G->3, T->4 +#' 1->A, 2->C, 3->G, 4->T #' \return a vector of characters. #' \author Romain Groux int.to.dna = function(sequence) { seq.len = length(sequence) seq.let = vector(length=seq.len, mode="character") for(i in 1:seq.len) { if(sequence[i] == 1) { seq.let[i] = "A" } else if(sequence[i] == 2) { seq.let[i] = "C" } else if(sequence[i] == 3) { seq.let[i] = "G" } else if(sequence[i] == 4) { seq.let[i] = "T"} else { stop(sprintf("Error! 
Unrecognized character in int sequence at position %d : %d", i, sequence[i])) } } return(seq.let) } simulate_data = function(n_seq, l_seq, classes, prob_bg, p_classes, p_flip) { - # the alphabet A->1, C->2, G->3, T->4 + # the alphabet A->0, C->1, G->2, T->3 alphabet = c(1, 2, 3, 4) l_alpha = length(alphabet) # binding site length l_bs = ncol(classes[,,1]) # number of classes n_class = dim(classes)[3] # checks if(length(p_classes )!= n_class) { stop(sprintf("Error! %d classes detected but %d class probability given!", n_class, length(p_classes))) } for(k in 1:n_class) { if((nrow(classes[,,k]) != 4) || (ncol(classes[,,k]) != l_bs)) { stop(sprintf("Error! Check the dimensions of class %d motif : %d / %d!", k, nrow(classes[,,k]), ncol(classes[,,k]))) } } # last position (comprised) for a binding site to begin and be entirely in the seq last_pos_bs = l_seq - l_bs + 1 # data structures sequences = matrix(data=0, n_seq, l_seq) # the sequences bs_starts = vector(length=n_seq, mode="numeric") # the starting positions of the binding site bs_flips = vector(length=n_seq, mode="numeric") # the orientation of the binding site bs_classes = vector(length=n_seq, mode="numeric") # the class from which the binding site was sampled bs_contents = matrix(data=0, nrow=n_seq, ncol=l_bs) # the binding site sequences bs_probs = array(data=0, dim=c(l_alpha, l_bs, n_class)) # the class binding site probability matrices for(i in 1:n_seq) { # sample from a uniform distribution where the binding site should start bs_starts[i] = sample(1:last_pos_bs, 1) # sample a class class = sample(1:n_class, 1, prob=p_classes) bs_classes[i] = class # sample a flip state (0->forward, 1->reverse) flip = rbinom(1, 1, prob=p_flip) bs_flips[i] = flip # to store the int seq seq = vector(length=l_seq, mode="numeric") seq_bs = vector(length=l_bs, mode="numeric") # over the sequence j = 1 while(j <= l_seq) { # binding site starts if(j == bs_starts[i]) { for(k in 0:(l_bs-1)) { # reverse strand if(flip) { base = 
sample(alphabet, 1, prob=rev(classes[,,class][,l_bs-k])) seq[j+k] = base seq_bs[k+1] = base bs_probs[l_alpha-base+1, l_bs-k, class] = bs_probs[l_alpha-base+1, l_bs-k, class] + 1 } # forward strand else { base = sample(alphabet, 1, prob=classes[,,class][,k+1]) seq[j+k] = base seq_bs[k+1] = base bs_probs[base, k+1, class] = bs_probs[base, k+1, class] + 1 } } j = j + k # this is background sequence } else { seq[j] = sample(alphabet, 1, prob=prob_bg) } j = j + 1 } - sequences[i,] = int.to.dna(seq) - bs_contents[i,] = int.to.dna(seq_bs) + sequences[i,] = seq + bs_contents[i,] = seq_bs } # normalize for(i in 1:n_class) { bs_probs[,,i] = bs_probs[,,i] / colSums(bs_probs[,,i]) } return(list(sequences=sequences, sites=bs_contents, motifs=bs_probs, starts=bs_starts, flips=bs_flips, classes=bs_classes)) } # some general parameters -n_seq = 10 # number of sequences -l_seq = 10 # length of sequences +n_seq = 2000 # number of sequences +l_seq = 100 # length of sequences # the base probabilities inside the binding site (A,C,G,T) -motif_class1 = matrix(data=c(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, - 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, - 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0), +motif_class1 = matrix(data=c(0.9, .033, .033, 0.1, 0.1, .033, .033, 0.9, + .033, 0.9, .033, 0.1, 0.1, .033, 0.9, .033, + .033, .033, 0.9, 0.1, 0.1, 0.9, .033, .033, + .033, .033, .033, 0.7, 0.7, .033, .033, .033), nrow=4, ncol=8, byrow=T) -motif_class2 = matrix(data=c(0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, - 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0), +motif_class2 = matrix(data=c(.033, .033, .033, 0.7, 0.7, .033, .033, .033, + 0.9, .033, 0.9, 0.1, 0.1, .033, .033, .033, + .033, 0.9, .033, 0.1, 0.1, .033, .033, .033, + .033, .033, .033, 0.1, 0.1, 0.9, 0.9, 0.9), nrow=4, ncol=8, byrow=T) dir.create(file.path("data", "toy_data"), showWarnings=FALSE) -# 
------------------------------------------- 1 classes with 1 motif/sequence, no flip, a really toy exemple ------------------------------------------- - -data = matrix(nrow=4, ncol=10, byrow=T, - data=c('T', 'T', 'C', 'C', 'T', 'T', 'A', 'G', 'C', 'T', - 'T', 'T', 'C', 'C', 'T', 'T', 'G', 'C', 'T', 'A', - 'T', 'T', 'C', 'C', 'T', 'T', 'C', 'T', 'A', 'G', - 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'A', 'G', 'C')) -write.table(data, file=file.path("data", "toy_data", "simulated_sequences_toy.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') -rm(data) - # -------------------------------------------------------- 1 classes with 1 motif/sequence, no flip ----------------------------------------------------- # seed set.seed(20190715) # the class binding site motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1)) # the class probability p_classes = c(1) # the probability of having a binding site on the reverse strand p_flip = 0 # the base probabilities outside the binding site (A,C,G,T) prob_bg = rep(0.25, 4) # simulate the data data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip) # save -write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') -write.table(motif_class1, file=file.path("data", "toy_data", "simulated_sequences_motif.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') -write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') +write.table(data$sequences-1, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') +write.table(motif_class1, file=file.path("data", "toy_data", "simulated_sequences_motif.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') +write.table(data$sites-1, file=file.path("data", "toy_data", 
"simulated_sequences_1class_noflip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_1class_noflip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') # clean rm(motif_classes, p_classes, p_flip, prob_bg, data) # -------------------------------------------------------- 1 classes with 1 motif/sequence, flip -------------------------------------------------------- # seed set.seed(201803142) # the class binding site motif_classes = array(data=motif_class1, dim=c(dim(motif_class1), 1)) # the class probability p_classes = c(1) # the probability of having a binding site on the reverse strand p_flip = 0.5 # the base probabilities outside the binding site (A,C,G,T) prob_bg = rep(0.25, 4) # simulate the data data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip) # save -write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_1class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') -write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') +write.table(data$sequences-1, file=file.path("data", "toy_data", "simulated_sequences_1class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') +write.table(data$sites-1, file=file.path("data", "toy_data", 
"simulated_sequences_1class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_1class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_1class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') # clean rm(motif_classes, p_classes, p_flip, prob_bg, data) -# -------------------------------------------------------- 2 classes with 1 bs/sequence, flip -------------------------------------------------------- +# -------------------------------------------------------- 2 classes with 1 motif /sequence, flip -------------------------------------------------------- # seed set.seed(201803143) # the class binding site motif_classes = abind(motif_class1, motif_class2, along=3) # the class probability p_classes = c(0.5, 0.5) # the probability of having a binding site on the reverse strand p_flip = 0.5 # the base probabilities outside the binding site (A,C,G,T) prob_bg = rep(0.25, 4) # simulate the data data = simulate_data(n_seq, l_seq, motif_classes, prob_bg, p_classes, p_flip) # save -write.table(data$sequences, file=file.path("data", "toy_data", "simulated_sequences_2class_flip.mat"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') -write.table(data$sites, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') +write.table(data$sequences-1, file=file.path("data", "toy_data", "simulated_sequences_2class_flip.mat"), quote=F, 
row.names=F, col.names=F, sep='\t', eol='\n') +write.table(data$sites-1 , file=file.path("data", "toy_data", "simulated_sequences_2class_flip_contents.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$starts, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_starts.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,1], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif1.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$motifs[,,2], file=file.path("data", "toy_data", "simulated_sequences_2class_flip_motif2.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$flips, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_flips.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') write.table(data$classes, file=file.path("data", "toy_data", "simulated_sequences_2class_flip_classes.txt"), quote=F, row.names=F, col.names=F, sep='\t', eol='\n') # clean rm(motif_classes, p_classes, p_flip, prob_bg, data) diff --git a/src/Applications/.ProbToModelApplication.cpp.swp b/src/Applications/.ProbToModelApplication.cpp.swp deleted file mode 100644 index ace6708..0000000 Binary files a/src/Applications/.ProbToModelApplication.cpp.swp and /dev/null differ diff --git a/src/Applications/EMConsensusSequenceApplication.cpp b/src/Applications/EMConsensusSequenceApplication.cpp index 391036a..b18601b 100644 --- a/src/Applications/EMConsensusSequenceApplication.cpp +++ b/src/Applications/EMConsensusSequenceApplication.cpp @@ -1,271 +1,272 @@ #include #include #include #include #include #include // std::move() #include // std::invalid_argument #include #include #include #include #include // kmer::compute_kmer_pvalue() #include // order() namespace po = boost::program_options ; EMConsensusSequenceApplication::EMConsensusSequenceApplication(int argn, char** argv) : file_consseq(""), file_filter(""), file_out(""), 
n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMConsensusSequenceApplication::run() { if(this->runnable) { EMConsensusSequence* em(nullptr) ; // row filter std::vector filter ; if(this->file_filter != "") { // it is a column vector, easier to use the Matrix2D interface // to read it rather than coding a function for :) filter = Matrix2D(this->file_filter).get_data() ; std::sort(filter.begin(), filter.end()) ; } // data Matrix3D data ; data.load(this->file_consseq) ; // filter out some rows if needed if(filter.size()) { data = filter_rows(filter, data) ; } // seeds motifs randomly if(this->seed != "") { em = new EMConsensusSequence(std::move(data), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } // seeds from enriched kmers else { size_t model_ncol = data.get_dim()[1] - this->n_shift + 1 ; Matrix3D model = this->init_model_kmer(model_ncol, data) ; em = new EMConsensusSequence(std::move(data), std::move(model), this->n_iter, this->flip, + this->bckg_class, this->n_threads) ; } // classify em->classify() ; em->get_post_prob().save(this->file_out) ; // clean delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMConsensusSequenceApplication::parseOptions(int argn, char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMConsensusSequence is a probabilistic partitioning algorithm that \n" "sofetly assigns consensus sequences to classes given their motif\n" "content.\n" "The assignment probabilities are written in binary format as a 4D " "matrix.\n\n" ; std::string opt_help_msg = "Produces this help message." 
; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_consseq_msg = "The path to the file containing the consensus sequences" ; std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" "indices of rows to filter out in the data." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n" "the data. By default, shifting is disabled (equivalent to\n" "--shift 1)." ; std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_bckg_msg = "Adds a class to model the sequence background. This class\n" "contains the sequence background probabilities at each position\n" "and is never updated." 
; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("consseq", po::value(&(this->file_consseq)), opt_consseq_msg.c_str()) ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_consseq == "" and (not help)) { std::string msg("Error! No data were given (--seq)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! 
No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } Matrix3D EMConsensusSequenceApplication::init_model_kmer(size_t l_model, const Matrix3D& data) const { // leave space for 2N's on each side size_t l_kmer = l_model ; size_t n_n = 0 ; // so far, 0 N's added if(l_model > 4) { n_n = 2 ; // 2 N's on each side l_kmer -= (2*n_n) ; } // compute the pvalue associated to each kmer auto kmers_pvalues = kmers::compute_kmer_pvalue(data, l_kmer) ; // sort kmers by ascending pvalue std::vector index = order(kmers_pvalues.second, true) ; // get most significant std::vector kmers(this->n_class) ; for(size_t i=0; in_class; i++) { size_t idx = index[i] ; kmers[i] = kmers_pvalues.first[idx] ; std::cerr << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ; } // turn to motifs double p_base = 0.7 ; // the prob of the base matching these of the kmer double p_nbase = 0.1 ; // the prob of the bases not matching these of the kmer double p_n = 0.25 ; // the prob of N // only N's for now Matrix3D model(this->n_class, l_model, 4, p_n) ; for(size_t i=0; i #include #include #include #include // std::move() #include // std::invalid_argument #include #include // boost::split() #include #include #include // filter() #include // kmer::compute_kmer_pvalue() #include // order() namespace po = boost::program_options ; EMSequenceApplication::EMSequenceApplication(int argn, char** argv) 
: file_seq(""), files_motif(""), file_filter(""), file_out(""), n_class(0), n_iter(0), n_shift(0), flip(false), bckg_class(false), n_threads(0), seed(""), runnable(true) { // parse command line options and set the fields this->parseOptions(argn, argv) ; } int EMSequenceApplication::run() { if(this->runnable) { EMSequence* em(nullptr) ; // data Matrix2D data(this->file_seq) ; // filter out some rows if needed std::vector filter ; if(this->file_filter != "") { // it is a column vector, easier to use the Matrix2D interface // to read it rather than coding a function for :) filter = Matrix2D(this->file_filter).get_data() ; std::sort(filter.begin(), filter.end()) ; data = filter_rows(filter, data) ; } // seeds motifs randomly if(this->files_motif == "" and this->seed != "") { em = new EMSequence(std::move(data), this->n_class, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; } // seeds motifs with the given matrices else if(this->files_motif != "") { // model std::vector motif_paths ; boost::split(motif_paths, this->files_motif, [](char c){return c == ',';}) ; // this->n_class = motif_paths.size() + this->bckg_class ; size_t model_ncol = data.get_ncol() - this->n_shift + 1 ; // add the given motif, random motifs (if needed) and // background class (if needed) Matrix3D model = this->init_model(model_ncol, data, motif_paths) ; em = new EMSequence(std::move(data), std::move(model), this->n_iter, - this->flip, + this->bckg_class, this->n_threads) ; } // seeds from enriched kmers else { size_t model_ncol = data.get_ncol() - this->n_shift + 1 ; Matrix3D model = this->init_model_kmer(model_ncol, data) ; em = new EMSequence(std::move(data), std::move(model), this->n_iter, this->flip, + this->bckg_class, this->n_threads) ; } // classify em->classify() ; em->get_post_prob().save(this->file_out) ; // clean delete em ; em = nullptr ; return EXIT_SUCCESS ; } else { return EXIT_FAILURE ; } } void EMSequenceApplication::parseOptions(int argn, 
char** argv) { // no option to parse if(argv == nullptr) { std::string message = "no options to parse!" ; throw std::invalid_argument(message) ; } // help messages std::string desc_msg = "\n" "EMSequence is a probabilistic partitioning algorithm that \n" "sofetly assigns sequences to classes given their motif content.\n" "The assignment probabilities are written in binary format as a 4D " "matrix.\n\n" ; std::string opt_help_msg = "Produces this help message." ; std::string opt_thread_msg = "The number of threads dedicated to parallelize the computations,\n " "by default 0 (no parallelization)." ; std::string opt_seq_msg = "The path to the file containing the sequences" ; std::string opt_motifs_msg = "A coma separated list of path to files containing the initial motifs\n" "values. The motifs should be probability matrices in horizontal format.\n" "If the motifs are too short after accounting for shifting, extra\n" "columns with uniform probabilities will be added on each side. The\n" "given number of classes (--class) should at least be the number of\n" "initial motifs. If the number of classes is bigger than the number of" "given motifs, the remaining classes are initialised randomly\n." ; std::string opt_filter_msg = "Optional. The path to a single column text file containing the 0-based\n" "indices of rows to filter out in the data." ; std::string opt_file_out_msg = "A path to a file in which the assignment probabilities will be saved\n" "in binary format." ; std::string opt_iter_msg = "The number of iterations." ; std::string opt_class_msg = "The number of classes to find." ; std::string opt_shift_msg = "Enables this number of column of shifting freedom to realign\n" "the data. By default, shifting is disabled (equivalent to\n" "--shift 1)." ; std::string opt_flip_msg = "Enables flipping to realign the data."; std::string opt_bckg_msg = "Adds a class to model the sequence background. 
This class\n" "contains the sequence background probabilities at each position\n" "and is never updated." ; std::string opt_seed_msg = "A value to seed the random number generator."; // option parser boost::program_options::variables_map vm ; boost::program_options::options_description desc(desc_msg) ; std::string seeding_tmp ; desc.add_options() ("help,h", opt_help_msg.c_str()) ("seq", po::value(&(this->file_seq)), opt_seq_msg.c_str()) ("motifs", po::value(&(this->files_motif)), opt_motifs_msg.c_str()) ("filter", po::value(&(this->file_filter)), opt_filter_msg.c_str()) ("out", po::value(&(this->file_out)), opt_file_out_msg.c_str()) ("iter,i", po::value(&(this->n_iter)), opt_iter_msg.c_str()) ("class,c", po::value(&(this->n_class)), opt_class_msg.c_str()) ("shift,s", po::value(&(this->n_shift)), opt_shift_msg.c_str()) ("flip", opt_flip_msg.c_str()) ("bgclass", opt_bckg_msg.c_str()) ("seed", po::value(&(this->seed)), opt_seed_msg.c_str()) ("thread", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ; // parse try { po::store(po::parse_command_line(argn, argv, desc), vm) ; po::notify(vm) ; } catch(std::invalid_argument& e) { std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ; throw std::invalid_argument(msg) ; } catch(...) { throw std::invalid_argument("An unknown error occured while parsing the options") ; } bool help = vm.count("help") ; // checks unproper option settings if(this->file_seq == "" and (not help)) { std::string msg("Error! No data were given (--seq)!") ; throw std::invalid_argument(msg) ; } if(this->file_out == "" and (not help)) { std::string msg("Error! 
No output file given (--out)!") ; throw std::invalid_argument(msg) ; } // no iter given -> 1 iter if(this->n_iter == 0) { this->n_iter = 1 ; } // no shift class given -> 1 class if(this->n_class == 0) { this->n_class = 1 ; } // no shift given, value of 1 -> no shift if(this->n_shift == 0) { this->n_shift = 1 ; } // set flip if(vm.count("flip")) { this->flip = true ; } // set background class if(vm.count("bgclass")) { this->bckg_class = true ; } // help invoked, run() cannot be invoked if(help) { std::cout << desc << std::endl ; this->runnable = false ; return ; } // everything fine, run() can be called else { this->runnable = true ; return ; } } Matrix3D EMSequenceApplication::init_model(size_t l_model, const Matrix2D& data, const std::vector& motif_paths) const { int n_class_given = motif_paths.size() ; int n_class_bckg = this->bckg_class ; int n_class_rand = this->n_class - n_class_given - n_class_bckg ; // number of classes should at least be number of motifs if(n_class_given > (int)this->n_class) { char msg[4096] ; sprintf(msg, "Error! number of class given (--class %zu) should at " "least be equal to number of motifs (--motifs %d)", this->n_class, n_class_given) ; throw std::invalid_argument(msg) ; } // check if there is room for a background class if((int)this->n_class < n_class_given+this->bckg_class) { char msg[4096] ; sprintf(msg, "Error! no class left to add a background " "class (--bgclass) with the given motifs (--motifs) (--class %zu)", this->n_class) ; throw std::invalid_argument(msg) ; } // init empty model Matrix3D model(this->n_class, l_model, 4, 0.25) ; // add given motifs for(size_t i=0; i matrix(motif_paths[i]) ; // motif is too big for this shift if(matrix.get_ncol() > l_model) { char msg[4096] ; sprintf(msg, "Error! 
In %s, motif column number is bigger " "than data column number - shift + 1 " "(%zu > %zu - %zu + 1)", motif_paths[i].c_str(), matrix.get_ncol(), data.get_ncol(), this->n_shift) ; throw std::invalid_argument(msg) ; } // insert motif in middle of matrix else { // size_t j_model = this->n_shift / 2 ; size_t j_model = (l_model - matrix.get_ncol()) / 2 ; for(size_t j_mat=0, j_mod=j_model; j_mat 0) { // initialise randomly EMSequence em(data, n_class_rand, this->n_iter, this->n_shift, this->flip, this->bckg_class, this->seed, this->n_threads) ; Matrix3D model_rand = em.get_sequence_models() ; // copy them into model for(int i_rand=0, i_mod=n_class_given; i_rand EMSequenceApplication::init_model_kmer(size_t l_model, const Matrix2D& data) const { // leave space for 2N's on each side size_t l_kmer = l_model ; size_t n_n = 0 ; // so far, 0 N's added if(l_model > 4) { n_n = 2 ; // 2 N's on each side l_kmer -= (2*n_n) ; } // compute the pvalue associated to each kmer auto kmers_pvalues = kmers::compute_kmer_pvalue(data, l_kmer) ; // sort kmers by ascending pvalue std::vector index = order(kmers_pvalues.second, true) ; // get most significant std::vector kmers(this->n_class) ; for(size_t i=0; in_class; i++) { size_t idx = index[i] ; kmers[i] = kmers_pvalues.first[idx] ; std::cerr << kmers_pvalues.first[idx] << " " << kmers_pvalues.second[idx] << std::endl ; } // turn to motifs double p_base = 0.7 ; // the prob of the base matching these of the kmer double p_nbase = 0.1 ; // the prob of the bases not matching these of the kmer double p_n = 0.25 ; // the prob of N // only N's for now Matrix3D model(this->n_class, l_model, 4, p_n) ; for(size_t i=0; i #include #include // std::invalid_argument #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // std::iota() #include // std::mt19937 #include #include #include #include // beta_distribution() #include // rand_string() #include // getRandomNumberGenerator() #include 
// sd(), normal_pmf() EMBase::EMBase(size_t n_row, size_t n_col, size_t n_class, size_t n_iter, size_t n_shift, bool flip, size_t n_threads=0) : n_row(n_row), n_col(n_col), n_class(n_class), n_shift(n_shift), flip(flip), n_flip(flip+1), n_iter(n_iter), l_model(n_col - n_shift + 1), loglikelihood(n_row, n_class, n_shift, n_flip, 0.), post_prob(n_row, n_class, n_shift, n_flip, 0.), post_state_prob(n_class, n_shift, n_flip, 0.), post_class_prob(n_class, 0.), post_prob_rowsum(n_row, 0.), post_prob_colsum(n_class, 0.), post_prob_tot(0.), threads(nullptr) -{ // check n_shift value +{ + // check n_shift value if(this->n_col < this->n_shift) { char msg[4096] ; sprintf(msg, "Error! Shift is bigger than data column number " "(%zu / %zu)!", this->n_shift, this->n_col) ; throw std::invalid_argument(msg) ; } /* // data structures this->loglikelihood = Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) ; this->post_prob = Matrix4D(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) ; this->post_state_prob = Matrix3D(this->n_class, this->n_shift, this->n_flip, 0.) 
; this->post_class_prob = vector_d(this->n_class, 0) ; this->post_prob_rowsum = vector_d(this->n_row, 0) ; this->post_prob_colsum = vector_d(this->n_class, 0) ; this->post_prob_tot = 0 ; */ // threads if(n_threads) { this->threads = new ThreadPool(n_threads) ; } } EMBase::~EMBase() { // threads if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } Matrix4D EMBase::get_post_prob() const { return this->post_prob ; } vector_d EMBase::get_post_class_prob() const { return this->post_class_prob ; } void EMBase::set_state_prob_uniform() { double sum = this->n_class * this->n_shift * this->n_flip ; for(size_t i=0; in_class; i++) { for(size_t j=0; jn_shift; j++) { for(size_t k=0; kn_flip; k++) { this->post_state_prob(i,j,k) = 1./sum ; } } } } void EMBase::set_post_prob_random(const std::string& seed) { // set random number generator // will be used to generate thread private seeds getRandomGenerator(seed) ; // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->set_post_prob_random_routine(0, this->n_row, seed, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; // private seeds std::vector private_seeds(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMBase::set_post_prob_random_routine, this, slice.first, slice.second, private_seeds[i], std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } // compute class and state probs this->compute_class_prob() ; } void EMBase::set_post_prob_random_routine(size_t from, size_t to, const std::string& seed, std::promise& post_prob_colsum) { // random number generator std::mt19937 generator ; std::seed_seq seed_sequence(seed.begin(),seed.end()) ; generator.seed(seed_sequence) ; // this->post_prob_tot = 0. ; // this->post_prob_colsum = vector_d(this->n_class, 0.) ; vector_d colsums = vector_d(this->n_class, 0.) 
; vector_d rowsums(this->n_row, 0) ; // random sampling beta_distribution beta(1, this->n_row) ; for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { double p = beta(generator) ; this->post_prob(i,j,k,l) = p ; rowsums[i] += p ; } } } } // normalization for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { double p = this->post_prob(i,j,k,l) / rowsums[i] ; this->post_prob(i,j,k,l) = p ; // this->post_prob_tot += p ; // this->post_prob_colsum[j] += p ; colsums[j] += p ; } } } } // compute class and state probs // this->compute_class_prob() ; post_prob_colsum.set_value(colsums) ; } void EMBase::compute_class_prob() { for(size_t n_class=0; n_classn_class; n_class++) { // reset total this->post_class_prob[n_class] = 0. ; for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t flip=0; flipn_flip; flip++) { // sum this->post_state_prob(n_class,n_shift,flip) = 0. ; for(size_t i=0; in_row; i++) { this->post_state_prob(n_class,n_shift,flip) += this->post_prob(i,n_class,n_shift,flip) ; } // normalize this->post_state_prob(n_class,n_shift,flip) /= this->post_prob_tot ; this->post_class_prob[n_class] += this->post_state_prob(n_class,n_shift,flip) ; } } } } void EMBase::center_post_state_prob() { if(this->n_shift == 1) { return ; } // the possible shift states vector_d shifts(this->n_shift) ; std::iota(shifts.begin(), shifts.end(), 1.) ; // the shift probabilities and the class probabilies // (no need to norm., class_prob sums to 1) double shifts_prob_measured_tot = 0. 
; vector_d shifts_prob_measured(this->n_shift) ; for(size_t s=0; sn_shift; s++) { for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { shifts_prob_measured[s] += this->post_state_prob(k,s,f) ; shifts_prob_measured_tot += this->post_state_prob(k,s,f) ; } } } // the shift mean and (biased) standard deviation double shifts_sd = sd(shifts, shifts_prob_measured, false) ; // the shift probabilities under the assumption that is // distributed as a gaussian centered on // the central shift state with sd and mean as in the data // sd as the data vector_d shifts_prob_centered(shifts.size(), 0.) ; double shifts_prob_centered_tot = 0. ; for(size_t i=0; in_shift/2)+1, shifts_sd) ; shifts_prob_centered_tot += shifts_prob_centered[i] ; } for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { for(size_t s=0; sn_shift; s++) { this->post_state_prob(k,s,f) = this->post_class_prob[k] * shifts_prob_centered[s] / (this->n_flip * shifts_prob_centered_tot) ; } } } // shifts_prob_measured_tot = 0. 
; shifts_prob_measured.clear() ; shifts_prob_measured.resize(this->n_shift) ; for(size_t s=0; sn_shift; s++) { for(size_t k=0; kn_class; k++) { for(size_t f=0; fn_flip; f++) { shifts_prob_measured[s] += this->post_state_prob(k,s,f) ; } } } } diff --git a/src/Clustering/EMConsensusSequence.cpp b/src/Clustering/EMConsensusSequence.cpp index ea97f85..268cec2 100644 --- a/src/Clustering/EMConsensusSequence.cpp +++ b/src/Clustering/EMConsensusSequence.cpp @@ -1,355 +1,353 @@ #include #include #include #include // std::promise, std::future #include // std::pair, std::move() #include // std::bind(), std::ref() #include // SequenceLayer #include // getRandomNumberGenerator() #include // ConsoleProgressBar #include // ThreadPool #include // dna::base_composition() EMConsensusSequence::EMConsensusSequence(const Matrix3D& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_dim()[0], seq_matrix.get_dim()[1], n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), cseq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) ; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->cseq_layer = new ConsensusSequenceLayer(seq_matrix, this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->cseq_layer->update_model(this->post_prob, this->threads) ; } EMConsensusSequence::EMConsensusSequence(Matrix3D&& seq_matrix, size_t n_class, size_t n_iter, size_t n_shift, bool flip, bool bckg_class, const std::string& seed, size_t n_threads) : EMBase(seq_matrix.get_dim()[0], seq_matrix.get_dim()[1], n_class, n_iter, n_shift, flip, n_threads), loglikelihood_max(n_row, 0.), cseq_layer(nullptr) { this->loglikelihood_max = vector_d(n_row, 0.) 
; // initialise post prob randomly // getRandomGenerator(seed) ; this->set_post_prob_random(seed) ; // data and models this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix), this->n_class, this->n_shift, this->flip, bckg_class) ; // intialise the models with the post prob this->cseq_layer->update_model(this->post_prob, this->threads) ; } EMConsensusSequence::EMConsensusSequence(const Matrix3D& seq_matrix, const Matrix3D& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_dim()[0], seq_matrix.get_dim()[1], motifs.get_dim()[0], n_iter, seq_matrix.get_dim()[1] - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), cseq_layer(nullptr) { - this->loglikelihood_max = vector_d(n_row, 0.) ; // data and models // background motif (if any) is the last of the given motifs this->cseq_layer = new ConsensusSequenceLayer(seq_matrix, motifs, this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMConsensusSequence::EMConsensusSequence(Matrix3D&& seq_matrix, Matrix3D&& motifs, size_t n_iter, bool flip, bool bckg_class, size_t n_threads) : EMBase(seq_matrix.get_dim()[0], seq_matrix.get_dim()[1], motifs.get_dim()[0], n_iter, seq_matrix.get_dim()[1] - motifs.get_dim()[1] + 1, flip, n_threads), loglikelihood_max(n_row, 0.), cseq_layer(nullptr) { - this->loglikelihood_max = vector_d(n_row, 0.) 
; // data and models // background motif (if any) is the last of the given motifs this->cseq_layer = new ConsensusSequenceLayer(std::move(seq_matrix), std::move(motifs), this->flip, bckg_class) ; // intialise the class prob uniformly this->set_state_prob_uniform() ; } EMConsensusSequence::~EMConsensusSequence() { if(this->cseq_layer != nullptr) { delete this->cseq_layer ; this->cseq_layer = nullptr ; } if(this->threads != nullptr) { this->threads->join() ; delete this->threads ; this->threads = nullptr ; } } Matrix3D EMConsensusSequence::get_sequence_models() const { return this->cseq_layer->get_model() ; } EMConsensusSequence::exit_codes EMConsensusSequence::classify() { size_t bar_update_n = this->n_iter ; ConsoleProgressBar bar(std::cerr, bar_update_n, 60, "classifying") ; // optimize the partition for(size_t n_iter=0; n_itern_iter; n_iter++) { // E-step this->compute_loglikelihood() ; this->compute_post_prob() ; // M-step this->compute_class_prob() ; this->update_models() ; this->center_post_state_prob() ; bar.update() ; } bar.update() ; std::cerr << std::endl ; return EMConsensusSequence::exit_codes::ITER_MAX ; } void EMConsensusSequence::compute_loglikelihood() { // compute the loglikelihood this->cseq_layer->compute_loglikelihoods(this->loglikelihood, this->loglikelihood_max, this->threads) ; // rescale the values // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_loglikelihood_routine(0, this->n_row, promise) ; future.get() ; } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMConsensusSequence::compute_loglikelihood_routine, this, slice.first, slice.second, 
std::ref(promises[i])))) ; } // wait until all threads are done working for(auto& future : futures) { future.get() ; } // -------------------------- threads stop --------------------------- } } void EMConsensusSequence::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done) { // rescale the values for(size_t i=from; in_class; j++) { for(size_t k=0; kn_shift; k++) { for(size_t l=0; ln_flip; l++) { this->loglikelihood(i,j,k,l) = std::max(this->loglikelihood(i,j,k,l) - this->loglikelihood_max[i], ConsensusSequenceLayer::p_min_log) ; } } } } done.set_value(true) ; } void EMConsensusSequence::compute_post_prob() { // don't parallelize if(this->threads == nullptr) { std::promise promise ; std::future future = promise.get_future() ; this->compute_post_prob_routine(0, this->n_row, promise) ; // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. ; this->post_prob_colsum = future.get() ; for(const auto& prob : this->post_prob_colsum) { this->post_prob_tot += prob ; } } // parallelize else { size_t n_threads = this->threads->getNThread() ; // compute the slices on which each thread will work std::vector> slices = ThreadPool::split_range(0, this->n_row,n_threads) ; // get promises and futures // the function run by the threads will compute // the partial sum per class of post_prob for the given slice // this should be used to compute the complete sum of post_prob // and the complete sum per class of post_prob std::vector> promises(n_threads) ; std::vector> futures(n_threads) ; for(size_t i=0; ithreads->addJob(std::move( std::bind(&EMConsensusSequence::compute_post_prob_routine, this, slice.first, slice.second, std::ref(promises[i])))) ; } // wait until all threads are done working // compute the sum of post prob and the per class sum of post prob // from the partial results computed on each slice this->post_prob_tot = 0. 
; this->post_prob_colsum = vector_d(this->n_class, 0.) ; for(auto& future : futures) { auto probs = future.get() ; for(size_t i=0; in_class; i++) { double prob = probs[i] ; this->post_prob_colsum[i] += prob ; this->post_prob_tot += prob ; } } // -------------------------- threads stop --------------------------- } } void EMConsensusSequence::compute_post_prob_routine(size_t from, size_t to, std::promise& post_prob_colsum) { vector_d colsums(this->n_class, 0.) ; // reset grand total // this->post_prob_tot = 0 ; // this->post_prob_colsum = vector_d(n_class, 0) ; // post prob for(size_t i=from; ipost_prob_rowsum[i] = 0. ; for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = exp(this->loglikelihood(i,n_class,n_shift,n_flip)) * this->post_state_prob(n_class,n_shift,n_flip) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; this->post_prob_rowsum[i] += p ; } } } // normalize for(size_t n_class=0; n_classn_class; n_class++) { for(size_t n_shift=0; n_shiftn_shift; n_shift++) { for(size_t n_flip=0; n_flipn_flip; n_flip++) { double p = std::max(this->post_prob(i,n_class,n_shift,n_flip) / this->post_prob_rowsum[i], ConsensusSequenceLayer::p_min) ; this->post_prob(i,n_class,n_shift,n_flip) = p ; colsums[n_class] += p ; } } } } post_prob_colsum.set_value(colsums) ; } void EMConsensusSequence::update_models() { this->cseq_layer->update_model(this->post_prob, this->threads) ; }