# REDO THE FIGURES FROM THE ARTICLE BUT IN PNG FORMAT INSTEAD OF PDF, WITH LOWER RESOLUTION # IT IS AN ADAPTED COPY/PASTE FROM /local/groux/Kmeans_chipseq/bin/article/figures.R setwd(file.path("", "local", "groux", "Kmeans_chipseq")) library(RColorBrewer) library(plotrix) # ===================================================== Supplemental Figure 1 ===================================================== # plot the class profiles of the simulated classes and one example of a dataset with low noise, one example with high noise # and their two best partitions dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark") source(file.path("res", "functions_utility.R")) source(file.path("res", "functions_plot.R")) # class densities # general parameter n_samples = 1000 n_col = 2001 # the length of a signal vector shift_max = 100 # the maximum possible shift p_flip = 0.3 # the prob of having a flipped signal # class 1 : a simple gaussian class1_n = 600 class1_m = ceiling(n_col/2) - ceiling(shift_max/2) # class 1 mean, mean will be in average in the middle of the data vector class1_s = 40 # class 1 sd # the signal shape shape1 = dnorm(1:(n_col-shift_max+1), class1_m, class1_s) # class 2 : half a gaussian class2_n = n_samples - class1_n class2_m = floor(n_col/2) - floor(shift_max/2) # class 2 mean, mean will be in average in the middle of the data vector class2_s = 40 # class 2 sd # the signal shape shape2 = dnorm(1:(n_col-shift_max+1), class2_m, class2_s) shape2[class2_m:length(shape2)] = min(shape2) # class 3 : a uniform class3_n = 333 class3_from = floor(n_col/2) - floor(shift_max/2) -120 # class 3 from, mean will be in average in the middle of the data vector class3_to = floor(n_col/2) - floor(shift_max/2) +120 # class 3 to, mean will be in average in the middle of the data vector # the signal shape shape3 = dunif(1:(n_col-shift_max+1), class3_from, class3_to) # normalize shape1 = shape1 / sum(shape1) shape2 = shape2 / sum(shape2) shape2 = shape2 / sum(shape2) # two datasets and an example of partitioning using SPar-K labels = as.matrix(read.table("data/simulated_data_chipseq/simulated_data_3_class_asym_classes_cov100_noise0.0.txt"))[,1] # coverage 100, noise 0 data.100.0 = as.matrix(read.table(file.path("data", "simulated_data_chipseq", "simulated_data_3_class_asym_cov100_noise0.0.txt"))) ari = read.RDS(file.path("results", "simulated_data_chipseq", "app", "simulated_data_chipseq_3_class_asym_ari_newkmean.RDS")) best = which.max(ari$`kmean++`$nooutlier$`cov 100`$`noise 0.0`$`3 cluster`) data.100.0.part = read.table(file.path("results", "simulated_data_chipseq", "app", "seeding_kmean++", sprintf("simulated_data_3_class_asym_cov100_noise0.0_3cluster_flip_normcorr_%d.txt", best)), header=T) data.100.0.part = realign.data(data.100.0, data.100.0.part$shift_ref, data.100.0.part$shift_dat, data.100.0.part$flip, 71) # coverage 100, noise 90 data.100.9 = as.matrix(read.table(file.path("data", "simulated_data_chipseq", "simulated_data_3_class_asym_cov100_noise0.9.txt"))) best = which.max(ari$`kmean++`$nooutlier$`cov 100`$`noise 0.9`$`3 cluster`) data.100.9.part = read.table(file.path("results", "simulated_data_chipseq", "app", "seeding_kmean++", sprintf("simulated_data_3_class_asym_cov100_noise0.9_3cluster_flip_normcorr_%d.txt", best)), header=T) data.100.9.part = realign.data(data.100.9, data.100.9.part$shift_ref, data.100.9.part$shift_dat, data.100.9.part$flip, 71) col = brewer.pal(3, "Set1") col.heat = colorRampPalette(c("white", "red"), space = "rgb")(100) col.lab = c(rep(col[1], table(labels)[1]), rep(col[2], table(labels)[2]), rep(col[3], table(labels)[3])) x.lab = seq(-1000, 1000, length.out=5) x.at = seq(0, 1, length.out=length(x.lab)) # pdf(file=file.path("results", "article", "supplemental_figure1.pdf"), width=14, height=7) png(filename=file.path(dest.dir, "supplemental_figure1.png"), width=14, height=7, units="in", res=300) par(mar=c(5.1, 6.1, 4.1, 2.1)) lay = layout(mat=matrix(c(1,4,5, 8, 9, 1,4,5, 8, 9, 2,4,5, 8, 9, 2,6,7,10,11, 3,6,7,10,11, 3,6,7,10,11), nrow=6, ncol=5, byrow=T),widths=c(5,0.5,5,0.5,5)) # layout.show(lay) # class 1 density x = 1:length(shape1) plot(x, shape1, lwd=3, type='l', col=col[1], main="Class 1 density", xlab="position [bp]", ylab="density", cex.main=2, cex.axis=2, cex.lab=2) text(x=-200, y=0.0125, labels='A', cex=4.5, xpd=NA, font=2) # class 2 density plot(x, shape2, lwd=3, type='l', col=col[2], main="Class 2 density", xlab="position [bp]", ylab="density", cex.main=2, cex.axis=2, cex.lab=2) # class 3 density plot(x, shape3, lwd=3, type='l', col=col[3], main="Class 3 density", xlab="position [bp]", ylab="density", cex.main=2, cex.axis=2, cex.lab=2) # dataset coverage 100 noise 0 p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(labels, lwd=2, colors=col.lab) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(data.100.0)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 0%", ylab="", xlab="position (bp)", cex.main=2.0, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='B', cex=4.5, xpd=NA, font=2) # dataset coverage 100 noise 0.9 p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(labels, lwd=2, colors=col.lab) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(data.100.9)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 90%", ylab="", xlab="position (bp)", cex.main=2.0, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='D', cex=4.5, xpd=NA, font=2) # partition of dataset coverage 100 noise 0 p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(labels, lwd=2, colors=col.lab) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(data.100.0.part)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 0%", ylab="", xlab="Approximated pos. (bp)", cex.main=2.0, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='C', cex=4.5, xpd=NA, font=2) # partition dataset coverage 100 noise 0.9 p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(labels, lwd=2, colors=col.lab) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(data.100.9.part)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 90%", ylab="", xlab="Approximated pos. (bp)", cex.main=2.0, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='E', cex=4.5, xpd=NA, font=2) dev.off() rm(list=ls()) # ===================================================== Supplemental Figure 2 ===================================================== # plot the Adjuted Rand Index values for all programs, measured on the simulated data with different coverages, background to # noise ratios and containing 3 classes. # supplemantal figure 1 : results when clustering the data with random seeding # supplemantal figure 2 : results when clustering the data with kmean++ seeding dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark") source(file.path("res", "functions_utility.R")) # some colors colors = brewer.pal(9, "Set1") # coverages and signal/noise ratiios used to simulate the data coverages = c(10, 50, 100) noises = c(0.0, 0.1, 0.5, 0.9) # load Adjusted Rand Index measured ari.kmean.new = read.RDS(file.path("results", "simulated_data_chipseq", "app", "simulated_data_chipseq_3_class_asym_ari_newkmean.RDS")) ari.kmean.reg = read.RDS(file.path("results", "simulated_data_chipseq", "kmean", "simulated_data_chipseq_3_class_asym_ari_kmean.RDS")) ari.chippart = read.RDS(file.path("results", "simulated_data_chipseq", "chippartitioning", "simulated_data_chipseq_3_class_asym_ari_chippartitioning.RDS")) ari.shuf = read.RDS(file.path("results", "simulated_data_chipseq", "simulated_data_chipseq_3_class_asym_gamma_shuffled.RDS")) # pdf(file=file.path("results", "article", "supplemental_figure2.pdf"), width=16, height=7) png(filename=file.path(dest.dir, "supplemental_figure2.png"), width=16, height=7, units="in", res=300) par(mar=c(5.1, 6.1, 4.1, 2.1)) colors.boxplot = c(rep(c(colors[1], colors[5], colors[2:4]), each=12), colors[7]) boxplot( # new K-means # random seeding # normal # coverage 10 ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.0"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.1"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.5"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.9"]][["3 cluster"]], # coverage 50 ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.0"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.1"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.5"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.9"]][["3 cluster"]], # coverage 100 ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.1"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.5"]][["3 cluster"]], ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.9"]][["3 cluster"]], # new K-means # random seeding # normal # coverage 10 ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.0"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.1"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.5"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.9"]][["3 cluster"]], # coverage 50 ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.0"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.1"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.5"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.9"]][["3 cluster"]], # coverage 100 ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.1"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.5"]][["3 cluster"]], ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.9"]][["3 cluster"]], # regular K-means # random seeding # euclidean distance # kmean++ seeding # coverage 10 ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.0"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.1"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.5"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.9"]][["3 cluster"]], # coverage 50 ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.0"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.1"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.5"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.9"]][["3 cluster"]], # coverage 100 ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.1"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.5"]][["3 cluster"]], ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.9"]][["3 cluster"]], # regular K-means # random seeding # euclidean distance # kmean++ seeding # coverage 10 ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.0"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.1"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.5"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.9"]][["3 cluster"]], # coverage 50 ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.0"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.1"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.5"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.9"]][["3 cluster"]], # coverage 100 ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.1"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.5"]][["3 cluster"]], ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.9"]][["3 cluster"]], # ChIPparitioning # random seeding # coverage 10 ari.chippart[["random"]][["cov 10"]][["noise 0.0"]][["3 cluster"]], ari.chippart[["random"]][["cov 10"]][["noise 0.1"]][["3 cluster"]], ari.chippart[["random"]][["cov 10"]][["noise 0.5"]][["3 cluster"]], ari.chippart[["random"]][["cov 10"]][["noise 0.9"]][["3 cluster"]], # coverage 50 ari.chippart[["random"]][["cov 50"]][["noise 0.0"]][["3 cluster"]], ari.chippart[["random"]][["cov 50"]][["noise 0.1"]][["3 cluster"]], ari.chippart[["random"]][["cov 50"]][["noise 0.5"]][["3 cluster"]], ari.chippart[["random"]][["cov 50"]][["noise 0.9"]][["3 cluster"]], # coverage 100 ari.chippart[["random"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], ari.chippart[["random"]][["cov 100"]][["noise 0.1"]][["3 cluster"]], ari.chippart[["random"]][["cov 100"]][["noise 0.5"]][["3 cluster"]], ari.chippart[["random"]][["cov 100"]][["noise 0.9"]][["3 cluster"]], # Random expectations ari.shuf, main="Adjusted Rand Index", xlab="", ylab="ARI", xaxt='n', yaxt='n', cex.main=3, cex.axis=2, cex.lab=2, ylim=c(-0.2, 1.4), col=colors.boxplot) # y axis axis(side=2, at=seq(0, 1, by=0.2), cex.axis=1.3) # add horizontal lines abline(h=1.0, lty=2) abline(h=0.5, lty=2) abline(h=0.0, lty=2) # draw noise values # parameters to draw triangles y_from_tri = -0.05 y_to_tri = y_from_tri x_from_tri = 0.5 x_to_tri = 1 x_by_tri = length(noises) + 1 h_tri = 0.02 # parameters to draw noise values x_noise = 1 y_noise = y_from_tri - 0.05 x_by_noise = 1 for(i in 1:5) { for(j in 1:length(coverages)) { x_to_tri = x_from_tri + x_by_tri - 1 polygon(x=c(x_from_tri, x_to_tri, x_to_tri), y=c(y_from_tri, y_to_tri, y_to_tri+h_tri), col="black") for(k in 1:length(noises)) { text(x=x_noise, y=y_noise, labels=noises[k], cex=0.8) x_noise = x_noise + x_by_noise } abline(v=x_from_tri, lty=2) x_from_tri = x_to_tri } } abline(v=x_from_tri, lty=2) # label the random values text(x=x_noise, y=y_noise, labels="R") # draw coverage values y_cov = y_from_tri - 0.1 y_cov_text = y_cov - 0.05 x_from_cov = 1 x_to_cov = 1 x_by_cov = length(noises) for(i in 1:5) { for(j in 1:length(coverages)) { x_to_cov = x_from_cov + x_by_cov - 1 segments(x0=x_from_cov, x1=x_to_cov, y0=y_cov, y1=y_cov, lwd=3) text(x=x_from_cov + 0.5*(x_by_cov-1), y=y_cov_text, labels=sprintf("cov %d", coverages[j])) x_from_cov = x_to_cov + 1 } } # draw legend legend(x=50, y=1.52, legend=c("SPar-K", "SPar-K (smooth.)", "K-means (eucl.)", "K-means (corr.)", "ChIPPartitioning", "Random partition"), col=unique(colors.boxplot), cex=1.2, lwd=4, bty='n') dev.off() rm(list=ls()) # ===================================================== Supplemental Figure 4 ===================================================== # plot the SSE for random and Kmeans++ seedings for simulated ChIP-seq data with 3 classes dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark") source(file.path("res", "functions_utility.R")) sse = read.RDS(file.path("results", "simulated_data_chipseq", "app", "simulated_data_chipseq_3_class_asym_sse_newkmean.RDS")) cov = "cov 100" noise = "noise 0.0" # pdf(file=file.path("results", "article", "supplemental_figure4.pdf"), width=10, height=6) png(filename=file.path(dest.dir, "supplemental_figure4.png"), width=10, height=6, units="in", res=300) par(mar=c(5.1, 6.1, 4.1, 2.1), mfrow=c(2,2)) # random seeding, normal option = "normal" seeding = "random" x = 2:5 m = c( # given seeding # coverage 100 median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) s = c( # given seeding # coverage 100 sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) ylim = c(min(m-s), max(m+s)) # plot medians plot(x=x, y=m, main="", xlab="Nb of clusters", ylab="SSE", cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim, xaxt='n') axis(side=1, at=x, cex.axis=2) # plot standard deviations segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2) # plot label text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="A", cex=3.5, xpd=NA, font=2) # random seeding, nooutlier option = "nooutlier" seeding = "random" x = 2:5 m = c( # given seeding # coverage 100 median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) s = c( # given seeding # coverage 100 sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) ylim = c(min(m-s), max(m+s)) # plot medians plot(x=x, y=m, main="", xlab="Nb of clusters", ylab="SSE", cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim, xaxt='n') axis(side=1, at=x, cex.axis=2) # plot standard deviations segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2) # plot label text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="B", cex=3.5, xpd=NA, font=2) # kmean++ seeding, normal option = "normal" seeding = "kmean++" x = 2:5 m = c( # given seeding # coverage 100 median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) s = c( # given seeding # coverage 100 sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) ylim = c(min(m-s), max(m+s)) # plot medians plot(x=x, y=m, main="", xlab="Nb of clusters", ylab="SSE", cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim, xaxt='n') axis(side=1, at=x, cex.axis=2) # plot standard deviations segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2) # plot label text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="C", cex=3.5, xpd=NA, font=2) # kmean++ seeding, nooutlier option = "nooutlier" seeding = "kmean++" x = 2:5 m = c( # given seeding # coverage 100 median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) s = c( # given seeding # coverage 100 sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]), sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]]) ) ylim = c(min(m-s), max(m+s)) # plot medians plot(x=x, y=m, main="", xlab="Nb of clusters", ylab="SSE", cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim, xaxt='n') axis(side=1, at=x, cex.axis=2) # plot standard deviations segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2) # plot label text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="D", cex=3.5, xpd=NA, font=2) dev.off() rm(list=ls()) # ===================================================== Supplemental Figure 5 ===================================================== # plot the runtimes for each prorgam when clustering the simulated ChIP-seq data with 3 classes dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark") source(file.path("res", "functions_utility.R")) times.new = read.RDS(file.path("results", "runtime", "runtimes_app.RDS")) times.kmean = read.RDS(file.path("results", "runtime", "runtimes_kmean.RDS")) times.chipp = read.RDS(file.path("results", "runtime", "runtimes_chippartitioning.RDS")) data = list(vec1=times.new[["random"]][["normal"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec2=times.new[["kmean++"]][["normal"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec3=times.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec4=times.new[["kmean++"]][["nooutlier"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec5=times.kmean[["random"]][["eucl"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec6=times.kmean[["random"]][["corr"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec7=times.kmean[["kmean++"]][["eucl"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec8=times.kmean[["kmean++"]][["corr"]][["cov 100"]][["noise 0.0"]][["3 cluster"]], vec9=times.chipp[["random"]][["cov 100"]][["noise 0.0"]][["3 cluster"]]) # some colors colors = brewer.pal(9, "Set1") colors.boxplot = c(rep(colors[1],2), rep(colors[5],2), rep(colors[2],2), rep(colors[3],2), rep(colors[4],2)) # pdf(file=file.path("results", "article", "supplemental_figure5.pdf"), width=10, height=6) png(filename=file.path(dest.dir, "supplemental_figure5.png"), width=10, height=6, units="in", res=300) par(mar=c(6.1, 6.1, 4.1, 2.1), cex.main=3, cex.axis=1.5, cex.lab=2, xaxt="n") p = par(cex.main=3, cex.axis=1.5, cex.lab=2) # boxplot with a broken y-axis gap.boxplot(data, gap=list(top=c(80,550),bottom=c(NA,NA)), main="Running times", xlab="", ylab="time (sec)", col=colors.boxplot) # x-axis labels = c(rep(c("rand", "k++"), 4), "rand") axis(1, at=1:9, tick=T, labels=FALSE) text(x=1:9, y=-10, labels=labels, srt=45, adj=1, xpd=TRUE, cex=1.8) # y-axis axis(2, labels=c(seq(0,80,length.out=5), seq(550,650,length.out=5)), at=c(seq(0,80,length.out=5), seq(550,650,length.out=5)-(550-60))) # legend legend("topleft", legend = c("SPar-K", "SPar-K (smooth)", "Kmeans (eucl)", "Kmeans (corr)", "ChiPPartitioning"), col = unique(colors.boxplot), cex=1.2, lwd=4, bty='n') grid() dev.off() rm(list=ls()) # ===================================================== Supplemental Figure 8 ===================================================== # figure with the MNase data at CTCF binding sites partition # the partition was obtained by running the clustering with a shift of 41, flip and nooutlier # the otpimal number of cluster was estimated to be 3, the best partition was estimated to be # the 4th one. Visually, it was providing interesting biological informations but was not # the partition with the lowest SSE for K=3 (but it was not neither the one with the highest). dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark") source(file.path("res", "functions_utility.R")) source(file.path("res", "functions_plot.R")) #' Order the rows of a given matrix by similarity #' (correlation) to the aggregation (in descending #' order) and returns the order. #' @param data the matrix of interest. #' @return a vector of indices to reorder the #' original matrix. #' @author Romain Groux get.row.order = function(data) { if(is.vector(data)) { return(c(1)) } else { ref = colSums(data) scores = apply(data, 1, cor, ref) return(order(scores, decreasing=F)) } } # clustering parameters n.cluster = 4 n.shift = 41 flip = TRUE # the data data = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_mnase_encode.txt"))) # some additionnal data dnase = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_dnase_encode_rep1.txt"))) + as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_dnase_encode_rep2.txt"))) motif = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_ctcfmotif_encode.txt"))) tss.plus = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_tss_std+_encode.txt"))) tss.minus = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_tss_std-_encode.txt"))) tss = tss.plus + tss.minus peaks = read.table(file.path("data", "data_chipseq", "ctcfpeak.sga"), header=F, stringsAsFactors=F) # cluster 1 aggregation profiles chipcor.tss.m = read.table(file.path("results", "ctcf_mnase_encode2", sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_tss-.txt", n.cluster))) chipcor.cage.m = read.table(file.path("results", "ctcf_mnase_encode2", sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_cage-rep1-.txt", n.cluster))) chipcor.dnase = read.table(file.path("results", "ctcf_mnase_encode2", sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_dnase-rep1.txt", n.cluster))) chipcor.mnase = read.table(file.path("results", "ctcf_mnase_encode2", sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_mnase.txt", n.cluster))) # the best partition results = read.table(file.path("results", "ctcf_mnase_encode2", sprintf("ctcf_mnase_encode_%dclusters_nooutlier_4.txt", n.cluster)), header=T) results = format.results(data, results, n.shift, n.cluster) # x-axis labels x.lab = seq(-1000, 1000, length.out=5) x.at = seq(0, 1, length.out=length(x.lab)) x.at2 = seq(1, ncol(data), length.out=length(x.lab)) # heatmap colors color.1 = colorRampPalette(c("white", "red"), space = "rgb")(100) color.2 = colorRampPalette(c("white", "blue"), space = "rgb")(100) # cluster colors color.lab = brewer.pal(8, "Set1") # whether a region has a motif has_motif = apply(motif, 1, sum) has_motif[which(has_motif > 1)] = 1 # wheteher a region has a TSS has_tss = apply(tss, 1, sum) has_tss[which(has_tss > 1)] = 1 # plot # pdf(file=file.path("results", "article", "supplemental_figure8.pdf"), width=14, height=8) png(filename=file.path(dest.dir, "supplemental_figure8.png"), width=14, height=8, units="in", res=300) # create matrices with the data and the peaks for the heatmap and a vector of color # labels to plot the cluster assignment on the side of the heatmap d = matrix(nrow=nrow(data), ncol=ncol(data)) p = d data.aligned = d motif.aligned = d dnase.aligned = d tss.aligned = d l = vector(mode="character", length=nrow(data)) from = 1; to = from ; for(j in 1:n.cluster) { index = which(results$clusters == j) to = from + length(index) -1 d[from:to,] = order.rows(data[index,]) data.aligned[from:to,] = realign.data(data[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) motif.aligned[from:to,] = realign.data(motif[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) dnase.aligned[from:to,] = realign.data(dnase[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) tss.aligned[from:to,] = realign.data(tss[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) order = get.row.order(data.aligned[from:to,]) data.aligned[from:to,] = data.aligned[from:to,][order,] motif.aligned[from:to,] = motif.aligned[from:to,][order,] dnase.aligned[from:to,] = dnase.aligned[from:to,][order,] tss.aligned[from:to,] = tss.aligned[from:to,][order,] l[from:to] = color.lab[j] from = to + 1 } p = par(oma=c(0,0,5,0)) # layout construction labels = c(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,11,12,12,13,13, 9, 10,11,11,12,12,13,13) lay = layout(matrix(data=labels, nrow=4, ncol=8, byrow=T), widths=c(0.5,5,0.5,5,0.5,5,0.5,5,0.5,5,0.5,5)) # layout.show(lay) # data heatmap # p = par(mar=c(5, 0, 4, 1) + 0.1) # plot.label.bar(results$clusters, lwd=2, colors=l) # p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) # image(t(condense.matrix(d)), col=color.1, xaxt='n', yaxt='n', main="MNase Data", ylab="", xlab="Position (bp)", # cex.main=2.5, cex.lab=2.5) # axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) # text(x=-0.08, y=1.15, labels='A', cex=4, xpd=NA, font=2) p = par(mar=c(5, 0, 4, 1) + 0.1) plot(1,1, xaxt='n', yaxt='n', col="white", xlab="", ylab="", main="", bty='n') p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(order.rows(data))), col=color.1, xaxt='n', yaxt='n', main="MNase Data", ylab="", xlab="Position (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.08, y=1.15, labels='A', cex=4, xpd=NA, font=2) # realigned data heatmap p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(data.aligned)), col=color.1, xaxt='n', yaxt='n', main="Aligned MNase", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.08, y=1.15, labels='B', cex=4, xpd=NA, font=2) # realigned DNaseI p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(dnase.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned DNaseI", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.08, y=1.15, labels='C', cex=4, xpd=NA, font=2) # realigned motifs p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(motif.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned motifs", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.08, y=1.15, labels='D', cex=4, xpd=NA, font=2) # realigned TSSs p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(tss.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned TSSs", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.08, y=1.15, labels='E', cex=4, xpd=NA, font=2) # cluster 2 aggregations x = chipcor.dnase[,1] y.dnase = chipcor.dnase[,2] y.mnase = chipcor.mnase[,2] y.tss.m = chipcor.tss.m[,2] y.cage.m = chipcor.cage.m[,2] p = par(mar=c(5.1,6.1,4.1,2.1)) plot(x=x, y=y.mnase/max(y.mnase), lwd=3, col=color.lab[2], type='l', xlab="Approximated position (bp)", ylab="Prop. of max signal", main="Cluster 1", ylim=c(0,1.2), cex.main=2.5, cex.axis=2, cex.lab=2) lines(x=x, y=y.dnase/max(y.dnase), lwd=3, col=color.lab[1], lty=1) # dnase on both std / at orinted peaks lines(x=x, y=y.tss.m/max(y.tss.m), lwd=2, col=color.lab[3], lty=1) # tss on - std / at orinted peaks lines(x=x, y=y.cage.m/max(y.cage.m), lwd=2, col=color.lab[4], lty=1) # cage on - std / at orinted peaks legend("topright", legend=c("MNase", "DNaseI", "TSS -std", "CAGE -std"), seg.len=0.5, col=c(color.lab[c(2,1,3,4)]), lwd=c(3,3,2,2), bty="n", cex=1) text(x=-1100, y=1.42, labels='F', cex=4.5, xpd=NA, font=2) # motif proportions motif_prop = vector(mode="numeric", length=n.cluster) for(j in 1:n.cluster) { index = which(results$clusters == j) motif_prop[j] = sum(has_motif[index]) / length(index) } barplot(height=motif_prop, ylim=c(0,1),col=color.lab[1:j], main="Prop. CTCF motif", xlab="clusters", ylab="Prop. region with motif", names.arg=1:n.cluster, cex.main=2.0, cex.lab=2, cex.axis=2) text(x=-0.08, y=1.15, labels='G', cex=4, xpd=NA, font=2) # TSS proportions tss_prop = vector(mode="numeric", length=n.cluster) for(j in 1:n.cluster) { index = which(results$clusters == j) tss_prop[j] = sum(has_tss[index]) / length(index) } barplot(height=tss_prop, ylim=c(0,1),col=color.lab[1:j], main="Prop. TSS", xlab="clusters", ylab="Prop. region with TSS", names.arg=1:n.cluster, cex.main=2.0, cex.lab=2, cex.axis=2) text(x=-0.08, y=1.15, labels='H', cex=4, xpd=NA, font=2) par(p) dev.off() rm(list=ls()) # ===================================================== Figure 1 ===================================================== # figure with the DNaseI data at SP1 binding sites partition # the partition was obtained by running the clustering with a shift of 41, flip and nooutlier # The best partition is the 7th which is the 4th partition with the lowest SSE (the 4 lowest SSE # values are really close from each other, ~1/600 of diff) and it looks also really nice in terms # of biology with aligned footprints andclusters loooking different dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark") source(file.path("res", "functions_utility.R")) source(file.path("res", "functions_plot.R")) #' Order the rows of a given matrix by similarity #' (correlation) to the aggregation (in descending #' order) and returns the order. #' @param data the matrix of interest. #' @return a vector of indices to reorder the #' original matrix. #' @author Romain Groux get.row.order = function(data) { if(is.vector(data)) { return(c(1)) } else { ref = colSums(data) scores = apply(data, 1, cor, ref) return(order(scores, decreasing=F)) } } # clustering parameters n.shift = 41 flip = TRUE n.cluster = 3 # the data data = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_dnase_big_clean.txt"))) # some additionnal data mnase = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_mnase_big_clean.txt"))) motif = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_sp1motif_big_clean.txt"))) tss = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_tss_big_clean.txt"))) tss.plus = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_tss_std+_big.txt"))) tss.minus = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_tss_std-_big.txt"))) peaks = read.table(file.path("data", "sp1_dnase", "sp1peak_clean.sga"), header=F, stringsAsFactors=F) # this is the best partition to me results = read.table(file.path("results", "sp1_dnase3", sprintf("sp1peak_dnase_%dclusters_nooutlier_7.txt", n.cluster)), header=T) results = format.results(data, results, n.shift, n.cluster) # cluster 2 aggregation profiles chipcor.tss.m = read.table(file.path("results", "sp1_dnase3", sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_tss-.txt", n.cluster))) chipcor.cage.m = read.table(file.path("results", "sp1_dnase3", sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_cage-.txt", n.cluster))) chipcor.dnase = read.table(file.path("results", "sp1_dnase3", sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_dnase_rep1.txt", n.cluster))) chipcor.mnase = read.table(file.path("results", "sp1_dnase3", sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_mnase.txt", n.cluster))) x.lab = seq(-300, 300, length.out=5) x.at = seq(0, 1, length.out=length(x.lab)) x.at2 = seq(1, ncol(data), length.out=length(x.lab)) # heatmap colors color.1 = colorRampPalette(c("white", "red"), space = "rgb")(100) color.2 = colorRampPalette(c("white", "blue"), space = "rgb")(100) # cluster colors color.lab = brewer.pal(8, "Set1") # whether a region has a motif has_motif = apply(motif, 1, sum) has_motif[which(has_motif > 1)] = 1 # whether a region has a TSS has_tss = apply(tss, 1, sum) has_tss[which(has_tss > 1)] = 1 # plot # pdf(file=file.path("results", "article", "figure1.pdf"), width=14, height=8) png(filename=file.path(dest.dir, "figure1.png"), width=14, height=8, units="in", res=300) # create matrices with the data and the peaks for the heatmap and a vector of color # labels to plot the cluster assignment on the side of the heatmap d = matrix(nrow=nrow(data), ncol=ncol(data)) p = d data.aligned = d motif.aligned = d mnase.aligned = d tss.aligned = d l = vector(mode="character", length=nrow(data)) from = 1; to = from ; for(j in 1:n.cluster) { index = which(results$clusters == j) to = from + length(index) -1 d[from:to,] = order.rows(data[index,]) data.aligned[from:to,] = realign.data(data[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) motif.aligned[from:to,] = realign.data(motif[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) mnase.aligned[from:to,] = realign.data(mnase[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) tss.aligned[from:to,] = realign.data(tss[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift) order = get.row.order(data.aligned[from:to,]) data.aligned[from:to,] = data.aligned[from:to,][order,] motif.aligned[from:to,] = motif.aligned[from:to,][order,] mnase.aligned[from:to,] = mnase.aligned[from:to,][order,] tss.aligned[from:to,] = tss.aligned[from:to,][order,] l[from:to] = color.lab[j] from = to + 1 } p = par(oma=c(0,0,5,0)) # layout construction labels = c(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9,10,10,11,11,11,11, 9, 9,10,10,11,11,11,11) lay = layout(matrix(data=labels, nrow=5, ncol=8, byrow=T), widths=c(0.5,5,0.5,5,0.5,5,0.5,5,0.5,5,0.5,5)) # layout.show(lay) # p = par(oma=c(0,0,5,0)) # # layout construction # labels = c(1, 2, 3, 4, 5, 6, 7, 8, # 1, 2, 3, 4, 5, 6, 7, 8, # 9, 10,11,11,12,12,12,12, # 9, 10,11,11,12,12,12,12) # lay = layout(matrix(data=labels, nrow=4, ncol=8, byrow=T), widths=c(0.5,5,0.5,5,0.5,5,0.5,5,0.5,5,0.5,5)) # layout.show(lay) # data heatmap # p = par(mar=c(5, 0, 4, 1) + 0.1) # plot.label.bar(results$clusters, lwd=2, colors=l) # p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) # image(t(condense.matrix(d)), col=color.1, xaxt='n', yaxt='n', main="DNaseI data", ylab="", xlab="Position (bp)", # cex.main=2.5, cex.lab=2.5) # axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) # text(x=-0.07, y=1.1, labels='A', cex=4.5, xpd=NA, font=2) p = par(mar=c(5, 0, 4, 1) + 0.1) plot(1,1, xaxt='n', yaxt='n', col="white", xlab="", ylab="", main="", bty='n') p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(order.rows(data))), col=color.1, xaxt='n', yaxt='n', main="DNaseI data", ylab="", xlab="Position (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='A', cex=4.5, xpd=NA, font=2) # realigned data heatmap p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(data.aligned)), col=color.1, xaxt='n', yaxt='n', main="Aligned DNaseI", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='B', cex=4.5, xpd=NA, font=2) # realigned MNase p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(mnase.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned MNase", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='C', cex=4.5, xpd=NA, font=2) # realigned motifs p = par(mar=c(5, 0, 4, 1) + 0.1) plot.label.bar(results$clusters, lwd=2, colors=l) p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1)) image(t(condense.matrix(motif.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned motifs", ylab="", xlab="Approx. pos. (bp)", cex.main=2.5, cex.lab=2.5) axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7) text(x=-0.07, y=1.1, labels='D', cex=4.5, xpd=NA, font=2) # proportion TSS in each cluster prop_tss = vector(mode="numeric", length=n.cluster) for(j in 1:n.cluster) { index = which(results$clusters == j) prop_tss[j] = sum(has_tss[index]) / length(index) } p = par(mar=c(5.1,5.1,4.1,1.1)) barplot(prop_tss, col=color.lab, main="Prop. TSS", xlab="clusters", ylab="Prop. region with TSS", names.arg=1:n.cluster, cex.main=2.5, cex.lab=2, cex.axis=2) text(x=-0.2, 0.65, labels='E', cex=4.5, xpd=NA, font=2) # cluster 2 aggregations x = chipcor.dnase[,1] y.dnase = chipcor.dnase[,2] y.mnase = chipcor.mnase[,2] y.tss.m = chipcor.tss.m[,2] y.cage.m = chipcor.cage.m[,2] p = par(mar=c(5.1,6.1,4.1,2.1)) plot(x=x, y=y.mnase/max(y.mnase), lwd=3, col=color.lab[2], type='l', xlab="Approx. pos. (bp)", ylab="Prop. of max signal", main="Cluster 2", ylim=c(0,1.2), cex.main=2.5, cex.axis=1.7, cex.lab=2.5) lines(x=x, y=y.dnase/max(y.dnase), lwd=3, col=color.lab[1], lty=1) # dnase on both std / at orinted peaks lines(x=x, y=y.tss.m/max(y.tss.m), lwd=2, col=color.lab[3], lty=1) # tss on - std / at orinted peaks lines(x=x, y=y.cage.m/max(y.cage.m), lwd=2, col=color.lab[4], lty=1) # cage on - std / at orinted peaks legend("topright", legend=c("MNase", "DNaseI", "TSS -std", "CAGE -std"), seg.len=0.5, col=c(color.lab[c(2,1,3,4)]), lwd=c(3,3,2,2), bty="n", cex=1) text(x=-300, 1.5, labels='F', cex=4.5, xpd=NA, font=2) # write motif found by MEME p = par(mar=c(5, 0, 4, 1) + 0.1) plot(0,0, bty="n", xaxt="n", yaxt="n", main="De novo discovered motifs", xlab="", ylab="", cex.main=2.5, xlim=c(1,100), ylim=c(1,100), col="white") # cluster 1 text(x=0 , y=100, labels="Cluster 1", cex=2, pos=4, font=2, xpd=NA, col=color.lab[1]) text(x=0 , y=88, labels="*NFYA / NFYB", cex=2, pos=4) text(x=0 , y=76, labels="*SP related", cex=2, pos=4) # cluster 2 left text(x=35 , y=100, labels="Cluster 2 left", cex=2, pos=4, font=2, xpd=NA, col=color.lab[2]) text(x=35 , y=88, labels="*SP related", cex=2, pos=4) text(x=35 , y=76, labels="*NFYA / NFYB", cex=2, pos=4) text(x=35 , y=64, labels=" GATA6 / GATA3", cex=2, pos=4) text(x=35 , y=52, labels=" SFPI1 / SPIC", cex=2, pos=4) text(x=35 , y=40, labels=" FOX related", cex=2, pos=4) text(x=35 , y=28, labels=" ARNTL / BHLHE41", cex=2, pos=4) # cluster 2 right text(x=35 , y=16, labels="Cluster 2 right", cex=2, pos=4, font=2, xpd=NA, col=color.lab[2]) text(x=35 , y=04, labels="*SP related", cex=2, pos=4, xpd=NA) text(x=35 , y=-8, labels="*NFYA / NFYB", cex=2, pos=4, xpd=NA) text(x=35 , y=-20, labels=" ARNTL / BHLHE41", cex=2, pos=4, xpd=NA) # cluster 3 text(x=75 , y=100, labels="Cluster 3", cex=2, pos=4, font=2, xpd=NA, col=color.lab[3]) text(x=75 , y=88, labels="*SP related (c)", cex=2, pos=4) text(x=75 , y=76, labels="*NFYA / NFYB", cex=2, pos=4) text(x=75 , y=64, labels="*GATA related", cex=2, pos=4) text(x=75 , y=52, labels=" ETS related", cex=2, pos=4) text(x=75 , y=40, labels=" ATF1", cex=2, pos=4) text(x=75 , y=28, labels="*AP1 related", cex=2, pos=4) text(x=75 , y=16, labels="*NRF1 (c)", cex=2, pos=4) text(x=02, 125, labels='G', cex=4.5, xpd=NA, font=2) par(p) dev.off() rm(list=ls())