# REDO THE FIGURES FROM THE ARTICLE BUT IN PNG FORMAT INSTEAD OF PDF, WITH LOWER RESOLUTION 
# IT IS AN ADAPTED COPY/PASTE FROM /local/groux/Kmeans_chipseq/bin/article/figures.R

setwd(file.path("", "local", "groux", "Kmeans_chipseq"))

library(RColorBrewer)
library(plotrix)


# ===================================================== Supplemental Figure 1 =====================================================
# plot the class profiles of the simulated classes and one example of a dataset with low noise, one example with high noise 
# and their two best partitions

dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark")

source(file.path("res", "functions_utility.R"))
source(file.path("res", "functions_plot.R"))

# class densities
# general parameter
n_samples  = 1000
n_col      = 2001                   # the length of a signal vector
shift_max  = 100                    # the maximum possible shift
p_flip     = 0.3                    # the prob of having a flipped signal    
# class 1  : a simple gaussian
class1_n   = 600
class1_m   = ceiling(n_col/2) - ceiling(shift_max/2) # class 1 mean, mean will be in average in the middle of the data vector
class1_s   = 40                                      # class 1 sd
# the signal shape
shape1     = dnorm(1:(n_col-shift_max+1), class1_m, class1_s)
# class 2  : half a gaussian
class2_n   = n_samples - class1_n
class2_m   = floor(n_col/2) - floor(shift_max/2) # class 2 mean, mean will be in average in the middle of the data vector
class2_s   = 40                                  # class 2 sd
# the signal shape
shape2      = dnorm(1:(n_col-shift_max+1), class2_m, class2_s)
shape2[class2_m:length(shape2)] = min(shape2)
# class 3  : a uniform
class3_n    = 333
class3_from = floor(n_col/2) - floor(shift_max/2) -120 # class 3 from, mean will be in average in the middle of the data vector
class3_to   = floor(n_col/2) - floor(shift_max/2) +120 # class 3 to, mean will be in average in the middle of the data vector
# the signal shape
shape3      = dunif(1:(n_col-shift_max+1), class3_from, class3_to)
# normalize
shape1 = shape1 / sum(shape1)
shape2 = shape2 / sum(shape2)
shape2 = shape2 / sum(shape2)

# two datasets and an example of partitioning using SPar-K
labels       = as.matrix(read.table("data/simulated_data_chipseq/simulated_data_3_class_asym_classes_cov100_noise0.0.txt"))[,1]
# coverage 100, noise 0
data.100.0      = as.matrix(read.table(file.path("data", "simulated_data_chipseq", "simulated_data_3_class_asym_cov100_noise0.0.txt")))
ari             = read.RDS(file.path("results", "simulated_data_chipseq", "app", "simulated_data_chipseq_3_class_asym_ari_newkmean.RDS"))
best            = which.max(ari$`kmean++`$nooutlier$`cov 100`$`noise 0.0`$`3 cluster`)
data.100.0.part = read.table(file.path("results",
                                       "simulated_data_chipseq",
                                       "app",
                                       "seeding_kmean++",
                                       sprintf("simulated_data_3_class_asym_cov100_noise0.0_3cluster_flip_normcorr_%d.txt", best)),
                             header=T)
data.100.0.part = realign.data(data.100.0, data.100.0.part$shift_ref, data.100.0.part$shift_dat, data.100.0.part$flip, 71)

# coverage 100, noise 90
data.100.9      = as.matrix(read.table(file.path("data", "simulated_data_chipseq", "simulated_data_3_class_asym_cov100_noise0.9.txt")))
best            = which.max(ari$`kmean++`$nooutlier$`cov 100`$`noise 0.9`$`3 cluster`)
data.100.9.part = read.table(file.path("results",
                                       "simulated_data_chipseq",
                                       "app",
                                       "seeding_kmean++",
                                       sprintf("simulated_data_3_class_asym_cov100_noise0.9_3cluster_flip_normcorr_%d.txt", best)),
                             header=T)
data.100.9.part = realign.data(data.100.9, data.100.9.part$shift_ref, data.100.9.part$shift_dat, data.100.9.part$flip, 71)


col      = brewer.pal(3, "Set1")
col.heat = colorRampPalette(c("white", "red"),  space = "rgb")(100)
col.lab  = c(rep(col[1], table(labels)[1]),
             rep(col[2], table(labels)[2]),
             rep(col[3], table(labels)[3]))  

x.lab     = seq(-1000, 1000, length.out=5)
x.at      = seq(0, 1, length.out=length(x.lab))

# pdf(file=file.path("results", "article", "supplemental_figure1.pdf"), width=14, height=7)
png(filename=file.path(dest.dir, "supplemental_figure1.png"),
    width=14, height=7, units="in", res=300)
  par(mar=c(5.1, 6.1, 4.1, 2.1))

  lay = layout(mat=matrix(c(1,4,5, 8, 9,
                            1,4,5, 8, 9,
                            2,4,5, 8, 9,
                            2,6,7,10,11,
                            3,6,7,10,11,
                            3,6,7,10,11), nrow=6, ncol=5, byrow=T),widths=c(5,0.5,5,0.5,5))
  # layout.show(lay)
  
  # class 1 density
  x = 1:length(shape1)
  plot(x, shape1, lwd=3, type='l', col=col[1], main="Class 1 density", xlab="position [bp]", ylab="density", 
       cex.main=2, cex.axis=2, cex.lab=2)
  text(x=-200, y=0.0125, labels='A', cex=4.5, xpd=NA, font=2)
  # class 2 density
  plot(x, shape2, lwd=3, type='l', col=col[2], main="Class 2 density", xlab="position [bp]", ylab="density", 
       cex.main=2, cex.axis=2, cex.lab=2)
  # class 3 density
  plot(x, shape3, lwd=3, type='l', col=col[3], main="Class 3 density", xlab="position [bp]", ylab="density", 
       cex.main=2, cex.axis=2, cex.lab=2)

  # dataset coverage 100 noise 0
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(labels, lwd=2, colors=col.lab)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(data.100.0)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 0%", ylab="", xlab="position (bp)",
        cex.main=2.0, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='B', cex=4.5, xpd=NA, font=2)
  
  # dataset coverage 100 noise 0.9
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(labels, lwd=2, colors=col.lab)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(data.100.9)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 90%", ylab="", xlab="position (bp)",
        cex.main=2.0, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='D', cex=4.5, xpd=NA, font=2)
  
  # partition of dataset coverage 100 noise 0
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(labels, lwd=2, colors=col.lab)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(data.100.0.part)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 0%", ylab="", xlab="Approximated pos. (bp)",
        cex.main=2.0, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='C', cex=4.5, xpd=NA, font=2)
  
  # partition dataset coverage 100 noise 0.9
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(labels, lwd=2, colors=col.lab)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(data.100.9.part)), col=col.heat, xaxt='n', yaxt='n', main="Coverage 100, noise 90%", ylab="", xlab="Approximated pos. (bp)",
        cex.main=2.0, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='E', cex=4.5, xpd=NA, font=2)
dev.off()  

rm(list=ls())
  
  
# ===================================================== Supplemental Figure 2 =====================================================
# plot the Adjuted Rand Index values for all programs, measured on the simulated data with different coverages, background to 
# noise ratios and containing 3 classes.
# supplemantal figure 1 : results when clustering the data with random  seeding
# supplemantal figure 2 : results when clustering the data with kmean++ seeding

dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark")
source(file.path("res", "functions_utility.R"))


# some colors
colors = brewer.pal(9, "Set1")

# coverages and signal/noise ratiios used to simulate the data
coverages = c(10, 50, 100)
noises    = c(0.0, 0.1, 0.5, 0.9)

# load Adjusted Rand Index measured
ari.kmean.new = read.RDS(file.path("results", 
                                   "simulated_data_chipseq", 
                                   "app",              
                                   "simulated_data_chipseq_3_class_asym_ari_newkmean.RDS"))
ari.kmean.reg = read.RDS(file.path("results", "simulated_data_chipseq", 
                                   "kmean",            
                                   "simulated_data_chipseq_3_class_asym_ari_kmean.RDS"))
ari.chippart  = read.RDS(file.path("results",
                                   "simulated_data_chipseq", 
                                   "chippartitioning", 
                                   "simulated_data_chipseq_3_class_asym_ari_chippartitioning.RDS"))
ari.shuf      = read.RDS(file.path("results",
                                   "simulated_data_chipseq",
                                   "simulated_data_chipseq_3_class_asym_gamma_shuffled.RDS"))

# pdf(file=file.path("results", "article", "supplemental_figure2.pdf"), width=16, height=7)
png(filename=file.path(dest.dir, "supplemental_figure2.png"),
     width=16, height=7, units="in", res=300)
  par(mar=c(5.1, 6.1, 4.1, 2.1))
  
  colors.boxplot = c(rep(c(colors[1],
                           colors[5],
                           colors[2:4]),
                         each=12),
                     colors[7])
  boxplot(
    # new K-means
    # random seeding
    # normal
    # coverage 10
    ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 10"]][["noise 0.9"]][["3 cluster"]],
    # coverage 50
    ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 50"]][["noise 0.9"]][["3 cluster"]],
    # coverage 100
    ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.new[["random"]][["normal"]][["cov 100"]][["noise 0.9"]][["3 cluster"]],
    
    # new K-means
    # random seeding
    # normal
    # coverage 10
    ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 10"]][["noise 0.9"]][["3 cluster"]],
    # coverage 50
    ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 50"]][["noise 0.9"]][["3 cluster"]],
    # coverage 100
    ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.9"]][["3 cluster"]],
    
    # regular K-means
    # random seeding
    # euclidean distance
    # kmean++ seeding
    # coverage 10
    ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 10"]][["noise 0.9"]][["3 cluster"]],
    # coverage 50
    ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 50"]][["noise 0.9"]][["3 cluster"]],
    # coverage 100
    ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["eucl"]][["cov 100"]][["noise 0.9"]][["3 cluster"]],
    
    # regular K-means
    # random seeding
    # euclidean distance
    # kmean++ seeding
    # coverage 10
    ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 10"]][["noise 0.9"]][["3 cluster"]],
    # coverage 50
    ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 50"]][["noise 0.9"]][["3 cluster"]],
    # coverage 100
    ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.1"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.5"]][["3 cluster"]],
    ari.kmean.reg[["random"]][["corr"]][["cov 100"]][["noise 0.9"]][["3 cluster"]],
    
    # ChIPparitioning
    # random seeding
    # coverage 10
    ari.chippart[["random"]][["cov 10"]][["noise 0.0"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 10"]][["noise 0.1"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 10"]][["noise 0.5"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 10"]][["noise 0.9"]][["3 cluster"]],
    # coverage 50
    ari.chippart[["random"]][["cov 50"]][["noise 0.0"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 50"]][["noise 0.1"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 50"]][["noise 0.5"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 50"]][["noise 0.9"]][["3 cluster"]],
    # coverage 100
    ari.chippart[["random"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 100"]][["noise 0.1"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 100"]][["noise 0.5"]][["3 cluster"]],
    ari.chippart[["random"]][["cov 100"]][["noise 0.9"]][["3 cluster"]],
    
    # Random expectations
    ari.shuf,
    
    main="Adjusted Rand Index", xlab="", ylab="ARI",
    xaxt='n', yaxt='n', cex.main=3, cex.axis=2, cex.lab=2, ylim=c(-0.2, 1.4),
    col=colors.boxplot)
  # y axis
  axis(side=2, at=seq(0, 1, by=0.2), cex.axis=1.3)
  # add horizontal lines
  abline(h=1.0, lty=2)
  abline(h=0.5, lty=2)
  abline(h=0.0, lty=2)
  # draw noise values
  # parameters to draw triangles
  y_from_tri = -0.05
  y_to_tri   = y_from_tri
  x_from_tri = 0.5
  x_to_tri   = 1
  x_by_tri   = length(noises) + 1
  h_tri      = 0.02
  # parameters to draw noise values
  x_noise    = 1
  y_noise    = y_from_tri - 0.05
  x_by_noise = 1
  for(i in 1:5)
  { for(j in 1:length(coverages))
    { x_to_tri = x_from_tri + x_by_tri - 1
      polygon(x=c(x_from_tri, x_to_tri, x_to_tri), y=c(y_from_tri, y_to_tri, y_to_tri+h_tri), col="black")
      for(k in 1:length(noises))
      {
        text(x=x_noise, y=y_noise, labels=noises[k], cex=0.8)
        x_noise = x_noise + x_by_noise
      }
      abline(v=x_from_tri, lty=2)
      x_from_tri = x_to_tri
    }
  }
  abline(v=x_from_tri, lty=2)
  # label the random values
  text(x=x_noise, 
       y=y_noise, 
       labels="R")
  # draw coverage values
  y_cov      = y_from_tri - 0.1
  y_cov_text = y_cov - 0.05
  x_from_cov = 1
  x_to_cov   = 1
  x_by_cov   = length(noises)
  for(i in 1:5)
  { for(j in 1:length(coverages))
    { x_to_cov = x_from_cov + x_by_cov - 1
    segments(x0=x_from_cov, x1=x_to_cov, y0=y_cov, y1=y_cov, lwd=3)
    text(x=x_from_cov + 0.5*(x_by_cov-1), y=y_cov_text, labels=sprintf("cov %d", coverages[j]))
    x_from_cov = x_to_cov + 1
    }
  }
  # draw legend
  legend(x=50, y=1.52, legend=c("SPar-K",
                                "SPar-K (smooth.)",
                                "K-means (eucl.)",
                                "K-means (corr.)",
                                "ChIPPartitioning",
                                "Random partition"),
         col=unique(colors.boxplot),
         cex=1.2, lwd=4, bty='n')
dev.off()


rm(list=ls())


# ===================================================== Supplemental Figure 4 =====================================================
# plot the SSE for random and Kmeans++ seedings for simulated ChIP-seq data with 3 classes

dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark")
source(file.path("res", "functions_utility.R"))

sse = read.RDS(file.path("results",
                         "simulated_data_chipseq",
                         "app",
                         "simulated_data_chipseq_3_class_asym_sse_newkmean.RDS"))
cov      = "cov 100"
noise    = "noise 0.0"

# pdf(file=file.path("results", "article", "supplemental_figure4.pdf"), width=10, height=6)
png(filename=file.path(dest.dir, "supplemental_figure4.png"),
     width=10, height=6, units="in", res=300)

  par(mar=c(5.1, 6.1, 4.1, 2.1), mfrow=c(2,2))
  # random seeding, normal
  option  = "normal"
  seeding = "random"
  x = 2:5
  m = c(
    # given seeding
    # coverage 100
    median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  s = c(
    # given seeding
    # coverage 100
    sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  ylim = c(min(m-s), max(m+s))
  # plot medians
  plot(x=x, y=m,
       main="", xlab="Nb of clusters", ylab="SSE",
       cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim,
       xaxt='n')
  axis(side=1, at=x, cex.axis=2)
  # plot standard deviations
  segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2)
  # plot label
  text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="A", cex=3.5, xpd=NA, font=2)
  
  
  # random seeding, nooutlier
  option  = "nooutlier"
  seeding = "random"
  x       = 2:5
  m = c(
    # given seeding
    # coverage 100
    median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  s = c(
    # given seeding
    # coverage 100
    sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  ylim = c(min(m-s), max(m+s))
  # plot medians
  plot(x=x, y=m,
       main="", xlab="Nb of clusters", ylab="SSE",
       cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim,
       xaxt='n')
  axis(side=1, at=x, cex.axis=2)
  # plot standard deviations
  segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2)
  # plot label
  text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="B", cex=3.5, xpd=NA, font=2)
  
  
  # kmean++ seeding, normal
  option  = "normal"
  seeding = "kmean++"
  x = 2:5
  m = c(
    # given seeding
    # coverage 100
    median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  s = c(
    # given seeding
    # coverage 100
    sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  ylim = c(min(m-s), max(m+s))
  # plot medians
  plot(x=x, y=m,
       main="", xlab="Nb of clusters", ylab="SSE",
       cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim,
       xaxt='n')
  axis(side=1, at=x, cex.axis=2)
  # plot standard deviations
  segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2)
  # plot label
  text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="C", cex=3.5, xpd=NA, font=2)

  
  # kmean++ seeding, nooutlier
  option  = "nooutlier"
  seeding = "kmean++"
  x = 2:5
  m = c(
    # given seeding
    # coverage 100
    median(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    median(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  s = c(
    # given seeding
    # coverage 100
    sd(sse[[seeding]][[option]][[cov]][[noise]][["2 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["3 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["4 cluster"]]),
    sd(sse[[seeding]][[option]][[cov]][[noise]][["5 cluster"]])
  )
  ylim = c(min(m-s), max(m+s))
  # plot medians
  plot(x=x, y=m,
       main="", xlab="Nb of clusters", ylab="SSE",
       cex.main=3, cex.axis=2, cex.lab=2, lwd=3, type='b', ylim=ylim,
       xaxt='n')
  axis(side=1, at=x, cex.axis=2)
  # plot standard deviations
  segments(x0=x, x1=x, y0=m-s, y1=m+s, lwd=2)
  # plot label
  text(x=1.4, y=ylim[1]+ 1.3*diff(ylim), labels="D", cex=3.5, xpd=NA, font=2)
  
dev.off()


rm(list=ls())


# ===================================================== Supplemental Figure 5 =====================================================
# plot the runtimes for each prorgam when clustering the simulated ChIP-seq data with 3 classes

dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark")
source(file.path("res", "functions_utility.R"))

times.new   = read.RDS(file.path("results", "runtime", "runtimes_app.RDS"))
times.kmean = read.RDS(file.path("results", "runtime", "runtimes_kmean.RDS"))
times.chipp = read.RDS(file.path("results", "runtime", "runtimes_chippartitioning.RDS"))

data = list(vec1=times.new[["random"]][["normal"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec2=times.new[["kmean++"]][["normal"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec3=times.new[["random"]][["nooutlier"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec4=times.new[["kmean++"]][["nooutlier"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec5=times.kmean[["random"]][["eucl"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec6=times.kmean[["random"]][["corr"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec7=times.kmean[["kmean++"]][["eucl"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec8=times.kmean[["kmean++"]][["corr"]][["cov 100"]][["noise 0.0"]][["3 cluster"]],
            vec9=times.chipp[["random"]][["cov 100"]][["noise 0.0"]][["3 cluster"]])

# some colors
colors = brewer.pal(9, "Set1") 
colors.boxplot = c(rep(colors[1],2),
                   rep(colors[5],2),
                   rep(colors[2],2),
                   rep(colors[3],2),
                   rep(colors[4],2))

# pdf(file=file.path("results", "article", "supplemental_figure5.pdf"), width=10, height=6)
png(filename=file.path(dest.dir, "supplemental_figure5.png"),
     width=10, height=6, units="in", res=300)
  par(mar=c(6.1, 6.1, 4.1, 2.1),
      cex.main=3,
      cex.axis=1.5,
      cex.lab=2,
      xaxt="n")
  
  p = par(cex.main=3, cex.axis=1.5, cex.lab=2)
  
  # boxplot with a broken y-axis
  gap.boxplot(data,
              gap=list(top=c(80,550),bottom=c(NA,NA)),
              main="Running times", xlab="", ylab="time (sec)", 
              col=colors.boxplot)
  # x-axis
  labels = c(rep(c("rand", "k++"), 4),
             "rand")
  axis(1, at=1:9, tick=T, labels=FALSE)
  text(x=1:9,
       y=-10,
       labels=labels,
       srt=45, adj=1, xpd=TRUE, cex=1.8)
  # y-axis
  axis(2, labels=c(seq(0,80,length.out=5), seq(550,650,length.out=5)), 
       at=c(seq(0,80,length.out=5), seq(550,650,length.out=5)-(550-60)))
  # legend
  legend("topleft",
         legend = c("SPar-K",
                    "SPar-K (smooth)",
                    "Kmeans (eucl)",
                    "Kmeans (corr)",
                    "ChiPPartitioning"),
         col = unique(colors.boxplot),
         cex=1.2, lwd=4, bty='n')
  grid()
dev.off()

rm(list=ls())


# ===================================================== Supplemental Figure 8 =====================================================
# figure with the MNase data at CTCF binding sites partition
# the partition was obtained by running the clustering with a shift of 41, flip and nooutlier
# the otpimal number of cluster was estimated to be 3, the best partition was estimated to be 
# the 4th one. Visually, it was providing interesting biological informations but was not 
# the partition with the lowest SSE for K=3 (but it was not neither the one with the highest).

dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark")
source(file.path("res", "functions_utility.R"))
source(file.path("res", "functions_plot.R"))

#' Order the rows of a given matrix by similarity 
#' (correlation) to the aggregation (in descending 
#' order) and returns the order.
#' @param data the matrix of interest.
#' @return a vector of indices to reorder the 
#' original matrix.
#' @author Romain Groux
get.row.order = function(data)
{ if(is.vector(data))
  { return(c(1)) }
    else
    { ref    = colSums(data)
    scores = apply(data, 1, cor, ref)
    return(order(scores, decreasing=F))
    }
}


# clustering parameters
n.cluster  = 4
n.shift    = 41
flip       = TRUE


# the data
data  = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_mnase_encode.txt")))
# some additionnal data
dnase     = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_dnase_encode_rep1.txt"))) + 
            as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_dnase_encode_rep2.txt")))
motif     = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_ctcfmotif_encode.txt")))
tss.plus  = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_tss_std+_encode.txt")))
tss.minus = as.matrix(read.table(file.path("data", "data_chipseq", "ctcf_tss_std-_encode.txt")))
tss       = tss.plus + tss.minus
peaks     = read.table(file.path("data", "data_chipseq", "ctcfpeak.sga"), header=F, stringsAsFactors=F)

# cluster 1 aggregation profiles
chipcor.tss.m  = read.table(file.path("results",
                                      "ctcf_mnase_encode2",
                                      sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_tss-.txt",
                                              n.cluster)))
chipcor.cage.m = read.table(file.path("results",
                                      "ctcf_mnase_encode2",
                                      sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_cage-rep1-.txt",
                                              n.cluster)))
chipcor.dnase = read.table(file.path("results",
                                     "ctcf_mnase_encode2",
                                     sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_dnase-rep1.txt",
                                             n.cluster)))
chipcor.mnase = read.table(file.path("results",
                                     "ctcf_mnase_encode2",
                                     sprintf("ctcfpeak_mnase_%dclusters_newkmean_nooutlier_4_cluster1_mnase.txt",
                                             n.cluster)))

# the best partition
results   = read.table(file.path("results", "ctcf_mnase_encode2", 
                                 sprintf("ctcf_mnase_encode_%dclusters_nooutlier_4.txt", n.cluster)),
                       header=T)
results   = format.results(data, results, n.shift, n.cluster)


# x-axis labels
x.lab     = seq(-1000, 1000, length.out=5)
x.at      = seq(0, 1, length.out=length(x.lab))
x.at2     = seq(1, ncol(data), length.out=length(x.lab))
# heatmap colors
color.1 = colorRampPalette(c("white",  "red"),  space = "rgb")(100)
color.2 = colorRampPalette(c("white", "blue"),  space = "rgb")(100)
# cluster colors
color.lab = brewer.pal(8, "Set1")
# whether a region has a motif
has_motif = apply(motif, 1, sum)
has_motif[which(has_motif > 1)] = 1
# wheteher a region has a TSS
has_tss = apply(tss, 1, sum)
has_tss[which(has_tss > 1)] = 1

# plot
# pdf(file=file.path("results", "article", "supplemental_figure8.pdf"), width=14, height=8)
png(filename=file.path(dest.dir, "supplemental_figure8.png"),
     width=14, height=8, units="in", res=300)
  
  # create matrices with the data and the peaks for the heatmap and a vector of color 
  # labels to plot the cluster assignment on the side of the heatmap
  d             = matrix(nrow=nrow(data), ncol=ncol(data))
  p             = d
  data.aligned  = d
  motif.aligned = d
  dnase.aligned = d
  tss.aligned   = d
  l             = vector(mode="character", length=nrow(data))
  from = 1; to = from ;
  for(j in 1:n.cluster)
  { index                   = which(results$clusters == j)
    to                      = from + length(index) -1
    d[from:to,]             = order.rows(data[index,])
    data.aligned[from:to,]  = realign.data(data[index,],  results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    motif.aligned[from:to,] = realign.data(motif[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    dnase.aligned[from:to,] = realign.data(dnase[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    tss.aligned[from:to,]   = realign.data(tss[index,],   results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    order                   = get.row.order(data.aligned[from:to,])
    data.aligned[from:to,]  = data.aligned[from:to,][order,]
    motif.aligned[from:to,] = motif.aligned[from:to,][order,]
    dnase.aligned[from:to,] = dnase.aligned[from:to,][order,]
    tss.aligned[from:to,]   = tss.aligned[from:to,][order,]
    l[from:to]  = color.lab[j]
    from = to + 1
  }
  
  
  p = par(oma=c(0,0,5,0))
  # layout construction
  labels = c(1, 2, 3, 4, 5, 6, 7, 8,
             1, 2, 3, 4, 5, 6, 7, 8,
             9, 10,11,11,12,12,13,13,
             9, 10,11,11,12,12,13,13)
  lay = layout(matrix(data=labels, nrow=4, ncol=8, byrow=T), widths=c(0.5,5,0.5,5,0.5,5,0.5,5,0.5,5,0.5,5))
  # layout.show(lay)
  
  # data heatmap
  # p = par(mar=c(5, 0, 4, 1) + 0.1)
  # plot.label.bar(results$clusters, lwd=2, colors=l)
  # p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  # image(t(condense.matrix(d)), col=color.1, xaxt='n', yaxt='n', main="MNase Data", ylab="", xlab="Position (bp)",
  #       cex.main=2.5, cex.lab=2.5)
  # axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  # text(x=-0.08, y=1.15, labels='A', cex=4, xpd=NA, font=2)
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot(1,1, xaxt='n', yaxt='n', col="white", xlab="", ylab="", main="", bty='n')
  p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(order.rows(data))), col=color.1, xaxt='n', yaxt='n', main="MNase Data", ylab="", xlab="Position (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.08, y=1.15, labels='A', cex=4, xpd=NA, font=2)
  # realigned data heatmap
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(data.aligned)), col=color.1, xaxt='n', yaxt='n', main="Aligned MNase", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.08, y=1.15, labels='B', cex=4, xpd=NA, font=2)
  # realigned DNaseI
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(dnase.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned DNaseI", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.08, y=1.15, labels='C', cex=4, xpd=NA, font=2)
  # realigned motifs
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(motif.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned motifs", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.08, y=1.15, labels='D', cex=4, xpd=NA, font=2)
  # realigned TSSs
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(tss.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned TSSs", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.08, y=1.15, labels='E', cex=4, xpd=NA, font=2)
  
  # cluster 2 aggregations
  x        = chipcor.dnase[,1]
  y.dnase  = chipcor.dnase[,2]
  y.mnase  = chipcor.mnase[,2]
  y.tss.m  = chipcor.tss.m[,2]
  y.cage.m = chipcor.cage.m[,2]
  p = par(mar=c(5.1,6.1,4.1,2.1))
  plot(x=x,  y=y.mnase/max(y.mnase), lwd=3, col=color.lab[2], type='l',
       xlab="Approximated position (bp)",
       ylab="Prop. of max signal",
       main="Cluster 1",
       ylim=c(0,1.2),
       cex.main=2.5, cex.axis=2, cex.lab=2)
  lines(x=x, y=y.dnase/max(y.dnase),   lwd=3, col=color.lab[1], lty=1) # dnase on both std / at orinted peaks
  lines(x=x, y=y.tss.m/max(y.tss.m),   lwd=2, col=color.lab[3], lty=1) # tss   on    - std / at orinted peaks
  lines(x=x, y=y.cage.m/max(y.cage.m), lwd=2, col=color.lab[4], lty=1) # cage  on    - std / at orinted peaks
  legend("topright", legend=c("MNase",
                              "DNaseI",
                              "TSS -std",
                              "CAGE -std"),
         seg.len=0.5, col=c(color.lab[c(2,1,3,4)]), lwd=c(3,3,2,2), bty="n", cex=1)
  text(x=-1100, y=1.42, labels='F', cex=4.5, xpd=NA, font=2)
  
  # motif proportions
  motif_prop = vector(mode="numeric", length=n.cluster)
  for(j in 1:n.cluster)
  { index        = which(results$clusters == j) 
    motif_prop[j] = sum(has_motif[index]) / length(index)
  }
  barplot(height=motif_prop, ylim=c(0,1),col=color.lab[1:j], 
          main="Prop. CTCF motif", xlab="clusters", ylab="Prop. region with motif",
          names.arg=1:n.cluster,
          cex.main=2.0, cex.lab=2, cex.axis=2)
  text(x=-0.08, y=1.15, labels='G', cex=4, xpd=NA, font=2)
  
  # TSS proportions
  tss_prop = vector(mode="numeric", length=n.cluster)
  for(j in 1:n.cluster)
  { index        = which(results$clusters == j) 
    tss_prop[j] = sum(has_tss[index]) / length(index)
  }
  barplot(height=tss_prop, ylim=c(0,1),col=color.lab[1:j],
          main="Prop. TSS", xlab="clusters", ylab="Prop. region with TSS",
          names.arg=1:n.cluster,
          cex.main=2.0, cex.lab=2, cex.axis=2)
  text(x=-0.08, y=1.15, labels='H', cex=4, xpd=NA, font=2)
  
  par(p)
dev.off()

rm(list=ls())


# ===================================================== Figure 1 =====================================================
# figure with the DNaseI data at SP1 binding sites partition
# the partition was obtained by running the clustering with a shift of 41, flip and nooutlier
# The best partition is the 7th which is the 4th partition with the lowest SSE (the 4 lowest SSE 
# values are really close from each other, ~1/600 of diff) and it looks  also really nice in terms 
# of biology with aligned footprints andclusters loooking different 

dest.dir = file.path("", "local", "groux", "phd_thesis", "images", "ch_spark")
source(file.path("res", "functions_utility.R"))
source(file.path("res", "functions_plot.R"))


#' Order the rows of a given matrix by similarity 
#' (correlation) to the aggregation (in descending 
#' order) and returns the order.
#' @param data the matrix of interest.
#' @return a vector of indices to reorder the 
#' original matrix.
#' @author Romain Groux
get.row.order = function(data)
{ if(is.vector(data))
  { return(c(1)) }
  else
  { ref    = colSums(data)
   scores = apply(data, 1, cor, ref)
    return(order(scores, decreasing=F))
  }
}


# clustering parameters
n.shift      = 41
flip         = TRUE
n.cluster    = 3

# the data
data    = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_dnase_big_clean.txt")))
# some additionnal data
mnase     = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_mnase_big_clean.txt")))
motif     = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_sp1motif_big_clean.txt")))
tss       = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_tss_big_clean.txt")))
tss.plus  = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_tss_std+_big.txt")))
tss.minus = as.matrix(read.table(file.path("data", "sp1_dnase", "sp1peak_tss_std-_big.txt")))
peaks     = read.table(file.path("data", "sp1_dnase", "sp1peak_clean.sga"), header=F, stringsAsFactors=F)

# this is the best partition to me
results = read.table(file.path("results", "sp1_dnase3", sprintf("sp1peak_dnase_%dclusters_nooutlier_7.txt", n.cluster)), header=T)
results = format.results(data, results, n.shift, n.cluster)

# cluster 2 aggregation profiles
chipcor.tss.m   = read.table(file.path("results",
                                       "sp1_dnase3",
                                       sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_tss-.txt", 
                                               n.cluster))) 
chipcor.cage.m  = read.table(file.path("results",
                                       "sp1_dnase3",
                                       sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_cage-.txt", 
                                               n.cluster))) 
chipcor.dnase  = read.table(file.path("results",
                                      "sp1_dnase3",
                                      sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_dnase_rep1.txt", 
                                              n.cluster))) 
chipcor.mnase  = read.table(file.path("results",
                                      "sp1_dnase3",
                                      sprintf("sp1peak_dnase_%dclusters_newkmean_nooutlier_7_cluster2_mnase.txt", 
                                              n.cluster))) 


x.lab     = seq(-300, 300, length.out=5)
x.at      = seq(0, 1, length.out=length(x.lab))
x.at2     = seq(1, ncol(data), length.out=length(x.lab))
# heatmap colors
color.1 = colorRampPalette(c("white", "red"),  space = "rgb")(100)
color.2 = colorRampPalette(c("white", "blue"),  space = "rgb")(100)
# cluster colors
color.lab = brewer.pal(8, "Set1")
# whether a region has a motif
has_motif = apply(motif, 1, sum)
has_motif[which(has_motif > 1)] = 1
# whether a region has a TSS
has_tss = apply(tss, 1, sum)
has_tss[which(has_tss > 1)] = 1

# plot
# pdf(file=file.path("results", "article", "figure1.pdf"), width=14, height=8)
png(filename=file.path(dest.dir, "figure1.png"),
     width=14, height=8, units="in", res=300)
  # create matrices with the data and the peaks for the heatmap and a vector of color 
  # labels to plot the cluster assignment on the side of the heatmap
  d             = matrix(nrow=nrow(data), ncol=ncol(data))
  p             = d
  data.aligned  = d
  motif.aligned = d
  mnase.aligned = d
  tss.aligned   = d
  l             = vector(mode="character", length=nrow(data))
  from = 1; to = from ;
  for(j in 1:n.cluster)
  { index                   = which(results$clusters == j)
    to                      = from + length(index) -1
    d[from:to,]             = order.rows(data[index,])
    data.aligned[from:to,]  = realign.data(data[index,],  results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    motif.aligned[from:to,] = realign.data(motif[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    mnase.aligned[from:to,] = realign.data(mnase[index,], results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    tss.aligned[from:to,]   = realign.data(tss[index,],   results$shifts_ref[index], results$shifts_dat[index], results$flips[index], n.shift)
    order                   = get.row.order(data.aligned[from:to,])
    data.aligned[from:to,]  = data.aligned[from:to,][order,]
    motif.aligned[from:to,] = motif.aligned[from:to,][order,]
    mnase.aligned[from:to,] = mnase.aligned[from:to,][order,]
    tss.aligned[from:to,]   = tss.aligned[from:to,][order,]
    l[from:to]  = color.lab[j]
    from = to + 1
  }
  
  p = par(oma=c(0,0,5,0))
  # layout construction
  labels = c(1, 2, 3, 4, 5, 6, 7, 8,
             1, 2, 3, 4, 5, 6, 7, 8,
             1, 2, 3, 4, 5, 6, 7, 8,
             9, 9,10,10,11,11,11,11,
             9, 9,10,10,11,11,11,11)
  lay = layout(matrix(data=labels, nrow=5, ncol=8, byrow=T), widths=c(0.5,5,0.5,5,0.5,5,0.5,5,0.5,5,0.5,5))
  # layout.show(lay)
  
  # p = par(oma=c(0,0,5,0))
  # # layout construction
  # labels = c(1, 2, 3, 4, 5, 6, 7, 8,
  #            1, 2, 3, 4, 5, 6, 7, 8,
  #            9, 10,11,11,12,12,12,12,
  #            9, 10,11,11,12,12,12,12)
  # lay = layout(matrix(data=labels, nrow=4, ncol=8, byrow=T), widths=c(0.5,5,0.5,5,0.5,5,0.5,5,0.5,5,0.5,5))
  # layout.show(lay)
  
  # data heatmap
  # p = par(mar=c(5, 0, 4, 1) + 0.1)
  # plot.label.bar(results$clusters, lwd=2, colors=l)
  # p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  # image(t(condense.matrix(d)), col=color.1, xaxt='n', yaxt='n', main="DNaseI data", ylab="", xlab="Position (bp)",
  #       cex.main=2.5, cex.lab=2.5)
  # axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  # text(x=-0.07, y=1.1, labels='A', cex=4.5, xpd=NA, font=2)
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot(1,1, xaxt='n', yaxt='n', col="white", xlab="", ylab="", main="", bty='n')
  p = par(mar=c(5, 10, 4, 4) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(order.rows(data))), col=color.1, xaxt='n', yaxt='n', main="DNaseI data", ylab="", xlab="Position (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='A', cex=4.5, xpd=NA, font=2)
  # realigned data heatmap
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(data.aligned)), col=color.1, xaxt='n', yaxt='n', main="Aligned DNaseI", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='B', cex=4.5, xpd=NA, font=2)
  # realigned MNase
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(mnase.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned MNase", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='C', cex=4.5, xpd=NA, font=2)
  # realigned motifs
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot.label.bar(results$clusters, lwd=2, colors=l)
  p = par(mar=c(5, 0, 4, 1) + 0.1, mai=c(0.6732, 0, 0.5412, 1))
  image(t(condense.matrix(motif.aligned)), col=color.2, xaxt='n', yaxt='n', main="Aligned motifs", ylab="", xlab="Approx. pos. (bp)",
        cex.main=2.5, cex.lab=2.5)
  axis(side=1, at=x.at, labels=x.lab, cex.axis=1.7)
  text(x=-0.07, y=1.1, labels='D', cex=4.5, xpd=NA, font=2)
  
  # proportion TSS in each cluster
  prop_tss = vector(mode="numeric", length=n.cluster)
  for(j in 1:n.cluster)
  { index = which(results$clusters == j)
    prop_tss[j] = sum(has_tss[index]) / length(index)
  }
  p = par(mar=c(5.1,5.1,4.1,1.1))
  barplot(prop_tss, col=color.lab, 
          main="Prop. TSS", xlab="clusters", ylab="Prop. region with TSS",
          names.arg=1:n.cluster,
          cex.main=2.5, cex.lab=2, cex.axis=2)
  text(x=-0.2, 0.65, labels='E', cex=4.5, xpd=NA, font=2)
  
  # cluster 2 aggregations
  x        = chipcor.dnase[,1]
  y.dnase  = chipcor.dnase[,2]
  y.mnase  = chipcor.mnase[,2]
  y.tss.m  = chipcor.tss.m[,2]
  y.cage.m = chipcor.cage.m[,2]
  p = par(mar=c(5.1,6.1,4.1,2.1))
  plot(x=x,  y=y.mnase/max(y.mnase), lwd=3, col=color.lab[2], type='l',
       xlab="Approx. pos. (bp)",
       ylab="Prop. of max signal",
       main="Cluster 2",
       ylim=c(0,1.2),
       cex.main=2.5, cex.axis=1.7, cex.lab=2.5)
  lines(x=x, y=y.dnase/max(y.dnase),   lwd=3, col=color.lab[1], lty=1) # dnase on both std / at orinted peaks
  lines(x=x, y=y.tss.m/max(y.tss.m),   lwd=2, col=color.lab[3], lty=1) # tss   on    - std / at orinted peaks
  lines(x=x, y=y.cage.m/max(y.cage.m), lwd=2, col=color.lab[4], lty=1) # cage  on    - std / at orinted peaks
  legend("topright", legend=c("MNase",
                              "DNaseI",
                              "TSS -std",
                              "CAGE -std"),
         seg.len=0.5, col=c(color.lab[c(2,1,3,4)]), lwd=c(3,3,2,2), bty="n", cex=1)
  text(x=-300, 1.5, labels='F', cex=4.5, xpd=NA, font=2)
  
  # write motif found by MEME
  p = par(mar=c(5, 0, 4, 1) + 0.1)
  plot(0,0, bty="n", xaxt="n", yaxt="n", main="De novo discovered motifs", xlab="", ylab="", cex.main=2.5,
       xlim=c(1,100), ylim=c(1,100), col="white")
  # cluster 1
  text(x=0 , y=100,  labels="Cluster 1",        cex=2, pos=4, font=2, xpd=NA, col=color.lab[1])
  text(x=0 , y=88,   labels="*NFYA / NFYB",     cex=2, pos=4)
  text(x=0 , y=76,   labels="*SP related",      cex=2, pos=4)
  # cluster 2 left
  text(x=35 , y=100, labels="Cluster 2 left",  cex=2, pos=4, font=2, xpd=NA, col=color.lab[2])
  text(x=35 , y=88,  labels="*SP related",      cex=2, pos=4)
  text(x=35 , y=76,  labels="*NFYA / NFYB",     cex=2, pos=4)
  text(x=35 , y=64,  labels=" GATA6 / GATA3",   cex=2, pos=4)
  text(x=35 , y=52,  labels=" SFPI1 / SPIC",    cex=2, pos=4)
  text(x=35 , y=40,  labels=" FOX related",     cex=2, pos=4)
  text(x=35 , y=28,  labels=" ARNTL / BHLHE41", cex=2, pos=4)
  # cluster 2 right
  text(x=35 , y=16,  labels="Cluster 2 right",   cex=2, pos=4, font=2, xpd=NA, col=color.lab[2])
  text(x=35 , y=04,  labels="*SP related",      cex=2, pos=4, xpd=NA)
  text(x=35 , y=-8,  labels="*NFYA / NFYB",     cex=2, pos=4, xpd=NA)
  text(x=35 , y=-20, labels=" ARNTL / BHLHE41", cex=2, pos=4, xpd=NA)
  # cluster 3
  text(x=75 , y=100,  labels="Cluster 3",       cex=2, pos=4, font=2, xpd=NA, col=color.lab[3])
  text(x=75 , y=88,  labels="*SP related (c)",  cex=2, pos=4)
  text(x=75 , y=76,  labels="*NFYA / NFYB",     cex=2, pos=4)
  text(x=75 , y=64,  labels="*GATA related",    cex=2, pos=4)
  text(x=75 , y=52,  labels=" ETS related",     cex=2, pos=4)
  text(x=75 , y=40,  labels=" ATF1",            cex=2, pos=4)
  text(x=75 , y=28,  labels="*AP1 related",     cex=2, pos=4)
  text(x=75 , y=16,  labels="*NRF1 (c)",        cex=2, pos=4)
  text(x=02, 125, labels='G', cex=4.5, xpd=NA, font=2)
  
  par(p)
  
dev.off()

rm(list=ls())