setwd(file.path("", "local", "groux", "scATAC-seq"))

if(!file.exists(file.path("results", "10xgenomics_PBMC_5k")))
{	dir.create(file.path("results", "10xgenomics_PBMC_5k")) }

# library
library("RColorBrewer")

############# data ############# 

data = read.table(file.path("data", "10xgenomics_PBMC_5k", 
                            "atac_v1_pbmc_5k_possorted_filtered_fragment_lengths.txt"), header=F)
colnames(data) = c("nb", "size")

############# fit to gaussian mixture ############# 

set.seed(20190604) # d-day - 2 (les sanglots long de l'automne...)
# fit data to gaussian mixture model
size = data$size[1:1000]
dens = data$nb[1:1000] / sum(data$nb[1:1000])
# model parameters, 1st guess by looking at plot
m1 = 50   ; s1 = 10 ; a1 = 1
m2 = 200  ; s2 = 10 ; a2 = 1 
m3 = 380  ; s3 = 30 ; a3 = 1 
# fit
init = c(m1=m1, s1=s1, a1=a1,
         m2=m2, s2=s2, a2=a2,
         m3=m3, s3=s3, a3=a3)
f = nls(dens ~ a1 * exp(-((size-m1)**2)/(2*s1)) + 
               a2 * exp(-((size-m2)**2)/(2*s2)) +
               a3 * exp(-((size-m3)**2)/(2*s3)),
        start=init)
# parameter estimates
param = matrix(nrow=3, ncol=3)
colnames(param) = c("m", "s", "a")
rownames(param) = c("class1", "class2", "class3")
param[1,] = c(coef(f)["m1"], coef(f)["s1"], coef(f)["a1"])
param[2,] = c(coef(f)["m2"], coef(f)["s2"], coef(f)["a2"])
param[3,] = c(coef(f)["m3"], coef(f)["s3"], coef(f)["a3"])
# plot
png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_classes.png"), width=10, height=8, units="in", res=720)
  p = par(mar=c(5.1, 5.1, 4.1, 2.1))
  plot(size, dens, type='l', lwd=2, 
       main="Fragment lengths", xlab="length (bp)", ylab="density",
       cex.main=3, cex.axis=1.5, cex.lab=2.5)
  col = brewer.pal(4, "Set1")
  lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])), col=col[1], lwd=4, lty=2)
  lines(size, param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])), col=col[2], lwd=4, lty=2)
  lines(size, param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[3], lwd=4, lty=2)
  lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])) +
              param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])) +
              param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[4], lwd=4)
  legend("topright",
         legend=c("open chromatin",
                  "mono-nucl.",
                  "di-nucl.",
                  "all"),
         col=col, lwd=c(4,4,4,4), lty=c(2,2,2,1),
         bty='n', cex=2)
dev.off()
# assign probabilities to fragment length
prob = matrix(nrow=1000, ncol=3)
rownames(prob) = size
for(i in 1:nrow(prob))
{ for(j in 1:ncol(prob))
  { prob[i,j] = param[j,3] * exp(-((size[i]-param[j,1])**2)/(2*param[j,2])) }
    prob[i,]  = prob[i,] / sum(prob[i,])
}
# plot
png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_class_prob.png"), width=10, height=8, units="in", res=720)
  p = par(mar=c(5.1, 5.1, 4.1, 2.1))
  plot(size, prob[,1], ylim=c(0, max(prob)), type='l',
       main="Fragment classes", xlab="length (bp)", ylab="p(class)",
       cex.main=3, cex.axis=1.5, cex.lab=2.5, lwd=4, col=col[1])
  lines(size, prob[,2], lwd=4, col=col[2])
  lines(size, prob[,3], lwd=4, col=col[3])
  
  # set limits at min 90 assignment to a class
  abline(v=30, lwd=2, lty=2)  # class 1 lower limit (size limit)
  abline(v=84, lwd=2, lty=2)  # class 1 upper limit
  abline(v=133, lwd=2, lty=2) # class 2 lower limit
  abline(v=266, lwd=2, lty=2) # class 2 upper limit
  abline(v=341, lwd=2, lty=2) # class 3 lower limit
  abline(v=500, lwd=2, lty=2) # class 3 upper limit (size limit)
dev.off()

############# break dataset into classes ############# 

# size limits
i_cl1_1 = which(size == 30)
i_cl1_2 = which(size == 84)
i_cl2_1 = which(size == 133)
i_cl2_2 = which(size == 266)
i_cl3_1 = which(size == 341)
i_cl3_2 = which(size == 500)

# nb of reads per class
nb_all   = sum(data$nb)
nb_cl1   = sum(data$nb[i_cl1_1:i_cl1_2])
nb_cl2   = sum(data$nb[i_cl2_1:i_cl2_2])
nb_cl3   = sum(data$nb[i_cl3_1:i_cl3_2])
# nb of reads not assigned at the boundaries of classes
nb_left1 = sum(data$nb[(i_cl1_2+1):(i_cl2_1-1)]) +
           sum(data$nb[(i_cl2_2+1):(i_cl3_1-1)])
# nb of reads > 500bp
nb_left2 = sum(data$nb[(i_cl3_2+1):length(data$nb)])
nb_left = nb_left1 + nb_left2

# plot classes
png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_groups.png"), width=10, height=8, units="in", res=720)

  p = par(mar=c(5.1, 5.1, 4.1, 2.1))
  plot(y=data$nb[1:1000], x=data$size[1:1000], type='l', lwd=4,
       main="Fragment lengths", xlab="length (bp)", ylab="frequency",
       cex.main=3, cex.axis=1.5, cex.lab=2.5)
  # show limits
  abline(v=data$size[i_cl1_1], lwd=3, lty=2, col=col[1])
  abline(v=data$size[i_cl1_2], lwd=3, lty=2, col=col[1])
  abline(v=data$size[i_cl2_1], lwd=3, lty=2, col=col[2])
  abline(v=data$size[i_cl2_2], lwd=3, lty=2, col=col[2])
  abline(v=data$size[i_cl3_1], lwd=3, lty=2, col=col[3])
  abline(v=data$size[i_cl3_2], lwd=3, lty=2, col=col[3])
  # nb of reads in groups
  text(x=550, y=0.85*max(data[,1]), labels=sprintf("%.2f mio reads",         nb_all/1e6),  cex=1.8, pos=4)
  text(x=550, y=0.80*max(data[,1]), labels=sprintf("%.2f mio reads class 1", nb_cl1/1e6),  cex=1.8, pos=4, col=col[1])
  text(x=550, y=0.75*max(data[,1]), labels=sprintf("%.2f mio reads class 2", nb_cl2/1e6),  cex=1.8, pos=4, col=col[2])
  text(x=550, y=0.70*max(data[,1]), labels=sprintf("%.2f mio reads class 3", nb_cl3/1e6),  cex=1.8, pos=4, col=col[3])
  text(x=550, y=0.65*max(data[,1]), labels=sprintf("%.2f mio reads left",    nb_left/1e6), cex=1.8, pos=4)
  # shade the class areas
  # class 1
  rect(size[i_cl1_1],
       0,
       size[i_cl1_2],
       max(data$nb),
       col=rgb(red=1, green=0, blue=0, alpha=0.1), border="transparent")
  # class 2  
  rect(size[i_cl2_1],
       0,
       size[i_cl2_2],
       max(data$nb),
       col=rgb(red=0, green=0, blue=1, alpha=0.1), border="transparent")
  # class 3
  rect(size[i_cl3_1],
       0,
       size[i_cl3_2],
       max(data$nb),
       col=rgb(red=0, green=1, blue=0, alpha=0.1), border="transparent")
  
dev.off()