setwd(file.path("", "local", "groux", "scATAC-seq")) if(!file.exists(file.path("results", "10xgenomics_PBMC_5k"))) { dir.create(file.path("results", "10xgenomics_PBMC_5k")) } # library library("RColorBrewer") ############# data ############# data = read.table(file.path("data", "10xgenomics_PBMC_5k", "atac_v1_pbmc_5k_possorted_filtered_fragment_lengths.txt"), header=F) colnames(data) = c("nb", "size") ############# fit to gaussian mixture ############# set.seed(20190604) # d-day - 2 (les sanglots long de l'automne...) # fit data to gaussian mixture model size = data$size[1:1000] dens = data$nb[1:1000] / sum(data$nb[1:1000]) # model parameters, 1st guess by looking at plot m1 = 50 ; s1 = 10 ; a1 = 1 m2 = 200 ; s2 = 10 ; a2 = 1 m3 = 380 ; s3 = 30 ; a3 = 1 # fit init = c(m1=m1, s1=s1, a1=a1, m2=m2, s2=s2, a2=a2, m3=m3, s3=s3, a3=a3) f = nls(dens ~ a1 * exp(-((size-m1)**2)/(2*s1)) + a2 * exp(-((size-m2)**2)/(2*s2)) + a3 * exp(-((size-m3)**2)/(2*s3)), start=init) # parameter estimates param = matrix(nrow=3, ncol=3) colnames(param) = c("m", "s", "a") rownames(param) = c("class1", "class2", "class3") param[1,] = c(coef(f)["m1"], coef(f)["s1"], coef(f)["a1"]) param[2,] = c(coef(f)["m2"], coef(f)["s2"], coef(f)["a2"]) param[3,] = c(coef(f)["m3"], coef(f)["s3"], coef(f)["a3"]) # plot png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_classes.png"), width=10, height=8, units="in", res=720) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) plot(size, dens, type='l', lwd=2, main="Fragment lengths", xlab="length (bp)", ylab="density", cex.main=3, cex.axis=1.5, cex.lab=2.5) col = brewer.pal(4, "Set1") lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])), col=col[1], lwd=4, lty=2) lines(size, param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])), col=col[2], lwd=4, lty=2) lines(size, param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[3], lwd=4, lty=2) lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])) + param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])) + param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[4], lwd=4) legend("topright", legend=c("open chromatin", "mono-nucl.", "di-nucl.", "all"), col=col, lwd=c(4,4,4,4), lty=c(2,2,2,1), bty='n', cex=2) dev.off() # assign probabilities to fragment length prob = matrix(nrow=1000, ncol=3) rownames(prob) = size for(i in 1:nrow(prob)) { for(j in 1:ncol(prob)) { prob[i,j] = param[j,3] * exp(-((size[i]-param[j,1])**2)/(2*param[j,2])) } prob[i,] = prob[i,] / sum(prob[i,]) } # plot png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_class_prob.png"), width=10, height=8, units="in", res=720) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) plot(size, prob[,1], ylim=c(0, max(prob)), type='l', main="Fragment classes", xlab="length (bp)", ylab="p(class)", cex.main=3, cex.axis=1.5, cex.lab=2.5, lwd=4, col=col[1]) lines(size, prob[,2], lwd=4, col=col[2]) lines(size, prob[,3], lwd=4, col=col[3]) # set limits at min 90 assignment to a class abline(v=30, lwd=2, lty=2) # class 1 lower limit (size limit) abline(v=84, lwd=2, lty=2) # class 1 upper limit abline(v=133, lwd=2, lty=2) # class 2 lower limit abline(v=266, lwd=2, lty=2) # class 2 upper limit abline(v=341, lwd=2, lty=2) # class 3 lower limit abline(v=500, lwd=2, lty=2) # class 3 upper limit (size limit) dev.off() ############# break dataset into classes ############# # size limits i_cl1_1 = which(size == 30) i_cl1_2 = which(size == 84) i_cl2_1 = which(size == 133) i_cl2_2 = which(size == 266) i_cl3_1 = which(size == 341) i_cl3_2 = which(size == 500) # nb of reads per class nb_all = sum(data$nb) nb_cl1 = sum(data$nb[i_cl1_1:i_cl1_2]) nb_cl2 = sum(data$nb[i_cl2_1:i_cl2_2]) nb_cl3 = sum(data$nb[i_cl3_1:i_cl3_2]) # nb of reads not assigned at the boundaries of classes nb_left1 = sum(data$nb[(i_cl1_2+1):(i_cl2_1-1)]) + sum(data$nb[(i_cl2_2+1):(i_cl3_1-1)]) # nb of reads > 500bp nb_left2 = sum(data$nb[(i_cl3_2+1):length(data$nb)]) nb_left = nb_left1 + nb_left2 # plot classes png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_groups.png"), width=10, height=8, units="in", res=720) p = par(mar=c(5.1, 5.1, 4.1, 2.1)) plot(y=data$nb[1:1000], x=data$size[1:1000], type='l', lwd=4, main="Fragment lengths", xlab="length (bp)", ylab="frequency", cex.main=3, cex.axis=1.5, cex.lab=2.5) # show limits abline(v=data$size[i_cl1_1], lwd=3, lty=2, col=col[1]) abline(v=data$size[i_cl1_2], lwd=3, lty=2, col=col[1]) abline(v=data$size[i_cl2_1], lwd=3, lty=2, col=col[2]) abline(v=data$size[i_cl2_2], lwd=3, lty=2, col=col[2]) abline(v=data$size[i_cl3_1], lwd=3, lty=2, col=col[3]) abline(v=data$size[i_cl3_2], lwd=3, lty=2, col=col[3]) # nb of reads in groups text(x=550, y=0.85*max(data[,1]), labels=sprintf("%.2f mio reads", nb_all/1e6), cex=1.8, pos=4) text(x=550, y=0.80*max(data[,1]), labels=sprintf("%.2f mio reads class 1", nb_cl1/1e6), cex=1.8, pos=4, col=col[1]) text(x=550, y=0.75*max(data[,1]), labels=sprintf("%.2f mio reads class 2", nb_cl2/1e6), cex=1.8, pos=4, col=col[2]) text(x=550, y=0.70*max(data[,1]), labels=sprintf("%.2f mio reads class 3", nb_cl3/1e6), cex=1.8, pos=4, col=col[3]) text(x=550, y=0.65*max(data[,1]), labels=sprintf("%.2f mio reads left", nb_left/1e6), cex=1.8, pos=4) # shade the class areas # class 1 rect(size[i_cl1_1], 0, size[i_cl1_2], max(data$nb), col=rgb(red=1, green=0, blue=0, alpha=0.1), border="transparent") # class 2 rect(size[i_cl2_1], 0, size[i_cl2_2], max(data$nb), col=rgb(red=0, green=0, blue=1, alpha=0.1), border="transparent") # class 3 rect(size[i_cl3_1], 0, size[i_cl3_2], max(data$nb), col=rgb(red=0, green=1, blue=0, alpha=0.1), border="transparent") dev.off()