diff --git a/.gitignore b/.gitignore
index 0e81523..6a4e9ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,45 +1,44 @@
# This file is used to ignore files which are generated
# ----------------------------------------------------------------------------
-
*~
*.autosave
*.a
*.core
*.moc
*.o
*.obj
*.orig
*.rej
*.so
*.so.*
*_pch.h.cpp
*_resource.rc
*.qm
.#*
*.*#
core
!core/
tags
.DS_Store
.directory
*.debug
Makefile*
*.prl
*.app
moc_*.cpp
ui_*.h
qrc_*.cpp
Thumbs.db
*.res
*.rc
.Rhistory
.RData
/.qmake.cache
/.qmake.stash
bin/
CMakeFiles/
data/
results/
+lib/
CMakeCache.txt
cmake_install.cmake
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f1eca7..0827422 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,37 +1,34 @@
-# project
-project(scATACseq)
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.0.0)
-# static libraries
+# project name; only the C++ (CXX) language is enabled
+project(scATACseq CXX)
+
+
+# libraries
## boost library
set(BOOST_INCLUDEDIR "/usr/local/include/boost/")
set(BOOST_LIBRARYDIR "/usr/local/lib/boost")
find_package(Boost 1.65 COMPONENTS program_options REQUIRED)
## UnitTest++ library
## TODO write a FindUnitTest++.cmake file to use find_package()
find_library(UNITTEST_LIB
NAMES "UnitTest++"
PATHS "/usr/local/lib/UnitTest++")
find_path(UNITTEST_INCLUDE
NAMES "UnitTest++.h"
PATHS "/usr/local/include/UnitTest++/")
include_directories(${UNITTEST_INCLUDE})
# link_directories(${UNITTEST_LIB})
-## threads
-find_package(Threads REQUIRED)
+## zlib (for seqan Bam I/O)
+find_package(ZLIB REQUIRED)
+## SeqAn
+find_package (SeqAn REQUIRED)
-# compiler options
-add_compile_options(-std=c++11)
-add_compile_options(-O3)
-add_compile_options(-Wall)
-add_compile_options(-Wextra)
-add_compile_options(-Werror)
-add_compile_options(-Wfatal-errors)
-add_compile_options(-pedantic)
-
+## threads
+find_package(Threads REQUIRED)
add_subdirectory(src)
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..13d7c28
--- /dev/null
+++ b/build.sh
@@ -0,0 +1 @@
+cmake3 -DCMAKE_MODULE_PATH="/local/groux/scATAC-seq/lib/seqan/util/cmake/" -DSEQAN_INCLUDE_PATH="/local/groux/scATAC-seq/lib/seqan/include/" . && make
diff --git a/scripts/10xgenomics_PBMC_5k/.10xgenomics.sh.swp b/scripts/10xgenomics_PBMC_5k/.10xgenomics.sh.swp
new file mode 100644
index 0000000..0688813
Binary files /dev/null and b/scripts/10xgenomics_PBMC_5k/.10xgenomics.sh.swp differ
diff --git a/scripts/10xgenomics_PBMC_5k/.idea/encodings.xml b/scripts/10xgenomics_PBMC_5k/.idea/encodings.xml
new file mode 100644
index 0000000..15a15b2
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/.idea/encodings.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/.idea/libraries/R_User_Library.xml b/scripts/10xgenomics_PBMC_5k/.idea/libraries/R_User_Library.xml
new file mode 100644
index 0000000..71f5ff7
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/.idea/libraries/R_User_Library.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/.idea/misc.xml b/scripts/10xgenomics_PBMC_5k/.idea/misc.xml
new file mode 100644
index 0000000..65531ca
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/.idea/modules.xml b/scripts/10xgenomics_PBMC_5k/.idea/modules.xml
new file mode 100644
index 0000000..bb83e26
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/.idea/scripts.iml b/scripts/10xgenomics_PBMC_5k/.idea/scripts.iml
new file mode 100644
index 0000000..3a4807d
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/.idea/scripts.iml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/.idea/workspace.xml b/scripts/10xgenomics_PBMC_5k/.idea/workspace.xml
new file mode 100644
index 0000000..fc338d5
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/.idea/workspace.xml
@@ -0,0 +1,128 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1549029710156
+
+
+ 1549029710156
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R b/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R
new file mode 100644
index 0000000..3c33c58
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analyse_fragment_lengths.R
@@ -0,0 +1,151 @@
+
+setwd(file.path("", "local", "groux", "scATAC-seq")) # all paths below are relative to this directory
+
+if(!file.exists(file.path("results", "10xgenomics_PBMC_5k")))
+{ dir.create(file.path("results", "10xgenomics_PBMC_5k"), recursive=TRUE) } # recursive: also creates "results" if missing
+
+# library
+library("RColorBrewer") # provides brewer.pal(), used for the plot colors
+
+############# data #############
+
+data = read.table(file.path("data", "10xgenomics_PBMC_5k",
+ "atac_v1_pbmc_5k_possorted_filtered_fragment_lengths.txt"), header=FALSE)
+colnames(data) = c("nb", "size") # 1st column is named "nb", 2nd column "size"
+
+############# fit to gaussian mixture #############
+
+set.seed(20190604) # d-day - 2 ("the long sobs of autumn...")
+# fit the fragment length distribution to a sum of three gaussian-shaped curves
+size = data$size[1:1000] # only the first 1000 rows are modelled
+dens = data$nb[1:1000] / sum(data$nb[1:1000]) # counts rescaled to sum to 1 over these rows
+# model parameters, 1st guess by looking at plot (m: center, s: width term, a: height)
+m1 = 50 ; s1 = 10 ; a1 = 1
+m2 = 200 ; s2 = 10 ; a2 = 1
+m3 = 380 ; s3 = 30 ; a3 = 1
+# fit by non-linear least squares; note that s enters the model as 2*s, not 2*s^2
+init = c(m1=m1, s1=s1, a1=a1,
+ m2=m2, s2=s2, a2=a2,
+ m3=m3, s3=s3, a3=a3)
+f = nls(dens ~ a1 * exp(-((size-m1)**2)/(2*s1)) +
+ a2 * exp(-((size-m2)**2)/(2*s2)) +
+ a3 * exp(-((size-m3)**2)/(2*s3)),
+ start=init)
+# parameter estimates, one row per class, columns m / s / a
+param = matrix(nrow=3, ncol=3)
+colnames(param) = c("m", "s", "a")
+rownames(param) = c("class1", "class2", "class3")
+param[1,] = c(coef(f)["m1"], coef(f)["s1"], coef(f)["a1"])
+param[2,] = c(coef(f)["m2"], coef(f)["s2"], coef(f)["a2"])
+param[3,] = c(coef(f)["m3"], coef(f)["s3"], coef(f)["a3"])
+# plot the observed density, each fitted curve (dashed) and their sum (solid)
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_classes.png"), width=10, height=8, units="in", res=720)
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+ plot(size, dens, type='l', lwd=2,
+ main="Fragment lengths", xlab="length (bp)", ylab="density",
+ cex.main=3, cex.axis=1.5, cex.lab=2.5)
+ col = brewer.pal(4, "Set1") # colors 1-3: one per class, color 4: sum of the three curves
+ lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])), col=col[1], lwd=4, lty=2)
+ lines(size, param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])), col=col[2], lwd=4, lty=2)
+ lines(size, param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[3], lwd=4, lty=2)
+ lines(size, param[1,3] * exp(-((size-param[1,1])**2)/(2*param[1,2])) +
+ param[2,3] * exp(-((size-param[2,1])**2)/(2*param[2,2])) +
+ param[3,3] * exp(-((size-param[3,1])**2)/(2*param[3,2])), col=col[4], lwd=4)
+ legend("topright",
+ legend=c("open chromatin",
+ "mono-nucl.",
+ "di-nucl.",
+ "all"),
+ col=col, lwd=c(4,4,4,4), lty=c(2,2,2,1),
+ bty='n', cex=2)
+dev.off()
+# assign to each fragment length a probability of belonging to each class
+prob = matrix(nrow=1000, ncol=3) # rows: the 1000 values in size, columns: the 3 classes
+rownames(prob) = size
+for(i in 1:nrow(prob))
+{ for(j in 1:ncol(prob))
+ { prob[i,j] = param[j,3] * exp(-((size[i]-param[j,1])**2)/(2*param[j,2])) } # fitted curve j evaluated at size[i]
+ prob[i,] = prob[i,] / sum(prob[i,]) # rescale so that each row sums to 1
+}
+# plot the three class probabilities as a function of fragment length
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_class_prob.png"), width=10, height=8, units="in", res=720)
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+ plot(size, prob[,1], ylim=c(0, max(prob)), type='l',
+ main="Fragment classes", xlab="length (bp)", ylab="p(class)",
+ cex.main=3, cex.axis=1.5, cex.lab=2.5, lwd=4, col=col[1])
+ lines(size, prob[,2], lwd=4, col=col[2])
+ lines(size, prob[,3], lwd=4, col=col[3])
+
+ # hard-coded class limits (original note: "min 90 assignment to a class" - presumably p(class) >= 0.9, TODO confirm)
+ abline(v=30, lwd=2, lty=2) # class 1 lower limit (size limit)
+ abline(v=84, lwd=2, lty=2) # class 1 upper limit
+ abline(v=133, lwd=2, lty=2) # class 2 lower limit
+ abline(v=266, lwd=2, lty=2) # class 2 upper limit
+ abline(v=341, lwd=2, lty=2) # class 3 lower limit
+ abline(v=500, lwd=2, lty=2) # class 3 upper limit (size limit)
+dev.off()
+
+############# break dataset into classes #############
+
+# size limits: index in size of each class limit (same values as the dashed lines drawn above)
+i_cl1_1 = which(size == 30)
+i_cl1_2 = which(size == 84)
+i_cl2_1 = which(size == 133)
+i_cl2_2 = which(size == 266)
+i_cl3_1 = which(size == 341)
+i_cl3_2 = which(size == 500)
+
+# nb of reads per class (sum of the nb column between the two limits, limits included)
+nb_all = sum(data$nb) # all rows of data, not only the first 1000
+nb_cl1 = sum(data$nb[i_cl1_1:i_cl1_2])
+nb_cl2 = sum(data$nb[i_cl2_1:i_cl2_2])
+nb_cl3 = sum(data$nb[i_cl3_1:i_cl3_2])
+# nb of reads not assigned at the boundaries of classes
+nb_left1 = sum(data$nb[(i_cl1_2+1):(i_cl2_1-1)]) + # rows strictly between class 1 and class 2
+ sum(data$nb[(i_cl2_2+1):(i_cl3_1-1)]) # rows strictly between class 2 and class 3
+# nb of reads > 500bp (all rows after the class 3 upper limit)
+nb_left2 = sum(data$nb[(i_cl3_2+1):length(data$nb)])
+nb_left = nb_left1 + nb_left2 # NOTE(review): rows before i_cl1_1, if any, are not counted here
+
+# plot classes: raw counts with the class limits, per-class totals and shaded class areas
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "fragment_lengths_groups.png"), width=10, height=8, units="in", res=720)
+
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+ plot(y=data$nb[1:1000], x=data$size[1:1000], type='l', lwd=4,
+ main="Fragment lengths", xlab="length (bp)", ylab="frequency",
+ cex.main=3, cex.axis=1.5, cex.lab=2.5)
+ # show limits (one color per class, two lines per class)
+ abline(v=data$size[i_cl1_1], lwd=3, lty=2, col=col[1])
+ abline(v=data$size[i_cl1_2], lwd=3, lty=2, col=col[1])
+ abline(v=data$size[i_cl2_1], lwd=3, lty=2, col=col[2])
+ abline(v=data$size[i_cl2_2], lwd=3, lty=2, col=col[2])
+ abline(v=data$size[i_cl3_1], lwd=3, lty=2, col=col[3])
+ abline(v=data$size[i_cl3_2], lwd=3, lty=2, col=col[3])
+ # nb of reads in groups, printed in millions, stacked from 85% down to 65% of the highest count
+ text(x=550, y=0.85*max(data[,1]), labels=sprintf("%.2f mio reads", nb_all/1e6), cex=1.8, pos=4)
+ text(x=550, y=0.80*max(data[,1]), labels=sprintf("%.2f mio reads class 1", nb_cl1/1e6), cex=1.8, pos=4, col=col[1])
+ text(x=550, y=0.75*max(data[,1]), labels=sprintf("%.2f mio reads class 2", nb_cl2/1e6), cex=1.8, pos=4, col=col[2])
+ text(x=550, y=0.70*max(data[,1]), labels=sprintf("%.2f mio reads class 3", nb_cl3/1e6), cex=1.8, pos=4, col=col[3])
+ text(x=550, y=0.65*max(data[,1]), labels=sprintf("%.2f mio reads left", nb_left/1e6), cex=1.8, pos=4)
+ # shade the class areas (semi-transparent rectangles spanning the full height)
+ # class 1 (red)
+ rect(size[i_cl1_1],
+ 0,
+ size[i_cl1_2],
+ max(data$nb),
+ col=rgb(red=1, green=0, blue=0, alpha=0.1), border="transparent")
+ # class 2 (blue)
+ rect(size[i_cl2_1],
+ 0,
+ size[i_cl2_2],
+ max(data$nb),
+ col=rgb(red=0, green=0, blue=1, alpha=0.1), border="transparent")
+ # class 3 (green)
+ rect(size[i_cl3_1],
+ 0,
+ size[i_cl3_2],
+ max(data$nb),
+ col=rgb(red=0, green=1, blue=0, alpha=0.1), border="transparent")
+
+dev.off()
+
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.R
new file mode 100644
index 0000000..a67e09a
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.R
@@ -0,0 +1,277 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## aggregations around CTCF motifs ##################
+
+# data: variables are named data.<class>.<bin size in bp>.<method>, following the file names
+# open chromatin (frag: fragment, read: read, atac: read_atac)
+data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin1bp_fragment.mat")))
+data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin2bp_fragment.mat")))
+data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin10bp_fragment.mat")))
+
+data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin1bp_read.mat")))
+data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin2bp_read.mat")))
+data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin10bp_read.mat")))
+
+data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin1bp_read_atac.mat")))
+data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin2bp_read_atac.mat")))
+data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_open_bin10bp_read_atac.mat")))
+
+# mono-nucleosomes (frag: fragment, read: read, cent: fragment_center)
+data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin1bp_fragment.mat")))
+data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin2bp_fragment.mat")))
+data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin10bp_fragment.mat")))
+
+data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin1bp_read.mat")))
+data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin2bp_read.mat")))
+data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin10bp_read.mat")))
+
+data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat")))
+data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin2bp_fragment_center.mat")))
+data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_1nucl_bin10bp_fragment_center.mat")))
+
+# di-nucleosomes (same three methods as for the mono-nucleosomes)
+data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin1bp_fragment.mat")))
+data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin2bp_fragment.mat")))
+data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin10bp_fragment.mat")))
+
+data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin1bp_read.mat")))
+data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin2bp_read.mat")))
+data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin10bp_read.mat")))
+
+data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin1bp_fragment_center.mat")))
+data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin2bp_fragment_center.mat")))
+data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nucl_bin10bp_fragment_center.mat")))
+
+# mono-nucleosomes from di-nucleosome data ("2nuclsplitintwo" files)
+data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat")))
+data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat")))
+data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat")))
+
+data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat")))
+data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat")))
+data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat")))
+
+data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat")))
+data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat")))
+data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat")))
+
+
+# colors: 1 open chromatin, 2 mono-nucl., 3 di-nucl., 4 mono-nucl. from di-nucl. data
+col = brewer.pal(4, "Set1")
+
+# x-axis: 5 evenly spaced ticks from 0 to the number of matrix columns, labelled in bp
+axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5)
+axis.lab.1 = seq(-400, 400, by=200)
+axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5)
+axis.lab.2 = seq(-400, 400, by=200)
+axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5)
+axis.lab.10 = seq(-1000, 1000, by=500)
+
+# X11(width=12, height=12)
+png(filename=file.path("results/10xgenomics_PBMC_5k/ctcf_motifs_10e-6_aggregations.png"),
+ units="in", res=720, width=12, height=9)
+ m = matrix(nrow=4, ncol=4, # figures are filled in the order of the plot() calls below
+ data=c(16,13,14,15, # narrow top row: column titles (13-15); 16 is never drawn
+ 10, 1, 4, 7, # narrow first column: row titles (10-12)
+ 11, 2, 5, 8, # 1-3: 1bp panels, 4-6: 2bp panels, 7-9: 10bp panels
+ 12, 3, 6, 9), byrow=T)
+ l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1))
+ layout.show(l) # draws the outline of the figure regions
+
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+
+ # 1bp resolution (each curve is the column means of one matrix)
+ ## entire fragments
+ plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.open.1.frag), col=col[1], lwd=3) # NOTE(review): redraws the curve already drawn by plot() above
+ lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## entire reads
+ plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution (figures 4 to 6 of the layout; each curve is the column means of one matrix)
+ ## entire fragments
+ plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## entire reads
+ plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+
+ # 10bp resolution (figures 7 to 9 of the layout; each curve is the column means of one matrix)
+ ## entire fragments
+ plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## entire reads
+ plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+
+ # row and column titles: empty plots (col=0, no axes) used only to carry a text label
+ p = par(mar=c(0,0,0,0)) # no margins for the label cells; p keeps the previous margins
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="FRAGMENTS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="READS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 1bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 2bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-1kb by 10bp", cex=2) # the 10bp axis labels run from -1000 to 1000 bp, i.e. 1kb
+
+ par(p) # restore the margins saved above
+dev.off()
+
+
+
+# footprint: zoom on the central columns of the matrices
+# x-axis: 5 ticks labelled -200..200 for each panel, positioned on the panel's own x range
+axis.lab.1 = seq(-200, 200, by=100)
+axis.at.1 = seq(0, 400, length.out=length(axis.lab.1))
+
+axis.lab.2 = seq(-200, 200, by=100)
+axis.at.2 = seq(0, 200, length.out=length(axis.lab.2))
+
+axis.lab.10 = seq(-200, 200, by=100)
+axis.at.10 = seq(0, 41, length.out=length(axis.lab.10))
+
+
+# X11(width=10, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_footprint.png"),
+ units="in", res=720, width=10, height=12)
+ p = par(mfrow=c(3,1), # three panels stacked vertically: 1bp, 2bp, 10bp
+ mar=c(5.1, 5.1, 4.1, 2.1))
+ # 1bp resolution
+ index = 200:600 # columns 200..600 of the 1bp matrices (401 columns)
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])), # divided by its own maximum
+ type='l', lwd=3, col=col[1],
+ main="CTCF motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=191, lwd=3, lty=2) # presumably the motif edges - TODO confirm
+ abline(v=211, lwd=3, lty=2)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution
+ index = 100:300 # columns 100..300 of the 2bp matrices (201 columns)
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])), # divided by its own maximum
+ type='l', lwd=3, col=col[1],
+ main="CTCF motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=96, lwd=3, lty=2) # presumably the motif edges - TODO confirm
+ abline(v=106, lwd=3, lty=2)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 2bp ticks (0..200); the 1bp ticks (0..400) overshoot this panel
+
+ # 10bp resolution
+ index = 80:120 # columns 80..120 of the 10bp matrices (41 columns)
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])), # divided by its own maximum
+ type='l', lwd=3, col=col[1],
+ main="CTCF motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=20, lwd=3, lty=2) # presumably the motif edges - TODO confirm
+ abline(v=22, lwd=3, lty=2)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ par(p) # restore the graphical parameters saved when the panels were set up
+dev.off()
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.sh
new file mode 100755
index 0000000..33391e9
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+## paths, all relative to the directory the script is run from: directories
+results_dir='results/10xgenomics_PBMC_5k'
+data_dir='data/10xgenomics_PBMC_5k'
+## input: motif BED file, then one BAM file + index per fragment size class
+file_bed="$data_dir/ctcf_motifs_10e-6.bed"
+file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam"
+file_bai_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai"
+file_bam_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam"
+file_bai_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai"
+file_bam_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam"
+file_bai_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai"
+
+mkdir -p "$results_dir"
+
+# matrix creation
+## open chromatin around CTCF motif
+for method in 'read' 'read_atac' 'fragment'
+do
+ file_mat_open_1="$results_dir/ctcf_motifs_10e-6_open_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1
+ file_mat_open_2="$results_dir/ctcf_motifs_10e-6_open_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2
+ file_mat_open_10="$results_dir/ctcf_motifs_10e-6_open_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10
+done
+
+## mono around CTCF motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes
+ file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_1nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_1nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_1nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
+
+## di nucleosomes around CTCF motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### di nucleosomes
+ file_mat_2nucl_1="$results_dir/ctcf_motifs_10e-6_2nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1
+ file_mat_2nucl_2="$results_dir/ctcf_motifs_10e-6_2nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2
+ file_mat_2nucl_10="$results_dir/ctcf_motifs_10e-6_2nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10
+done
+
+
+## mono nucleosomes from processed di-nucleosome data around CTCF motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes
+ file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.R b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.R
new file mode 100644
index 0000000..711629c
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.R
@@ -0,0 +1,223 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## aggregations around CTCF motifs ##################
+
+# data: each .mat file is read with read.table() and converted with as.matrix()
+# open chromatin (variables are named data.<fragment class>.<bin size in bp>.<counting method>; atac = "read_atac" files)
+data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin1bp_fragment.mat")))
+data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin2bp_fragment.mat")))
+data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin10bp_fragment.mat")))
+
+data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin1bp_read.mat")))
+data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin2bp_read.mat")))
+data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin10bp_read.mat")))
+
+data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin1bp_read_atac.mat")))
+data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin2bp_read_atac.mat")))
+data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_open_bin10bp_read_atac.mat")))
+
+# mono-nucleosomes (cent = "fragment_center" files)
+data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin1bp_fragment.mat")))
+data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin2bp_fragment.mat")))
+data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin10bp_fragment.mat")))
+
+data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin1bp_read.mat")))
+data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin2bp_read.mat")))
+data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin10bp_read.mat")))
+
+data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin1bp_fragment_center.mat")))
+data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin2bp_fragment_center.mat")))
+data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_1nucl_bin10bp_fragment_center.mat")))
+
+# di-nucleosomes (same three counting methods as the mono-nucleosomes)
+data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin1bp_fragment.mat")))
+data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin2bp_fragment.mat")))
+data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin10bp_fragment.mat")))
+
+data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin1bp_read.mat")))
+data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin2bp_read.mat")))
+data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin10bp_read.mat")))
+
+data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin1bp_fragment_center.mat")))
+data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin2bp_fragment_center.mat")))
+data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nucl_bin10bp_fragment_center.mat")))
+
+# mono-nucleosomes from di-nucleosome data (the "2nuclsplitintwo" files)
+data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_fragment.mat")))
+data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_fragment.mat")))
+data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_fragment.mat")))
+
+data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_read.mat")))
+data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_read.mat")))
+data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_read.mat")))
+
+data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_fragment_center.mat")))
+data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_fragment_center.mat")))
+data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_fragment_center.mat")))
+
+
+# colors: four "Set1" colours, used as col[1]..col[4] by the plotting code below
+col <- brewer.pal(n=4, name="Set1")
+
+# x-axis: tick labels in bp, and as many evenly spaced tick positions over the matrix columns
+axis.lab.1 <- seq(from=-400, to=400, by=200)
+axis.at.1 <- seq(from=0, to=ncol(data.open.1.frag), length.out=length(axis.lab.1))
+axis.lab.2 <- seq(from=-400, to=400, by=200)
+axis.at.2 <- seq(from=0, to=ncol(data.open.2.frag), length.out=length(axis.lab.2))
+axis.lab.10 <- seq(from=-1000, to=1000, by=500)
+axis.at.10 <- seq(from=0, to=ncol(data.open.10.frag), length.out=length(axis.lab.10))
+
+# X11(width=12, height=12)
+png(filename=file.path("results/10xgenomics_PBMC_5k/ctcf_motifs_10e-6_chr1_aggregations.png"),
+ width=12, height=9, units="in", res=720)
+ # 4x4 panel grid: the narrow first row and first column (panels 10 to 16) are kept for labels
+ m = matrix(c(16,13,14,15,
+ 10, 1, 4, 7,
+ 11, 2, 5, 8,
+ 12, 3, 6, 9), nrow=4, ncol=4, byrow=TRUE)
+ l = layout(mat=m, heights=c(0.2, 1, 1, 1), widths=c(0.2, 1, 1, 1))
+ layout.show(l)
+
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+
+ # 1bp resolution: column means of each matrix; count panels share a y-range spanning all four curves
+ ## entire fragments (ylim is computed over every curve so that none is clipped by the first one)
+ ylim = c(0, max(colMeans(data.open.1.frag), colMeans(data.1nucl.1.frag),
+ colMeans(data.2nucl.1.frag), colMeans(data.nucls.1.frag)))
+ plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]",
+ ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0, max(colMeans(data.open.1.read), colMeans(data.1nucl.1.read), colMeans(data.2nucl.1.read), colMeans(data.nucls.1.read)))
+ plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]",
+ ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution: column means of each matrix; count panels share a y-range spanning all four curves
+ ## entire fragments (ylim is computed over every curve so that none is clipped by the first one)
+ ylim = c(0, max(colMeans(data.open.2.frag), colMeans(data.1nucl.2.frag), colMeans(data.2nucl.2.frag), colMeans(data.nucls.2.frag)))
+ plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]",
+ ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0, max(colMeans(data.open.2.read), colMeans(data.1nucl.2.read), colMeans(data.2nucl.2.read), colMeans(data.nucls.2.read)))
+ plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]",
+ ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+
+ # 10bp resolution: column means of each matrix; count panels share a y-range spanning all four curves
+ ## entire fragments (ylim is computed over every curve so that none is clipped by the first one)
+ ylim = c(0, max(colMeans(data.open.10.frag), colMeans(data.1nucl.10.frag), colMeans(data.2nucl.10.frag), colMeans(data.nucls.10.frag)))
+ plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]",
+ ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0, max(colMeans(data.open.10.read), colMeans(data.1nucl.10.read), colMeans(data.2nucl.10.read), colMeans(data.nucls.10.read)))
+ plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l', main="", xlab="pos[bp]",
+ ylab="Nb of reads", xaxt='n', ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+
+ # row and column labels, drawn as text in otherwise empty panels (p is not reassigned so par(p) restores the saved margins)
+ par(mar=c(0,0,0,0))
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="FRAGMENTS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="READS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 1bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 2bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-1kb by 10bp", cex=2)
+
+ par(p)
+dev.off()
+
+
+
+# footprint: mean read_atac signal over columns 300 to 500 of the 1bp open-chromatin matrix
+# x-axis: three ticks at 0, 100 and 200 on the 201 plotted columns, labelled -100, 0 and 100
+axis.at.fp = seq(0, 200, length.out=3)
+axis.lab.fp = seq(-100, 100, by=100)
+
+
+# X11(width=8, height=4) -- NOTE(review): dashed guides at x=90 and x=110 presumably bracket the motif; confirm
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "ctcf_motifs_10e-6_chr1_footprint.png"),
+ units="in", res=720, width=8, height=4)
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+ plot(colMeans(data.open.1.atac[,300:500]), type='l', lwd=3, col=col[1],
+ main="CTCF motif", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ cex.axis=2, cex.lab=2)
+ abline(v=90, lwd=3, lty=2)
+ abline(v=110, lwd=3, lty=2)
+ axis(side=1, at=axis.at.fp, labels=axis.lab.fp, cex.axis=1.8)
+ par(p)
+dev.off()
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.sh b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.sh
new file mode 100755
index 0000000..6c21352
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_ctcf_motif_chr1.sh
@@ -0,0 +1,67 @@
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k'
+data_dir='data/10xgenomics_PBMC_5k/'
+## input
+file_bed=$data_dir'/ctcf_motifs_10e-6_chr1.bed'
+file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam"
+file_bai_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai"
+file_bam_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam"
+file_bai_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai"
+file_bam_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam"
+file_bai_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai"
+
+mkdir -p $results_dir
+
+# matrix creation
+## open chromatin around CTCF motif
+for method in 'read' 'read_atac' 'fragment'
+do
+ file_mat_open_1="$results_dir/ctcf_motifs_10e-6_chr1_open_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1
+ file_mat_open_2="$results_dir/ctcf_motifs_10e-6_chr1_open_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2
+ file_mat_open_10="$results_dir/ctcf_motifs_10e-6_chr1_open_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10
+done
+
+## mono nucleosomes around CTCF motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes (expansions are quoted so that each path stays a single argument)
+ file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_chr1_1nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_1nucl" --bai "$file_bai_1nucl" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_1nucl_1"
+ file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_chr1_1nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_1nucl" --bai "$file_bai_1nucl" --from -400 --to 400 --binSize 2 --method "$method" > "$file_mat_1nucl_2"
+ file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_chr1_1nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_1nucl" --bai "$file_bai_1nucl" --from -1000 --to 1000 --binSize 10 --method "$method" > "$file_mat_1nucl_10"
+done
+
+
+## di nucleosomes around CTCF motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### di nucleosomes (expansions are quoted so that each path stays a single argument)
+ file_mat_2nucl_1="$results_dir/ctcf_motifs_10e-6_chr1_2nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_2nucl" --bai "$file_bai_2nucl" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_2nucl_1"
+ file_mat_2nucl_2="$results_dir/ctcf_motifs_10e-6_chr1_2nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_2nucl" --bai "$file_bai_2nucl" --from -400 --to 400 --binSize 2 --method "$method" > "$file_mat_2nucl_2"
+ file_mat_2nucl_10="$results_dir/ctcf_motifs_10e-6_chr1_2nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_2nucl" --bai "$file_bai_2nucl" --from -1000 --to 1000 --binSize 10 --method "$method" > "$file_mat_2nucl_10"
+done
+
+
+## mono nucleosomes from processed di-nucleosome data around CTCF motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes ("splitintwo" BAM); expansions are quoted so that each path stays a single argument
+ file_mat_1nucl_1="$results_dir/ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_1nucl2" --bai "$file_bai_1nucl2" --from -400 --to 400 --binSize 1 --method "$method" > "$file_mat_1nucl_1"
+ file_mat_1nucl_2="$results_dir/ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_1nucl2" --bai "$file_bai_1nucl2" --from -400 --to 400 --binSize 2 --method "$method" > "$file_mat_1nucl_2"
+ file_mat_1nucl_10="$results_dir/ctcf_motifs_10e-6_chr1_2nuclsplitintwo_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed "$file_bed" --bam "$file_bam_1nucl2" --bai "$file_bai_1nucl2" --from -1000 --to 1000 --binSize 10 --method "$method" > "$file_mat_1nucl_10"
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.R
new file mode 100644
index 0000000..2504612
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.R
@@ -0,0 +1,307 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## aggregations around ebf1 motifs ##################
+
+# data: each .mat file is read with read.table() and converted with as.matrix()
+# open chromatin (variables are named data.<fragment class>.<bin size in bp>.<counting method>; atac = "read_atac" files)
+data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin1bp_fragment.mat")))
+data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin2bp_fragment.mat")))
+data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin10bp_fragment.mat")))
+
+data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin1bp_read.mat")))
+data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin2bp_read.mat")))
+data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin10bp_read.mat")))
+
+data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin1bp_read_atac.mat")))
+data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin2bp_read_atac.mat")))
+data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_open_bin10bp_read_atac.mat")))
+
+# mono-nucleosomes (cent = "fragment_center" files)
+data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin1bp_fragment.mat")))
+data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin2bp_fragment.mat")))
+data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin10bp_fragment.mat")))
+
+data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin1bp_read.mat")))
+data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin2bp_read.mat")))
+data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin10bp_read.mat")))
+
+data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat")))
+data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center.mat")))
+data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_1nucl_bin10bp_fragment_center.mat")))
+
+# di-nucleosomes (same three counting methods as the mono-nucleosomes)
+data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin1bp_fragment.mat")))
+data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin2bp_fragment.mat")))
+data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin10bp_fragment.mat")))
+
+data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin1bp_read.mat")))
+data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin2bp_read.mat")))
+data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin10bp_read.mat")))
+
+data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin1bp_fragment_center.mat")))
+data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin2bp_fragment_center.mat")))
+data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nucl_bin10bp_fragment_center.mat")))
+
+# mono-nucleosomes from di-nucleosome data (the "2nuclsplitintwo" files)
+data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat")))
+data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat")))
+data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat")))
+
+data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat")))
+data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat")))
+data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat")))
+
+data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat")))
+data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat")))
+data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat")))
+
+
+# colors: four "Set1" colours, used as col[1]..col[4] by the plotting code below
+col <- brewer.pal(n=4, name="Set1")
+
+# x-axis: tick labels in bp, and as many evenly spaced tick positions over the matrix columns
+axis.lab.1 <- seq(from=-400, to=400, by=200)
+axis.at.1 <- seq(from=0, to=ncol(data.open.1.frag), length.out=length(axis.lab.1))
+axis.lab.2 <- seq(from=-400, to=400, by=200)
+axis.at.2 <- seq(from=0, to=ncol(data.open.2.frag), length.out=length(axis.lab.2))
+axis.lab.10 <- seq(from=-1000, to=1000, by=500)
+axis.at.10 <- seq(from=0, to=ncol(data.open.10.frag), length.out=length(axis.lab.10))
+
+# X11(width=12, height=12)
+png(filename=file.path("results/10xgenomics_PBMC_5k/ebf1_motifs_10e-6_aggregations.png"),
+ width=12, height=9, units="in", res=720)
+ # 4x4 panel grid: the narrow first row and first column (panels 10 to 16) are kept for labels
+ m = matrix(c(16,13,14,15,
+ 10, 1, 4, 7,
+ 11, 2, 5, 8,
+ 12, 3, 6, 9), nrow=4, ncol=4, byrow=TRUE)
+ l = layout(mat=m, heights=c(0.2, 1, 1, 1), widths=c(0.2, 1, 1, 1))
+ layout.show(l)
+
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1))
+
+ # 1bp resolution: column means of each matrix
+ ## entire fragments
+ ## (the y-range spans all four curves so that none is clipped)
+ ylim = c(0,
+ max(colMeans(data.open.1.frag),
+ colMeans(data.1nucl.1.frag),
+ colMeans(data.2nucl.1.frag),
+ colMeans(data.nucls.1.frag)))
+ plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0,
+ max(colMeans(data.open.1.read),
+ colMeans(data.1nucl.1.read),
+ colMeans(data.2nucl.1.read),
+ colMeans(data.nucls.1.read)))
+ plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution: column means of each matrix, one curve per fragment class
+ ## entire fragments
+ ylim = c(0,
+ max(colMeans(data.open.2.frag),
+ colMeans(data.1nucl.2.frag),
+ colMeans(data.2nucl.2.frag),
+ colMeans(data.nucls.2.frag)))
+ plot(colMeans(data.open.2.frag), type='l', lwd=3, col=col[1],
+ xaxt='n', xlab="pos[bp]", ylab="Nb of reads", main="",
+ cex.lab=2, cex.axis=2, ylim=ylim)
+ lines(colMeans(data.1nucl.2.frag), lwd=3, col=col[2])
+ lines(colMeans(data.2nucl.2.frag), lwd=3, col=col[3])
+ lines(colMeans(data.nucls.2.frag), lwd=3, col=col[4])
+ axis(side=1, labels=axis.lab.2, at=axis.at.2, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0,
+ max(colMeans(data.open.2.read),
+ colMeans(data.1nucl.2.read),
+ colMeans(data.2nucl.2.read),
+ colMeans(data.nucls.2.read)))
+ plot(colMeans(data.open.2.read), type='l', lwd=3, col=col[1],
+ xaxt='n', xlab="pos[bp]", ylab="Nb of reads", main="",
+ cex.lab=2, cex.axis=2, ylim=ylim)
+ lines(colMeans(data.1nucl.2.read), lwd=3, col=col[2])
+ lines(colMeans(data.2nucl.2.read), lwd=3, col=col[3])
+ lines(colMeans(data.nucls.2.read), lwd=3, col=col[4])
+ axis(side=1, labels=axis.lab.2, at=axis.at.2, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)),
+ type='l', lwd=3, col=col[1],
+ xaxt='n', xlab="pos[bp]", ylab="Prop max signal", main="",
+ cex.lab=2, cex.axis=2)
+ lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)),
+ lwd=3, col=col[2])
+ lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)),
+ lwd=3, col=col[3])
+ lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)),
+ lwd=3, col=col[4])
+ axis(side=1, labels=axis.lab.2, at=axis.at.2, cex.axis=1.8)
+
+ # 10bp resolution: column means of each matrix, one curve per fragment class
+ ## entire fragments
+ ylim = c(0,
+ max(colMeans(data.open.10.frag),
+ colMeans(data.1nucl.10.frag),
+ colMeans(data.2nucl.10.frag),
+ colMeans(data.nucls.10.frag)))
+ plot(colMeans(data.open.10.frag), type='l', lwd=3, col=col[1],
+ xaxt='n', xlab="pos[bp]", ylab="Nb of reads", main="",
+ cex.lab=2, cex.axis=2, ylim=ylim)
+ lines(colMeans(data.1nucl.10.frag), lwd=3, col=col[2])
+ lines(colMeans(data.2nucl.10.frag), lwd=3, col=col[3])
+ lines(colMeans(data.nucls.10.frag), lwd=3, col=col[4])
+ axis(side=1, labels=axis.lab.10, at=axis.at.10, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0,
+ max(colMeans(data.open.10.read),
+ colMeans(data.1nucl.10.read),
+ colMeans(data.2nucl.10.read),
+ colMeans(data.nucls.10.read)))
+ plot(colMeans(data.open.10.read), type='l', lwd=3, col=col[1],
+ xaxt='n', xlab="pos[bp]", ylab="Nb of reads", main="",
+ cex.lab=2, cex.axis=2, ylim=ylim)
+ lines(colMeans(data.1nucl.10.read), lwd=3, col=col[2])
+ lines(colMeans(data.2nucl.10.read), lwd=3, col=col[3])
+ lines(colMeans(data.nucls.10.read), lwd=3, col=col[4])
+ axis(side=1, labels=axis.lab.10, at=axis.at.10, cex.axis=1.8)
+ ## atac reads and centers (each curve is divided by its own maximum)
+ plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)),
+ type='l', lwd=3, col=col[1],
+ xaxt='n', xlab="pos[bp]", ylab="Prop max signal", main="",
+ cex.lab=2, cex.axis=2)
+ lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)),
+ lwd=3, col=col[2])
+ lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)),
+ lwd=3, col=col[3])
+ lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)),
+ lwd=3, col=col[4])
+ axis(side=1, labels=axis.lab.10, at=axis.at.10, cex.axis=1.8)
+
+ # row and column labels, drawn as text in otherwise empty panels (p is not reassigned so par(p) restores the saved margins)
+ par(mar=c(0,0,0,0))
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="FRAGMENTS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="READS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 1bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 2bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-1kb by 10bp", cex=2)
+
+ par(p)
+dev.off()
+
+
+
+# footprint
+# x-axis: the same bp labels for every resolution, with tick positions spread over that panel's x range
+axis.lab.1 <- seq(from=-200, to=200, by=100)
+axis.at.1 <- seq(from=0, to=400, length.out=length(axis.lab.1))
+
+axis.lab.2 <- seq(from=-200, to=200, by=100)
+axis.at.2 <- seq(from=0, to=200, length.out=length(axis.lab.2))
+
+axis.lab.10 <- seq(from=-200, to=200, by=100)
+axis.at.10 <- seq(from=0, to=41, length.out=length(axis.lab.10))
+
+
+# X11(width=10, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "ebf1_motifs_10e-6_footprint.png"),
+ width=10, height=12, units="in", res=720)
+ # three stacked panels, one per resolution
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1), mfrow=c(3,1))
+ # 1bp resolution: columns 200 to 600 of the 1bp matrices, each curve divided by its own maximum
+ index = 200:600
+ x = seq_along(index)
+ plot(x=x,
+ y=colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])),
+ main="EBF1 motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ col=col[1], lwd=3, type='l',
+ cex.main=2, cex.lab=2, cex.axis=2)
+ lines(x=x,
+ y=colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])),
+ col=col[2], lwd=3)
+ lines(x=x,
+ y=colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])),
+ col=col[4], lwd=3)
+ # two dashed vertical guides
+ abline(v=c(191, 211), lwd=3, lty=2)
+ axis(side=1, labels=axis.lab.1, at=axis.at.1, cex.axis=1.8)
+
+ # 2bp resolution: 201 plotted bins, so the ticks must come from axis.at.2/axis.lab.2 (0..200), not the 1bp definition
+ index = 100:300
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="EBF1 motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=96, lwd=3, lty=2)
+ abline(v=106, lwd=3, lty=2)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+
+ # 10bp resolution: columns 80 to 120 of the 10bp matrices, each curve divided by its own maximum
+ index = 80:120
+ x = seq_along(index)
+ plot(x=x,
+ y=colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])),
+ main="EBF1 motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ col=col[1], lwd=3, type='l',
+ cex.main=2, cex.lab=2, cex.axis=2)
+ lines(x=x,
+ y=colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])),
+ col=col[2], lwd=3)
+ lines(x=x,
+ y=colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])),
+ col=col[4], lwd=3)
+ # two dashed vertical guides
+ abline(v=c(20, 22), lwd=3, lty=2)
+ axis(side=1, labels=axis.lab.10, at=axis.at.10, cex.axis=1.8)
+ par(p)
+dev.off()
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.sh
new file mode 100755
index 0000000..90b1aa3
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_ebf1_motif.sh
@@ -0,0 +1,67 @@
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k'
+data_dir='data/10xgenomics_PBMC_5k/'
+## input
+file_bed=$data_dir'/ebf1_motifs_10e-6.bed'
+file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam"
+file_bai_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai"
+file_bam_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam"
+file_bai_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai"
+file_bam_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam"
+file_bai_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai"
+
+mkdir -p $results_dir
+
+# matrix creation
+## open chromatin around ebf1 motif
+for method in 'read' 'read_atac' 'fragment'
+do
+ file_mat_open_1="$results_dir/ebf1_motifs_10e-6_open_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1
+ file_mat_open_2="$results_dir/ebf1_motifs_10e-6_open_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2
+ file_mat_open_10="$results_dir/ebf1_motifs_10e-6_open_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10
+done
+
+## mono around ebf1 motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes
+ file_mat_1nucl_1="$results_dir/ebf1_motifs_10e-6_1nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/ebf1_motifs_10e-6_1nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/ebf1_motifs_10e-6_1nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
+
+## di nucleosomes around ebf1 motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### di nucleosomes
+ file_mat_2nucl_1="$results_dir/ebf1_motifs_10e-6_2nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1
+ file_mat_2nucl_2="$results_dir/ebf1_motifs_10e-6_2nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2
+ file_mat_2nucl_10="$results_dir/ebf1_motifs_10e-6_2nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10
+done
+
+
+## mono nucleosomes from processed di-nucleosome data around ebf1 motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes
+ file_mat_1nucl_1="$results_dir/ebf1_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/ebf1_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/ebf1_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.R b/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.R
new file mode 100644
index 0000000..6fcdefb
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.R
@@ -0,0 +1,307 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq")) # NOTE(review): hard-coded absolute working directory; every relative path below resolves against it
+
+# libraries
+library(RColorBrewer) # provides brewer.pal(), used for the curve colors
+
+# functions
+source(file.path("scripts", "functions.R")) # NOTE(review): nothing defined there appears to be called in this script -- confirm it is still needed
+
+
+################## aggregations around myc motifs ##################
+
+# data: matrices written by analysis_myc_motif.sh; variables are named data.<class>.<bin size in bp>.<method>
+# open chromatin (frag = "fragment", read = "read", atac = "read_atac" matrices)
+data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin1bp_fragment.mat")))
+data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin2bp_fragment.mat")))
+data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin10bp_fragment.mat")))
+
+data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin1bp_read.mat")))
+data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin2bp_read.mat")))
+data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin10bp_read.mat")))
+
+data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin1bp_read_atac.mat")))
+data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin2bp_read_atac.mat")))
+data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_open_bin10bp_read_atac.mat")))
+
+# mono-nucleosomes (cent = "fragment_center" matrices)
+data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin1bp_fragment.mat")))
+data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin2bp_fragment.mat")))
+data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin10bp_fragment.mat")))
+
+data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin1bp_read.mat")))
+data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin2bp_read.mat")))
+data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin10bp_read.mat")))
+
+data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat")))
+data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin2bp_fragment_center.mat")))
+data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_1nucl_bin10bp_fragment_center.mat")))
+
+# di-nucleosomes
+data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin1bp_fragment.mat")))
+data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin2bp_fragment.mat")))
+data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin10bp_fragment.mat")))
+
+data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin1bp_read.mat")))
+data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin2bp_read.mat")))
+data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin10bp_read.mat")))
+
+data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin1bp_fragment_center.mat")))
+data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin2bp_fragment_center.mat")))
+data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nucl_bin10bp_fragment_center.mat")))
+
+# mono-nucleosomes from di-nucleosome data ("2nuclsplitintwo" matrices)
+data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment.mat")))
+data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment.mat")))
+data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment.mat")))
+
+data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_read.mat")))
+data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_read.mat")))
+data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_read.mat")))
+
+data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin1bp_fragment_center.mat")))
+data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin2bp_fragment_center.mat")))
+data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_2nuclsplitintwo_bin10bp_fragment_center.mat")))
+
+
+# colors: col[1] = open chromatin, col[2] = mono-nucleosomes, col[3] = di-nucleosomes, col[4] = mono-nucleosomes from di-nucleosome data
+col = brewer.pal(4, "Set1")
+
+# x-axis: five tick positions spread over the matrix columns, labelled in bp with the --from/--to ranges used in analysis_myc_motif.sh
+axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5)
+axis.lab.1 = seq(-400, 400, by=200)
+axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5)
+axis.lab.2 = seq(-400, 400, by=200)
+axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5)
+axis.lab.10 = seq(-1000, 1000, by=500)
+
+# X11(width=12, height=12)
+png(filename=file.path("results/10xgenomics_PBMC_5k/myc_motifs_10e-6_aggregations.png"),
+ units="in", res=720, width=12, height=9)
+ m = matrix(nrow=4, ncol=4,
+ data=c(16,13,14,15, # numbers give the plotting order: 13-15 are the column labels, 16 (corner) is never drawn
+ 10, 1, 4, 7, # 10-12 are the row labels; 1-9 are the curve panels, filled column by column
+ 11, 2, 5, 8,
+ 12, 3, 6, 9), byrow=T)
+ l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) # narrow first row and first column hold the labels
+ layout.show(l) # NOTE(review): looks like a layout preview left in -- confirm it is wanted when writing to the PNG device
+
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1)) # previous par settings are kept in p
+
+ # 1bp resolution (layout panels 1-3)
+ ## entire fragments: column means of the four fragment classes on a shared y range
+ ylim = c(0,max(max(colMeans(data.open.1.frag)),
+ max(colMeans(data.open.1.frag)), # NOTE(review): same value as the previous argument; redundant but harmless
+ max(colMeans(data.1nucl.1.frag)),
+ max(colMeans(data.2nucl.1.frag)),
+ max(colMeans(data.nucls.1.frag))))
+ plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.open.1.frag), col=col[1], lwd=3) # redraws the curve plot() has just drawn
+ lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## entire reads: same layout as the fragment panel
+ ylim = c(0,max(max(colMeans(data.open.1.read)),
+ max(colMeans(data.open.1.read)), # NOTE(review): same value as the previous argument; redundant but harmless
+ max(colMeans(data.1nucl.1.read)),
+ max(colMeans(data.2nucl.1.read)),
+ max(colMeans(data.nucls.1.read))))
+ plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## atac reads and centers: each curve is divided by its own maximum
+ plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution (layout panels 4-6)
+ ## entire fragments: column means of the four fragment classes on a shared y range
+ ylim = c(0,max(max(colMeans(data.open.2.frag)),
+ max(colMeans(data.open.2.frag)), # NOTE(review): same value as the previous argument; redundant but harmless
+ max(colMeans(data.1nucl.2.frag)),
+ max(colMeans(data.2nucl.2.frag)),
+ max(colMeans(data.nucls.2.frag))))
+ plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## entire reads: same layout as the fragment panel
+ ylim = c(0,max(max(colMeans(data.open.2.read)),
+ max(colMeans(data.open.2.read)), # NOTE(review): same value as the previous argument; redundant but harmless
+ max(colMeans(data.1nucl.2.read)),
+ max(colMeans(data.2nucl.2.read)),
+ max(colMeans(data.nucls.2.read))))
+ plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## atac reads and centers: each curve is divided by its own maximum
+ plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+
+ # 10bp resolution (layout panels 7-9)
+ ## entire fragments: column means of the four fragment classes on a shared y range
+ ylim = c(0,max(max(colMeans(data.open.10.frag)),
+ max(colMeans(data.open.10.frag)), # NOTE(review): same value as the previous argument; redundant but harmless
+ max(colMeans(data.1nucl.10.frag)),
+ max(colMeans(data.2nucl.10.frag)),
+ max(colMeans(data.nucls.10.frag))))
+ plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## entire reads: same layout as the fragment panel
+ ylim = c(0,max(max(colMeans(data.open.10.read)),
+ max(colMeans(data.open.10.read)), # NOTE(review): same value as the previous argument; redundant but harmless
+ max(colMeans(data.1nucl.10.read)),
+ max(colMeans(data.2nucl.10.read)),
+ max(colMeans(data.nucls.10.read))))
+ plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## atac reads and centers: each curve is divided by its own maximum
+ plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+
+ # text-only panels: three rotated row labels (layout panels 10-12), then three column labels (13-15)
+ par(mar=c(0,0,0,0)) # no margins for the label panels; p is left untouched so par(p) restores the settings saved before the first panel
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="FRAGMENTS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="READS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 1bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 2bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-1kb by 10bp", cex=2)
+
+ par(p)
+dev.off()
+
+
+
+# footprint: zoom on the central columns of each matrix; the axis.* variables are redefined for these narrower windows
+# x-axis: tick positions are counted in bins of the zoomed window, labels are in bp
+axis.lab.1 = seq(-200, 200, by=100)
+axis.at.1 = seq(0, 400, length.out=length(axis.lab.1)) # 1bp panel plots columns 200:600 (401 bins)
+
+axis.lab.2 = seq(-200, 200, by=100)
+axis.at.2 = seq(0, 200, length.out=length(axis.lab.2)) # 2bp panel plots columns 100:300 (201 bins)
+
+axis.lab.10 = seq(-200, 200, by=100)
+axis.at.10 = seq(0, 41, length.out=length(axis.lab.10)) # 10bp panel plots columns 80:120 (41 bins)
+
+
+# X11(width=10, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "myc_motifs_10e-6_footprint.png"),
+ units="in", res=720, width=10, height=12)
+ p = par(mfrow=c(3,1),
+ mar=c(5.1, 5.1, 4.1, 2.1))
+ # 1bp resolution (columns 200:600, i.e. 401 bins); every curve is divided by its own maximum
+ index = 200:600
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="myc motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=191, lwd=3, lty=2)
+ abline(v=211, lwd=3, lty=2)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution (columns 100:300, i.e. 201 bins)
+ index = 100:300
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="myc motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=96, lwd=3, lty=2)
+ abline(v=106, lwd=3, lty=2)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 2bp ticks (0..200); the 1bp ticks (0..400) would put the 0bp label at the right edge of this panel
+
+ # 10bp resolution (columns 80:120, i.e. 41 bins)
+ index = 80:120
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="myc motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=20, lwd=3, lty=2)
+ abline(v=22, lwd=3, lty=2)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ par(p)
+dev.off()
\ No newline at end of file
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.sh b/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.sh
new file mode 100755
index 0000000..29a3f24
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_myc_motif.sh
@@ -0,0 +1,67 @@
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k'
+data_dir='data/10xgenomics_PBMC_5k/'
+## input
+file_bed=$data_dir'/myc_motifs_10e-6.bed'
+file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam"
+file_bai_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai"
+file_bam_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam"
+file_bai_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai"
+file_bam_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam"
+file_bai_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai"
+
+mkdir -p $results_dir
+
+# matrix creation
+## open chromatin around myc motif
+for method in 'read' 'read_atac' 'fragment'
+do
+ file_mat_open_1="$results_dir/myc_motifs_10e-6_open_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1
+ file_mat_open_2="$results_dir/myc_motifs_10e-6_open_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2
+ file_mat_open_10="$results_dir/myc_motifs_10e-6_open_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10
+done
+
+## mono around myc motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes
+ file_mat_1nucl_1="$results_dir/myc_motifs_10e-6_1nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/myc_motifs_10e-6_1nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/myc_motifs_10e-6_1nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
+
+## di nucleosomes around myc motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### di nucleosomes
+ file_mat_2nucl_1="$results_dir/myc_motifs_10e-6_2nucl_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1
+ file_mat_2nucl_2="$results_dir/myc_motifs_10e-6_2nucl_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2
+ file_mat_2nucl_10="$results_dir/myc_motifs_10e-6_2nucl_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10
+done
+
+
+## mono nucleosomes from processed di-nucleosome data around myc motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes
+ file_mat_1nucl_1="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin1bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin2bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/myc_motifs_10e-6_2nuclsplitintwo_bin10bp_$method.mat"
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.R b/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.R
new file mode 100644
index 0000000..5011e4b
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.R
@@ -0,0 +1,307 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq")) # hard-coded project root; every relative path below is resolved from here
+
+# libraries (RColorBrewer provides brewer.pal(), used for the curve colors)
+library(RColorBrewer)
+
+# project helper functions, loaded relative to the working directory set above
+source(file.path("scripts", "functions.R"))
+
+
+################## aggregations around sp1 motifs ##################
+
+# data
+# open chromatin
+data.open.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin1bp_fragment.mat")))
+data.open.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin2bp_fragment.mat")))
+data.open.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin10bp_fragment.mat")))
+
+data.open.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin1bp_read.mat")))
+data.open.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin2bp_read.mat")))
+data.open.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin10bp_read.mat")))
+
+data.open.1.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin1bp_read_atac.mat")))
+data.open.2.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin2bp_read_atac.mat")))
+data.open.10.atac = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_open_bin10bp_read_atac.mat")))
+
+# mono-nucleosomes
+data.1nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin1bp_fragment.mat")))
+data.1nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin2bp_fragment.mat")))
+data.1nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin10bp_fragment.mat")))
+
+data.1nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin1bp_read.mat")))
+data.1nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin2bp_read.mat")))
+data.1nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin10bp_read.mat")))
+
+data.1nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat")))
+data.1nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin2bp_fragment_center.mat")))
+data.1nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_1nucl_bin10bp_fragment_center.mat")))
+
+# di-nucleosomes
+data.2nucl.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin1bp_fragment.mat")))
+data.2nucl.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin2bp_fragment.mat")))
+data.2nucl.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin10bp_fragment.mat")))
+
+data.2nucl.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin1bp_read.mat")))
+data.2nucl.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin2bp_read.mat")))
+data.2nucl.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin10bp_read.mat")))
+
+data.2nucl.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin1bp_fragment_center.mat")))
+data.2nucl.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin2bp_fragment_center.mat")))
+data.2nucl.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nucl_bin10bp_fragment_center.mat")))
+
+# mono-nucleosomes from di-nucleosome data
+data.nucls.1.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_fragment.mat")))
+data.nucls.2.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_fragment.mat")))
+data.nucls.10.frag = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_fragment.mat")))
+
+data.nucls.1.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_read.mat")))
+data.nucls.2.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_read.mat")))
+data.nucls.10.read = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_read.mat")))
+
+data.nucls.1.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_fragment_center.mat")))
+data.nucls.2.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_fragment_center.mat")))
+data.nucls.10.cent = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_fragment_center.mat")))
+
+
+# colors: 1 = open chromatin, 2 = mono-nucleosomes, 3 = di-nucleosomes, 4 = mono-nucleosomes from split di-nucleosomes
+col = brewer.pal(4, "Set1")
+
+# x-axis: 5 ticks spread evenly over the matrix columns, labelled with positions in bp
+axis.at.1 = seq(0, ncol(data.open.1.frag), length.out =5)
+axis.lab.1 = seq(-400, 400, by=200)
+axis.at.2 = seq(0, ncol(data.open.2.frag), length.out =5)
+axis.lab.2 = seq(-400, 400, by=200)
+axis.at.10 = seq(0, ncol(data.open.10.frag), length.out=5)
+axis.lab.10 = seq(-1000, 1000, by=500)
+
+# X11(width=12, height=12)
+png(filename=file.path("results/10xgenomics_PBMC_5k/sp1_motifs_10e-7_aggregations.png"),
+ units="in", res=720, width=12, height=9)
+ m = matrix(nrow=4, ncol=4, # cell values give the order in which the plots below fill the grid
+ data=c(16,13,14,15, # top row: corner cell (16, never drawn) and column titles (13-15)
+ 10, 1, 4, 7, # first column (10-12): row titles; 1-9: data panels, one resolution per column
+ 11, 2, 5, 8,
+ 12, 3, 6, 9), byrow=T)
+ l = layout(mat=m, widths=c(0.2, 1, 1, 1), heights=c(0.2, 1, 1, 1)) # narrow first row/column hold the titles
+ layout.show(l)
+
+ p = par(mar=c(5.1, 5.1, 4.1, 2.1)) # p keeps the previous graphical parameters
+
+ # 1bp resolution
+ ## entire fragments
+ ylim = c(0,max(max(colMeans(data.open.1.frag)),
+ max(colMeans(data.open.1.frag)),
+ max(colMeans(data.1nucl.1.frag)),
+ max(colMeans(data.2nucl.1.frag)),
+ max(colMeans(data.nucls.1.frag))))
+ plot(colMeans(data.open.1.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.open.1.frag), col=col[1], lwd=3)
+ lines(colMeans(data.1nucl.1.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0,max(max(colMeans(data.open.1.read)),
+ max(colMeans(data.open.1.read)),
+ max(colMeans(data.1nucl.1.read)),
+ max(colMeans(data.2nucl.1.read)),
+ max(colMeans(data.nucls.1.read))))
+ plot(colMeans(data.open.1.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+ ## atac reads and centers
+ plot(colMeans(data.open.1.atac)/max(colMeans(data.open.1.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.1.cent)/max(colMeans(data.1nucl.1.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.1.cent)/max(colMeans(data.2nucl.1.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.1.cent)/max(colMeans(data.nucls.1.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution
+ ## entire fragments
+ ylim = c(0,max(max(colMeans(data.open.2.frag)),
+ max(colMeans(data.open.2.frag)),
+ max(colMeans(data.1nucl.2.frag)),
+ max(colMeans(data.2nucl.2.frag)),
+ max(colMeans(data.nucls.2.frag))))
+ plot(colMeans(data.open.2.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0,max(max(colMeans(data.open.2.read)),
+ max(colMeans(data.open.2.read)),
+ max(colMeans(data.1nucl.2.read)),
+ max(colMeans(data.2nucl.2.read)),
+ max(colMeans(data.nucls.2.read))))
+ plot(colMeans(data.open.2.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+ ## atac reads and centers
+ plot(colMeans(data.open.2.atac)/max(colMeans(data.open.2.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.2.cent)/max(colMeans(data.1nucl.2.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.2.cent)/max(colMeans(data.2nucl.2.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.2.cent)/max(colMeans(data.nucls.2.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8)
+
+ # 10bp resolution
+ ## entire fragments
+ ylim = c(0,max(max(colMeans(data.open.10.frag)),
+ max(colMeans(data.open.10.frag)),
+ max(colMeans(data.1nucl.10.frag)),
+ max(colMeans(data.2nucl.10.frag)),
+ max(colMeans(data.nucls.10.frag))))
+ plot(colMeans(data.open.10.frag), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.frag), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.frag), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.frag), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## entire reads
+ ylim = c(0,max(max(colMeans(data.open.10.read)),
+ max(colMeans(data.open.10.read)),
+ max(colMeans(data.1nucl.10.read)),
+ max(colMeans(data.2nucl.10.read)),
+ max(colMeans(data.nucls.10.read))))
+ plot(colMeans(data.open.10.read), col=col[1], lwd=3, type='l',
+ main="", xlab="pos[bp]", ylab="Nb of reads", xaxt='n',
+ ylim=ylim, cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.read), col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.read), col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.read), col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ ## atac reads and centers
+ plot(colMeans(data.open.10.atac)/max(colMeans(data.open.10.atac)),
+ col=col[1], lwd=3, type='l', xaxt='n',
+ main="", xlab="pos[bp]", ylab="Prop max signal",
+ cex.axis=2, cex.lab=2)
+ lines(colMeans(data.1nucl.10.cent)/max(colMeans(data.1nucl.10.cent)),
+ col=col[2], lwd=3)
+ lines(colMeans(data.2nucl.10.cent)/max(colMeans(data.2nucl.10.cent)),
+ col=col[3], lwd=3)
+ lines(colMeans(data.nucls.10.cent)/max(colMeans(data.nucls.10.cent)),
+ col=col[4], lwd=3)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+
+ # row and column titles: each one is text centered in an empty plot, drawn in layout cells 10 to 15
+ par(mar=c(0,0,0,0)) # no margins for the title cells; p (saved before the panels) is left untouched
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="FRAGMENTS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="READS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="EDGES/CENTERS", cex=2, srt=90)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 1bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-400bp by 2bp", cex=2)
+
+ plot(0, 0, col=0, main="", xlab="", ylab="", xaxt='n', yaxt='n')
+ text(0, 0, labels="+/-1kb by 10bp", cex=2)
+
+ par(p) # restore the graphical parameters saved before the first panel
+dev.off()
+
+
+
+# x-axis of the footprint plots: labels in bp and tick positions in columns of the zoomed window
+axis.lab.1 = seq(-200, 200, by=100)
+axis.at.1 = seq(0, 400, length.out=length(axis.lab.1)) # 1bp panel, x runs over 401 columns
+
+axis.lab.2 = seq(-200, 200, by=100)
+axis.at.2 = seq(0, 200, length.out=length(axis.lab.2)) # 2bp panel, x runs over 201 columns
+
+axis.lab.10 = seq(-200, 200, by=100)
+axis.at.10 = seq(0, 41, length.out=length(axis.lab.10)) # 10bp panel, x runs over 41 columns
+
+
+# X11(width=10, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k", "sp1_motifs_10e-7_footprint.png"),
+ units="in", res=720, width=10, height=12)
+ p = par(mfrow=c(3,1), # three stacked panels, one per resolution; p keeps the previous parameters
+ mar=c(5.1, 5.1, 4.1, 2.1))
+ # 1bp resolution: each curve is the column mean divided by its own maximum
+ index = 200:600 # columns of the 1bp matrices kept for the zoom (401 columns)
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.1.atac[,index])/max(colMeans(data.open.1.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="SP1 motif 1bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.1.cent[,index])/max(colMeans(data.1nucl.1.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.1.cent[,index])/max(colMeans(data.nucls.1.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=191, lwd=3, lty=2) # dashed lines 10 columns either side of the window center (x=201)
+ abline(v=211, lwd=3, lty=2) # NOTE(review): presumably the motif boundaries - confirm
+ axis(side=1, at=axis.at.1, labels=axis.lab.1, cex.axis=1.8)
+
+ # 2bp resolution: each curve is the column mean divided by its own maximum
+ index = 100:300 # columns of the 2bp matrices kept for the zoom (201 columns)
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.2.atac[,index])/max(colMeans(data.open.2.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="SP1 motif 2bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.2.cent[,index])/max(colMeans(data.1nucl.2.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.2.cent[,index])/max(colMeans(data.nucls.2.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=96, lwd=3, lty=2) # dashed lines 5 columns either side of the window center (x=101)
+ abline(v=106, lwd=3, lty=2)
+ axis(side=1, at=axis.at.2, labels=axis.lab.2, cex.axis=1.8) # 2bp ticks (0..200); the 1bp ticks (0..400) overshoot x
+
+ # 10bp resolution: each curve is the column mean divided by its own maximum
+ index = 80:120 # columns of the 10bp matrices kept for the zoom (41 columns)
+ x = 1:length(index)
+ plot(x,
+ colMeans(data.open.10.atac[,index])/max(colMeans(data.open.10.atac[,index])),
+ type='l', lwd=3, col=col[1],
+ main="SP1 motif 10bp", xlab="pos[bp]", ylab="Prop max signal", xaxt='n',
+ cex.axis=2, cex.lab=2, cex.main=2)
+ lines(x,
+ colMeans(data.1nucl.10.cent[,index])/max(colMeans(data.1nucl.10.cent[,index])),
+ lwd=3, col=col[2])
+ lines(x,
+ colMeans(data.nucls.10.cent[,index])/max(colMeans(data.nucls.10.cent[,index])),
+ lwd=3, col=col[4])
+ abline(v=20, lwd=3, lty=2) # dashed lines 1 column either side of the window center (x=21)
+ abline(v=22, lwd=3, lty=2)
+ axis(side=1, at=axis.at.10, labels=axis.lab.10, cex.axis=1.8)
+ par(p) # restore the graphical parameters saved when the panels were set up
+dev.off()
+
diff --git a/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.sh
new file mode 100755
index 0000000..26e2c69
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/analysis_sp1_motif.sh
@@ -0,0 +1,67 @@
+# some paths (relative to the directory the script is run from)
+## directories (no trailing slash: every path below adds its own separator)
+results_dir='results/10xgenomics_PBMC_5k'
+data_dir='data/10xgenomics_PBMC_5k'
+## input: regions to analyse (BED) and one BAM/BAI pair per fragment size class
+file_bed="$data_dir/sp1_motifs_10e-7.bed"
+file_bam_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam"
+file_bai_open="$data_dir/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam.bai"
+file_bam_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam"
+file_bai_1nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam.bai"
+file_bam_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam"
+file_bai_2nucl="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam.bai"
+file_bam_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam"
+file_bai_1nucl2="$data_dir/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam.bai"
+
+mkdir -p "$results_dir"
+
+# matrix creation (each matrix is the standard output of CorrelationMatrixCreator redirected to a .mat file)
+## open chromatin around sp1 motif, for each counting method, at three resolutions
+for method in 'read' 'read_atac' 'fragment'
+do
+ file_mat_open_1="$results_dir/sp1_motifs_10e-7_open_bin1bp_$method.mat" # -400..400, 1bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 1 --method $method > $file_mat_open_1
+ file_mat_open_2="$results_dir/sp1_motifs_10e-7_open_bin2bp_$method.mat" # -400..400, 2bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -400 --to 400 --binSize 2 --method $method > $file_mat_open_2
+ file_mat_open_10="$results_dir/sp1_motifs_10e-7_open_bin10bp_$method.mat" # -1000..1000, 10bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_open --bai $file_bai_open --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_open_10
+done
+
+## mono nucleosomes around sp1 motif, for each counting method, at three resolutions
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes (133-266bp fragments)
+ file_mat_1nucl_1="$results_dir/sp1_motifs_10e-7_1nucl_bin1bp_$method.mat" # -400..400, 1bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/sp1_motifs_10e-7_1nucl_bin2bp_$method.mat" # -400..400, 2bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/sp1_motifs_10e-7_1nucl_bin10bp_$method.mat" # -1000..1000, 10bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl --bai $file_bai_1nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
+
+## di nucleosomes around sp1 motif, for each counting method, at three resolutions
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### di nucleosomes (341-500bp fragments)
+ file_mat_2nucl_1="$results_dir/sp1_motifs_10e-7_2nucl_bin1bp_$method.mat" # -400..400, 1bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 1 --method $method > $file_mat_2nucl_1
+ file_mat_2nucl_2="$results_dir/sp1_motifs_10e-7_2nucl_bin2bp_$method.mat" # -400..400, 2bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -400 --to 400 --binSize 2 --method $method > $file_mat_2nucl_2
+ file_mat_2nucl_10="$results_dir/sp1_motifs_10e-7_2nucl_bin10bp_$method.mat" # -1000..1000, 10bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_2nucl --bai $file_bai_2nucl --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_2nucl_10
+done
+
+
+## mono nucleosomes from processed di-nucleosome data around sp1 motif
+for method in 'read' 'fragment' 'fragment_center'
+do
+ ### mono nucleosomes from the split-in-two BAM (the file_mat_1nucl_* names are reused with new output paths)
+ file_mat_1nucl_1="$results_dir/sp1_motifs_10e-7_2nuclsplitintwo_bin1bp_$method.mat" # -400..400, 1bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 1 --method $method > $file_mat_1nucl_1
+ file_mat_1nucl_2="$results_dir/sp1_motifs_10e-7_2nuclsplitintwo_bin2bp_$method.mat" # -400..400, 2bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -400 --to 400 --binSize 2 --method $method > $file_mat_1nucl_2
+ file_mat_1nucl_10="$results_dir/sp1_motifs_10e-7_2nuclsplitintwo_bin10bp_$method.mat" # -1000..1000, 10bp bins
+ bin/CorrelationMatrixCreator --bed $file_bed --bam $file_bam_1nucl2 --bai $file_bai_1nucl2 --from -1000 --to 1000 --binSize 10 --method $method > $file_mat_1nucl_10
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k/process_data.sh b/scripts/10xgenomics_PBMC_5k/process_data.sh
new file mode 100755
index 0000000..755fe66
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/process_data.sh
@@ -0,0 +1,19 @@
+mkdir -p data/10xgenomics_PBMC_5k
+
+# download the 10xGenomics 5k PBMC scATAC-seq dataset (position-sorted BAM)
+wget -O data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted.bam http://s3-us-west-2.amazonaws.com/10x.files/samples/cell-atac/1.0.1/atac_v1_pbmc_5k/atac_v1_pbmc_5k_possorted_bam.bam
+# download some barcode information
+wget -O data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_singlecell.csv http://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_5k/atac_v1_pbmc_5k_singlecell.csv
+# download their peaks, prefix the chromosome names with "chr", keep only these lines and sort them (regexes are quoted so the shell cannot glob-expand them)
+wget -O data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed http://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_5k/atac_v1_pbmc_5k_peaks.bed
+sed -E 's/^([0-9XY]+)/chr\1/' data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed | grep -E '^chr' | sort -k 1,1V -k2,2n -k3,3n > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_sort.bed
+mv data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_sort.bed data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed
+# get only peaks on chr1
+grep -E '^chr1[[:space:]]' data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks.bed > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_peaks_chr1.bed
+
+# extract the barcodes corresponding to cells, based on 10XGenomics analysis (first CSV column of the lines matching _cell_<number>)
+grep -E '_cell_[0-9]+' data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_singlecell.csv | cut -d ',' -f 1 > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_barcodes.txt
+
+# filter out reads which do not have a proper barcode
+python3.6 scripts/bam_tools/filter_bam.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted.bam --tag CB --values data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_barcodes.txt -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam
+
diff --git a/scripts/10xgenomics_PBMC_5k/run_all.sh b/scripts/10xgenomics_PBMC_5k/run_all.sh
new file mode 100755
index 0000000..e961146
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/run_all.sh
@@ -0,0 +1,11 @@
+
+# download the data, filter them and split by fragment size
+mkdir -p data/10xgenomics_PBMC_5k
+scripts/10xgenomics_PBMC_5k/process_data.sh
+scripts/10xgenomics_PBMC_5k/split_by_size.sh
+
+
+# analyse chromosome 1: build the matrices with the shell script, then run the R analysis on them
+scripts/10xgenomics_PBMC_5k/analysis_chr1.sh
+Rscript scripts/10xgenomics_PBMC_5k/analysis_chr1.R
+
diff --git a/scripts/10xgenomics_PBMC_5k/split_by_size.sh b/scripts/10xgenomics_PBMC_5k/split_by_size.sh
new file mode 100755
index 0000000..9df7482
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k/split_by_size.sh
@@ -0,0 +1,26 @@
+mkdir -p data/10xgenomics_PBMC_5k
+
+# get fragment lengths: count how often each positive value of SAM column 9 occurs, sorted by length; taken from https://dbrg77.wordpress.com/2017/02/10/atac-seq-insert-size-plotting/
+samtools view data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam | awk '$9>0' | cut -f 9 | sort | uniq -c | sort -k 2,2n | sed -e 's/^[ \t]*//' > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_fragment_lengths.txt
+
+# analyse the fragment lengths: extract the 30-149bp fragments (this BAM is not indexed below)
+python3.6 scripts/bam_tools/split_by_length.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-149bp.bam --length 30-149
+
+# based on fragment length analysis, separate the fragments as follows:
+# 30 - 84bp : open chromatin fragments
+# 133 - 266bp : mono-nucleosome fragments
+# 341 - 500bp : di-nucleosome fragments
+python3.6 scripts/bam_tools/split_by_length.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam --length 30-84
+python3.6 scripts/bam_tools/split_by_length.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam --length 133-266
+python3.6 scripts/bam_tools/split_by_length.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered.bam -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam --length 341-500
+
+# split di-nucleosome fragments into mononucleosome fragments, then sort the result and replace the unsorted file with it
+python3.6 scripts/bam_tools/split_in_two.py -i data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam -o data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam
+samtools sort data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam > data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo_sort.bam
+mv data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo_sort.bam data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam
+
+# index the open chromatin, mono-nucleosome, di-nucleosome and split-in-two BAM files
+samtools index data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_30-84bp.bam
+samtools index data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_133-266bp.bam
+samtools index data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp.bam
+samtools index data/10xgenomics_PBMC_5k/atac_v1_pbmc_5k_possorted_filtered_341-500bp_splitintwo.bam
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R
new file mode 100644
index 0000000..0d80131
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq")) # every relative path below is resolved against this hard-coded directory
+
+# libraries (RColorBrewer provides brewer.pal(), used for the plot colours)
+library(RColorBrewer)
+
+# functions (presumably where read.references(), distance.ref() and plot.references() are defined - confirm)
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around ctcf motifs ##################
+
+# open chromatin: for each number of classes K, load the class references and class probabilities (<K>class_ref.mat), the 1nucl fragment center references written for the same run, and the AIC file
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 1
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_1class_ref.mat"))
+open.1.ref = data$references
+open.1.prob = data$prob
+open.1.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_1class_1nucl_fragment_center_ref.mat"))$ref # NOTE(review): $ref presumably reaches the "references" element through partial name matching - confirm
+open.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_2class_ref.mat")) # K = 2
+open.2.ref = data$references
+open.2.prob = data$prob
+open.2.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_2class_1nucl_fragment_center_ref.mat"))$ref
+open.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_3class_ref.mat")) # K = 3
+open.3.ref = data$references
+open.3.prob = data$prob
+open.3.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_3class_1nucl_fragment_center_ref.mat"))$ref
+open.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_4class_ref.mat")) # K = 4
+open.4.ref = data$references
+open.4.prob = data$prob
+open.4.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_4class_1nucl_fragment_center_ref.mat"))$ref
+open.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_5class_ref.mat")) # K = 5: references and probabilities of the 5-class run, consistent with the 5class files read just below
+open.5.ref = data$references
+open.5.prob = data$prob
+open.5.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_5class_1nucl_fragment_center_ref.mat"))$ref
+open.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_6class_ref.mat")) # K = 6
+open.6.ref = data$references
+open.6.prob = data$prob
+open.6.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_6class_1nucl_fragment_center_ref.mat"))$ref
+open.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_7class_ref.mat")) # K = 7
+open.7.ref = data$references
+open.7.prob = data$prob
+open.7.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_7class_1nucl_fragment_center_ref.mat"))$ref
+open.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_8class_ref.mat")) # K = 8
+open.8.ref = data$references
+open.8.prob = data$prob
+open.8.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_8class_1nucl_fragment_center_ref.mat"))$ref
+open.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_9class_ref.mat")) # K = 9
+open.9.ref = data$references
+open.9.prob = data$prob
+open.9.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_9class_1nucl_fragment_center_ref.mat"))$ref
+open.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_10class_ref.mat")) # K = 10
+open.10.ref = data$references
+open.10.prob = data$prob
+open.10.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_open_bin1bp_read_atac_10class_1nucl_fragment_center_ref.mat"))$ref
+open.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_open_bin1bp_read_atac_10class_aic.txt")))
+
+data = NULL # clear the temporary; the fields used later were copied into the open.K.* variables
+
+
+# plot 10 classes: one panel per class of the K=10 open chromatin run, the open chromatin reference (col[1]) overlaid with the 1nucl reference (col[2]), each scaled by its own maximum
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_classification_open_bin1bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+    m = matrix(1:10, nrow=5, ncol=2, byrow=FALSE) # 5x2 panel grid, filled column by column
+    layout(m)
+    # order from most to least probable class
+    ord = order(open.10.prob, decreasing=TRUE)
+    ref.open = open.10.ref[ord,]
+    ref.nucl = open.10.ref.nucl[ord,]
+    prob = open.10.prob[ord]
+    class = ord # original class index of each reordered row
+    for(i in seq_len(nrow(ref.open)))
+    {
+        plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+             main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+        lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+    }
+dev.off()
+
+
+# plot all classes: stack the references found by every run (K=10 down to K=1) and pass them, with the AIC values and the pairwise distances, to plot.references()
+ref = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref, # runs are listed from K=10 down to K=1
+ open.5.ref, open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+ open.5.prob, open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic, # c() flattens the per-run matrices into one vector, same run order as ref and prob
+ open.5.aic, open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors: the first Set1 colour, repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) # column count taken from the first run; assumes every run has the same number of columns - confirm
+run_value = vector(length=n_class_tot) # index (in ref) of the run each row comes from
+k_value = vector(length=n_class_tot) # row index of the class inside its own run; not read again in this script
+probabilities = vector(length=n_class_tot) # class probability of each row
+k = 1 # next row of references to fill
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_classification_open_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around ctcf motifs ##################
+
+# nucleosomes: for each number of classes K, load the class references and class probabilities (<K>class_ref.mat), the open chromatin references written for the same run, and the AIC file
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 1
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_ref.mat"))
+nucl.1.ref = data$references
+nucl.1.prob = data$prob
+nucl.1.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_open_read_atac_ref.mat"))$ref # NOTE(review): $ref presumably reaches the "references" element through partial name matching - confirm
+nucl.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 2
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_ref.mat"))
+nucl.2.ref = data$references
+nucl.2.prob = data$prob
+nucl.2.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_open_read_atac_ref.mat"))$ref
+nucl.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 3
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_ref.mat"))
+nucl.3.ref = data$references
+nucl.3.prob = data$prob
+nucl.3.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_open_read_atac_ref.mat"))$ref
+nucl.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 4
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_ref.mat"))
+nucl.4.ref = data$references
+nucl.4.prob = data$prob
+nucl.4.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_open_read_atac_ref.mat"))$ref
+nucl.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 5
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_ref.mat"))
+nucl.5.ref = data$references
+nucl.5.prob = data$prob
+nucl.5.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_open_read_atac_ref.mat"))$ref
+nucl.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 6
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_ref.mat"))
+nucl.6.ref = data$references
+nucl.6.prob = data$prob
+nucl.6.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_open_read_atac_ref.mat"))$ref
+nucl.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 7
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_ref.mat"))
+nucl.7.ref = data$references
+nucl.7.prob = data$prob
+nucl.7.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_open_read_atac_ref.mat"))$ref
+nucl.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 8
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_ref.mat"))
+nucl.8.ref = data$references
+nucl.8.prob = data$prob
+nucl.8.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_open_read_atac_ref.mat"))$ref
+nucl.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 9
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_ref.mat"))
+nucl.9.ref = data$references
+nucl.9.prob = data$prob
+nucl.9.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_open_read_atac_ref.mat"))$ref
+nucl.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 10
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_ref.mat"))
+nucl.10.ref = data$references
+nucl.10.prob = data$prob
+nucl.10.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_open_read_atac_ref.mat"))$ref
+nucl.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_aic.txt")))
+
+data = NULL # clear the temporary; the fields used later were copied into the nucl.K.* variables
+
+# plot 10 classes: one panel per class of the K=10 nucleosome run, the nucleosome reference (col[2]) overlaid with the open chromatin reference (col[1]), each scaled by its own maximum
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "ctcf_motifs_10e-6_classification_1nucl_bin1bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+    m = matrix(1:10, nrow=5, ncol=2, byrow=FALSE) # 5x2 panel grid, filled column by column
+    layout(m)
+    # order from most to least probable class
+    ord = order(nucl.10.prob, decreasing=TRUE)
+    ref.nucl = nucl.10.ref[ord,]
+    ref.open = nucl.10.ref.open[ord,]
+    prob = nucl.10.prob[ord]
+    class = ord # original class index of each reordered row
+    for(i in seq_len(nrow(ref.nucl)))
+    {
+        plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+             main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+        lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+    }
+dev.off()
+
+
+# plot all classes: stack the references found by every run (K=10 down to K=1) and pass them, with the AIC values and the pairwise distances, to plot.references()
+ref = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref, # runs are listed from K=10 down to K=1
+ nucl.5.ref, nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+ nucl.5.prob, nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic, # c() flattens the per-run matrices into one vector, same run order as ref and prob
+ nucl.5.aic, nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors: the first Set1 colour, repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) # column count taken from the first run; assumes every run has the same number of columns - confirm
+run_value = vector(length=n_class_tot) # index (in ref) of the run each row comes from
+k_value = vector(length=n_class_tot) # row index of the class inside its own run; not read again in this script
+probabilities = vector(length=n_class_tot) # class probability of each row
+k = 1 # next row of references to fill
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "ctcf_motifs_10e-6_classification_1nucl_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh
new file mode 100755
index 0000000..89cd8c9
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ctcf_motif.sh
@@ -0,0 +1,47 @@
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k_classification_1'
+data_dir='results/10xgenomics_PBMC_5k'
+## input
+file_mat_open="$results_dir/ctcf_motifs_10e-6_open_bin1bp_read_atac.mat"
+file_mat_1nucl="$results_dir/ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
+## file with seeds
+file_seed=$results_dir'/ctcf_motifs_10e-6_seed.txt'
+
+mkdir -p $results_dir
+touch $file_seed
+
+# parameters
+n_iter='20'
+n_shift='21'
+seeding='random'
+n_core=3
+
+# open chromatin
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
+ file_prob=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
+ file_ref1=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_ref.mat'
+ file_ref2=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_ref.mat'
+ file_aic=$results_dir/'ctcf_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mat_open --prob $file_prob --parallel $n_core 1> $file_ref1 2> $file_aic
+ bin/probToRef --data $file_mat_1nucl --prob $file_prob --parallel $n_core 1> $file_ref2 2> /dev/null
+done
+
+# 1nucl chromatin
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
+ file_prob=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
+ file_ref1=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_ref.mat'
+ file_ref2=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_ref.mat'
+ file_aic=$results_dir/'ctcf_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mat_1nucl --prob $file_prob --parallel $n_core 1> $file_ref1 2> $file_aic
+ bin/probToRef --data $file_mat_open --prob $file_prob --parallel $n_core 1> $file_ref2 2> /dev/null
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.R
new file mode 100644
index 0000000..dfc9dfa
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq")) # every relative path below is resolved against this hard-coded directory
+
+# libraries (RColorBrewer provides brewer.pal(), used for the plot colours)
+library(RColorBrewer)
+
+# functions (presumably where read.references(), distance.ref() and plot.references() are defined - confirm)
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around EBF1 motifs ##################
+
+# open chromatin: for each number of classes K, load the class references and class probabilities (<K>class_ref.mat), the 1nucl fragment center references written for the same run, and the AIC file
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 1
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_1class_ref.mat"))
+open.1.ref = data$references
+open.1.prob = data$prob
+open.1.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_1class_1nucl_fragment_center_ref.mat"))$ref # NOTE(review): $ref presumably reaches the "references" element through partial name matching - confirm
+open.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_2class_ref.mat")) # K = 2
+open.2.ref = data$references
+open.2.prob = data$prob
+open.2.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_2class_1nucl_fragment_center_ref.mat"))$ref
+open.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_3class_ref.mat")) # K = 3
+open.3.ref = data$references
+open.3.prob = data$prob
+open.3.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_3class_1nucl_fragment_center_ref.mat"))$ref
+open.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_4class_ref.mat")) # K = 4
+open.4.ref = data$references
+open.4.prob = data$prob
+open.4.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_4class_1nucl_fragment_center_ref.mat"))$ref
+open.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_5class_ref.mat")) # K = 5: references and probabilities of the 5-class run, consistent with the 5class files read just below
+open.5.ref = data$references
+open.5.prob = data$prob
+open.5.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_5class_1nucl_fragment_center_ref.mat"))$ref
+open.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_6class_ref.mat")) # K = 6
+open.6.ref = data$references
+open.6.prob = data$prob
+open.6.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_6class_1nucl_fragment_center_ref.mat"))$ref
+open.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_7class_ref.mat")) # K = 7
+open.7.ref = data$references
+open.7.prob = data$prob
+open.7.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_7class_1nucl_fragment_center_ref.mat"))$ref
+open.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_8class_ref.mat")) # K = 8
+open.8.ref = data$references
+open.8.prob = data$prob
+open.8.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_8class_1nucl_fragment_center_ref.mat"))$ref
+open.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_9class_ref.mat")) # K = 9
+open.9.ref = data$references
+open.9.prob = data$prob
+open.9.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_9class_1nucl_fragment_center_ref.mat"))$ref
+open.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_10class_ref.mat")) # K = 10
+open.10.ref = data$references
+open.10.prob = data$prob
+open.10.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_open_bin1bp_read_atac_10class_1nucl_fragment_center_ref.mat"))$ref
+open.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_open_bin1bp_read_atac_10class_aic.txt")))
+
+data = NULL # clear the temporary; the fields used later were copied into the open.K.* variables
+
+
+# plot 10 classes: one panel per class of the K=10 open chromatin run, the open chromatin reference (col[1]) overlaid with the 1nucl reference (col[2]), each scaled by its own maximum
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_classification_open_bin1bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+    m = matrix(1:10, nrow=5, ncol=2, byrow=FALSE) # 5x2 panel grid, filled column by column
+    layout(m)
+    # order from most to least probable class
+    ord = order(open.10.prob, decreasing=TRUE)
+    ref.open = open.10.ref[ord,]
+    ref.nucl = open.10.ref.nucl[ord,]
+    prob = open.10.prob[ord]
+    class = ord # original class index of each reordered row
+    for(i in seq_len(nrow(ref.open)))
+    {
+        plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+             main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+        lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+    }
+dev.off()
+
+
+# plot all classes: stack the references found by every run (K=10 down to K=1) and pass them, with the AIC values and the pairwise distances, to plot.references()
+ref = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref, # runs are listed from K=10 down to K=1
+ open.5.ref, open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+ open.5.prob, open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic, # c() flattens the per-run matrices into one vector, same run order as ref and prob
+ open.5.aic, open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors: the first Set1 colour, repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) # column count taken from the first run; assumes every run has the same number of columns - confirm
+run_value = vector(length=n_class_tot) # index (in ref) of the run each row comes from
+k_value = vector(length=n_class_tot) # row index of the class inside its own run; not read again in this script
+probabilities = vector(length=n_class_tot) # class probability of each row
+k = 1 # next row of references to fill
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_classification_open_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around EBF1 motifs ##################
+
+# nucleosomes: for each number of classes K, load the class references and class probabilities (<K>class_ref.mat), the open chromatin references written for the same run, and the AIC file
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 1
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_ref.mat"))
+nucl.1.ref = data$references
+nucl.1.prob = data$prob
+nucl.1.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_open_read_atac_ref.mat"))$ref # NOTE(review): $ref presumably reaches the "references" element through partial name matching - confirm
+nucl.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 2
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_ref.mat"))
+nucl.2.ref = data$references
+nucl.2.prob = data$prob
+nucl.2.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_open_read_atac_ref.mat"))$ref
+nucl.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 3
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_ref.mat"))
+nucl.3.ref = data$references
+nucl.3.prob = data$prob
+nucl.3.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_open_read_atac_ref.mat"))$ref
+nucl.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 4
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_ref.mat"))
+nucl.4.ref = data$references
+nucl.4.prob = data$prob
+nucl.4.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_open_read_atac_ref.mat"))$ref
+nucl.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 5
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_ref.mat"))
+nucl.5.ref = data$references
+nucl.5.prob = data$prob
+nucl.5.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_open_read_atac_ref.mat"))$ref
+nucl.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 6
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_ref.mat"))
+nucl.6.ref = data$references
+nucl.6.prob = data$prob
+nucl.6.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_open_read_atac_ref.mat"))$ref
+nucl.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 7
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_ref.mat"))
+nucl.7.ref = data$references
+nucl.7.prob = data$prob
+nucl.7.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_open_read_atac_ref.mat"))$ref
+nucl.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 8
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_ref.mat"))
+nucl.8.ref = data$references
+nucl.8.prob = data$prob
+nucl.8.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_open_read_atac_ref.mat"))$ref
+nucl.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 9
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_ref.mat"))
+nucl.9.ref = data$references
+nucl.9.prob = data$prob
+nucl.9.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_open_read_atac_ref.mat"))$ref
+nucl.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", # K = 10
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_ref.mat"))
+nucl.10.ref = data$references
+nucl.10.prob = data$prob
+nucl.10.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_open_read_atac_ref.mat"))$ref
+nucl.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_aic.txt")))
+
+data = NULL # clear the temporary; the fields used later were copied into the nucl.K.* variables
+
+# plot 10 classes: one panel per class of the K=10 nucleosome run, the nucleosome reference (col[2]) overlaid with the open chromatin reference (col[1]), each scaled by its own maximum
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "ebf1_motifs_10e-6_classification_1nucl_bin1bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+    m = matrix(1:10, nrow=5, ncol=2, byrow=FALSE) # 5x2 panel grid, filled column by column
+    layout(m)
+    # order from most to least probable class
+    ord = order(nucl.10.prob, decreasing=TRUE)
+    ref.nucl = nucl.10.ref[ord,]
+    ref.open = nucl.10.ref.open[ord,]
+    prob = nucl.10.prob[ord]
+    class = ord # original class index of each reordered row
+    for(i in seq_len(nrow(ref.nucl)))
+    {
+        plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+             main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+        lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+    }
+dev.off()
+
+
+# plot all classes: stack the references found by every run (K=10 down to K=1) and pass them, with the AIC values and the pairwise distances, to plot.references()
+ref = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref, # runs are listed from K=10 down to K=1
+ nucl.5.ref, nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+ nucl.5.prob, nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic, # c() flattens the per-run matrices into one vector, same run order as ref and prob
+ nucl.5.aic, nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors: the first Set1 colour, repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]])) # column count taken from the first run; assumes every run has the same number of columns - confirm
+run_value = vector(length=n_class_tot) # index (in ref) of the run each row comes from
+k_value = vector(length=n_class_tot) # row index of the class inside its own run; not read again in this script
+probabilities = vector(length=n_class_tot) # class probability of each row
+k = 1 # next row of references to fill
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "ebf1_motifs_10e-6_classification_1nucl_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh
new file mode 100755
index 0000000..0974ce0
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_ebf1_motif.sh
@@ -0,0 +1,47 @@
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k_classification_1'
+data_dir='results/10xgenomics_PBMC_5k'
+## input
+file_mat_open="$results_dir/ebf1_motifs_10e-6_open_bin1bp_read_atac.mat"
+file_mat_1nucl="$results_dir/ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
+## file with seeds
+file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt'
+
+mkdir -p $results_dir
+touch $file_seed
+
+# parameters
+n_iter='20'
+n_shift='21'
+seeding='random'
+n_core=3
+
+# open chromatin
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
+ file_prob=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
+ file_ref1=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_ref.mat'
+ file_ref2=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_ref.mat'
+ file_aic=$results_dir/'ebf1_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mat_open --prob $file_prob --parallel $n_core 1> $file_ref1 2> $file_aic
+ bin/probToRef --data $file_mat_1nucl --prob $file_prob --parallel $n_core 1> $file_ref2 2> /dev/null
+done
+
+# 1nucl chromatin
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
+ file_prob=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
+ file_ref1=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_ref.mat'
+ file_ref2=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_ref.mat'
+ file_aic=$results_dir/'ebf1_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mat_1nucl --prob $file_prob --parallel $n_core 1> $file_ref1 2> $file_aic
+ bin/probToRef --data $file_mat_open --prob $file_prob --parallel $n_core 1> $file_ref2 2> /dev/null
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.R
new file mode 100644
index 0000000..3cc9d49
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around myc motifs ##################
+
+# open chromatin
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_1class_ref.mat"))
+open.1.ref = data$references
+open.1.prob = data$prob
+open.1.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_1class_1nucl_fragment_center_ref.mat"))$ref
+open.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_2class_ref.mat"))
+open.2.ref = data$references
+open.2.prob = data$prob
+open.2.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_2class_1nucl_fragment_center_ref.mat"))$ref
+open.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_3class_ref.mat"))
+open.3.ref = data$references
+open.3.prob = data$prob
+open.3.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_3class_1nucl_fragment_center_ref.mat"))$ref
+open.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_4class_ref.mat"))
+open.4.ref = data$references
+open.4.prob = data$prob
+open.4.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_4class_1nucl_fragment_center_ref.mat"))$ref
+open.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_5class_ref.mat")) # K = 5 run: the same run as the 5class nucl/aic files read just below
+open.5.ref = data$references
+open.5.prob = data$prob
+open.5.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_5class_1nucl_fragment_center_ref.mat"))$ref
+open.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_6class_ref.mat"))
+open.6.ref = data$references
+open.6.prob = data$prob
+open.6.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_6class_1nucl_fragment_center_ref.mat"))$ref
+open.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_7class_ref.mat"))
+open.7.ref = data$references
+open.7.prob = data$prob
+open.7.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_7class_1nucl_fragment_center_ref.mat"))$ref
+open.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_8class_ref.mat"))
+open.8.ref = data$references
+open.8.prob = data$prob
+open.8.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_8class_1nucl_fragment_center_ref.mat"))$ref
+open.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_9class_ref.mat"))
+open.9.ref = data$references
+open.9.prob = data$prob
+open.9.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_9class_1nucl_fragment_center_ref.mat"))$ref
+open.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_10class_ref.mat"))
+open.10.ref = data$references
+open.10.prob = data$prob
+open.10.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_open_bin1bp_read_atac_10class_1nucl_fragment_center_ref.mat"))$ref
+open.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_open_bin1bp_read_atac_10class_aic.txt")))
+
+data = NULL
+
+
+# plot the 10-class run, one panel per class: open reference (col[1]) with the 1nucl reference (col[2]) overlaid, each divided by its own maximum
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12) -- on-screen device, left disabled; the figure is written with png() instead
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_classification_open_bin1bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F) # 5 x 2 panel grid, filled column by column
+ layout(m)
+ # sort classes by decreasing probability; 'class' keeps each row's original class index for the panel title
+ ord = order(open.10.prob, decreasing=T)
+ ref.open = open.10.ref[ord,]
+ ref.nucl = open.10.ref.nucl[ord,]
+ prob = open.10.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ for(i in 1:nrow(ref.open))
+ {
+ plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+ lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+ }
+dev.off()
+
+
+# collect every run (K = 10 down to 1): references, class probabilities and AIC values, all in the same run order
+ref = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref,
+ open.5.ref, open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+ open.5.prob, open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic,
+ open.5.aic, open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs (one list entry per value of K)
+n_run = length(ref)
+# total number of references over all runs (sum of the per-run row counts)
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# largest number of classes found in a single run
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# one color (first entry of the 9-color "Set1" palette) repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# stack the references of all runs row by row; run_value / k_value record, for each row, its run index and its class index within that run
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references, rows/columns named 1..n (distance.ref is not defined in this script; presumably it comes from scripts/functions.R)
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# draw the summary figure (plot.references is not defined in this script either; presumably from scripts/functions.R)
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_classification_open_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around myc motifs ##################
+
+# nucleosomes
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_ref.mat"))
+nucl.1.ref = data$references
+nucl.1.prob = data$prob
+nucl.1.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_open_read_atac_ref.mat"))$ref
+nucl.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_ref.mat"))
+nucl.2.ref = data$references
+nucl.2.prob = data$prob
+nucl.2.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_open_read_atac_ref.mat"))$ref
+nucl.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_ref.mat"))
+nucl.3.ref = data$references
+nucl.3.prob = data$prob
+nucl.3.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_open_read_atac_ref.mat"))$ref
+nucl.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_ref.mat"))
+nucl.4.ref = data$references
+nucl.4.prob = data$prob
+nucl.4.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_open_read_atac_ref.mat"))$ref
+nucl.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_ref.mat"))
+nucl.5.ref = data$references
+nucl.5.prob = data$prob
+nucl.5.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_open_read_atac_ref.mat"))$ref
+nucl.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_ref.mat"))
+nucl.6.ref = data$references
+nucl.6.prob = data$prob
+nucl.6.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_open_read_atac_ref.mat"))$ref
+nucl.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_ref.mat"))
+nucl.7.ref = data$references
+nucl.7.prob = data$prob
+nucl.7.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_open_read_atac_ref.mat"))$ref
+nucl.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_ref.mat"))
+nucl.8.ref = data$references
+nucl.8.prob = data$prob
+nucl.8.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_open_read_atac_ref.mat"))$ref
+nucl.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_ref.mat"))
+nucl.9.ref = data$references
+nucl.9.prob = data$prob
+nucl.9.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_open_read_atac_ref.mat"))$ref
+nucl.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_ref.mat"))
+nucl.10.ref = data$references
+nucl.10.prob = data$prob
+nucl.10.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_open_read_atac_ref.mat"))$ref
+nucl.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_1nucl_bin1bp_fragment_center_10class_aic.txt")))
+
+data = NULL
+
+# plot the 10-class run, one panel per class: 1nucl reference (col[2]) with the open reference (col[1]) overlaid, each divided by its own maximum
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12) -- on-screen device, left disabled; the figure is written with png() instead
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "myc_motifs_10e-6_classification_1nucl_bin1bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F) # 5 x 2 panel grid, filled column by column
+ layout(m)
+ # sort classes by decreasing probability; 'class' keeps each row's original class index for the panel title
+ ord = order(nucl.10.prob, decreasing=T)
+ ref.nucl = nucl.10.ref[ord,]
+ ref.open = nucl.10.ref.open[ord,]
+ prob = nucl.10.prob[ord]
+ class = c(1:nrow(ref.nucl))[ord]
+ for(i in 1:nrow(ref.nucl))
+ {
+ plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+ lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+ }
+dev.off()
+
+
+# collect every run (K = 10 down to 1): references, class probabilities and AIC values, all in the same run order
+ref = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref,
+ nucl.5.ref, nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+ nucl.5.prob, nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic,
+ nucl.5.aic, nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs (one list entry per value of K)
+n_run = length(ref)
+# total number of references over all runs (sum of the per-run row counts)
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# largest number of classes found in a single run
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# one color (first entry of the 9-color "Set1" palette) repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# stack the references of all runs row by row; run_value / k_value record, for each row, its run index and its class index within that run
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references, rows/columns named 1..n (distance.ref is not defined in this script; presumably it comes from scripts/functions.R)
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# draw the summary figure (plot.references is not defined in this script either; presumably from scripts/functions.R)
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "myc_motifs_10e-6_classification_1nucl_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh
new file mode 100755
index 0000000..bd44d2c
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_myc_motif.sh
@@ -0,0 +1,47 @@
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k_classification_1'
+data_dir='results/10xgenomics_PBMC_5k'
+## input
+file_mat_open="$results_dir/myc_motifs_10e-6_open_bin1bp_read_atac.mat"
+file_mat_1nucl="$results_dir/myc_motifs_10e-6_1nucl_bin1bp_fragment_center.mat"
+## file with seeds
+file_seed=$results_dir'/myc_motifs_10e-6_seed.txt'
+
+mkdir -p $results_dir
+touch $file_seed
+
+# parameters
+n_iter='20'
+n_shift='21'
+seeding='random'
+n_core=3
+
+# open chromatin
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
+ file_prob=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_prob.mat4d'
+ file_ref1=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_ref.mat'
+ file_ref2=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_1nucl_fragment_center_ref.mat'
+ file_aic=$results_dir/'myc_motifs_10e-6_open_bin1bp_read_atac_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mat_open --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mat_open --prob $file_prob --parallel $n_core 1> $file_ref1 2> $file_aic
+ bin/probToRef --data $file_mat_1nucl --prob $file_prob --parallel $n_core 1> $file_ref2 2> /dev/null
+done
+
+# 1nucl chromatin
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
+ file_prob=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_prob.mat4d'
+ file_ref1=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_ref.mat'
+ file_ref2=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_open_read_atac_ref.mat'
+ file_aic=$results_dir/'myc_motifs_10e-6_1nucl_bin1bp_fragment_center_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mat_1nucl --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mat_1nucl --prob $file_prob --parallel $n_core 1> $file_ref1 2> $file_aic
+ bin/probToRef --data $file_mat_open --prob $file_prob --parallel $n_core 1> $file_ref2 2> /dev/null
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.R
new file mode 100644
index 0000000..e80b6db
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around sp1 motifs ##################
+
+# open chromatin
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_1class_ref.mat"))
+open.1.ref = data$references
+open.1.prob = data$prob
+open.1.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_1class_1nucl_fragment_center_ref.mat"))$ref
+open.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_2class_ref.mat"))
+open.2.ref = data$references
+open.2.prob = data$prob
+open.2.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_2class_1nucl_fragment_center_ref.mat"))$ref
+open.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_3class_ref.mat"))
+open.3.ref = data$references
+open.3.prob = data$prob
+open.3.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_3class_1nucl_fragment_center_ref.mat"))$ref
+open.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_4class_ref.mat"))
+open.4.ref = data$references
+open.4.prob = data$prob
+open.4.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_4class_1nucl_fragment_center_ref.mat"))$ref
+open.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_5class_ref.mat")) # K = 5 run: the same run as the 5class nucl/aic files read just below
+open.5.ref = data$references
+open.5.prob = data$prob
+open.5.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_5class_1nucl_fragment_center_ref.mat"))$ref
+open.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_6class_ref.mat"))
+open.6.ref = data$references
+open.6.prob = data$prob
+open.6.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_6class_1nucl_fragment_center_ref.mat"))$ref
+open.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_7class_ref.mat"))
+open.7.ref = data$references
+open.7.prob = data$prob
+open.7.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_7class_1nucl_fragment_center_ref.mat"))$ref
+open.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_8class_ref.mat"))
+open.8.ref = data$references
+open.8.prob = data$prob
+open.8.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_8class_1nucl_fragment_center_ref.mat"))$ref
+open.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_9class_ref.mat"))
+open.9.ref = data$references
+open.9.prob = data$prob
+open.9.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_9class_1nucl_fragment_center_ref.mat"))$ref
+open.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_10class_ref.mat"))
+open.10.ref = data$references
+open.10.prob = data$prob
+open.10.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_open_bin1bp_read_atac_10class_1nucl_fragment_center_ref.mat"))$ref
+open.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_open_bin1bp_read_atac_10class_aic.txt")))
+
+data = NULL
+
+
+# plot the 10-class run, one panel per class: open reference (col[1]) with the 1nucl reference (col[2]) overlaid, each divided by its own maximum
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12) -- on-screen device, left disabled; the figure is written with png() instead
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_classification_open_bin1bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F) # 5 x 2 panel grid, filled column by column
+ layout(m)
+ # sort classes by decreasing probability; 'class' keeps each row's original class index for the panel title
+ ord = order(open.10.prob, decreasing=T)
+ ref.open = open.10.ref[ord,]
+ ref.nucl = open.10.ref.nucl[ord,]
+ prob = open.10.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ for(i in 1:nrow(ref.open))
+ {
+ plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+ lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+ }
+dev.off()
+
+
+# collect every run (K = 10 down to 1): references, class probabilities and AIC values, all in the same run order
+ref = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref,
+ open.5.ref, open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+ open.5.prob, open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic,
+ open.5.aic, open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs (one list entry per value of K)
+n_run = length(ref)
+# total number of references over all runs (sum of the per-run row counts)
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# largest number of classes found in a single run
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# one color (first entry of the 9-color "Set1" palette) repeated n_class_max times
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# stack the references of all runs row by row; run_value / k_value record, for each row, its run index and its class index within that run
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references, rows/columns named 1..n (distance.ref is not defined in this script; presumably it comes from scripts/functions.R)
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# draw the summary figure (plot.references is not defined in this script either; presumably from scripts/functions.R)
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_classification_open_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around sp1 motifs ##################
+
+# nucleosomes
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_1class_ref.mat"))
+nucl.1.ref = data$references
+nucl.1.prob = data$prob
+nucl.1.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_1class_open_read_atac_ref.mat"))$ref
+nucl.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_2class_ref.mat"))
+nucl.2.ref = data$references
+nucl.2.prob = data$prob
+nucl.2.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_2class_open_read_atac_ref.mat"))$ref
+nucl.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_3class_ref.mat"))
+nucl.3.ref = data$references
+nucl.3.prob = data$prob
+nucl.3.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_3class_open_read_atac_ref.mat"))$ref
+nucl.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_4class_ref.mat"))
+nucl.4.ref = data$references
+nucl.4.prob = data$prob
+nucl.4.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_4class_open_read_atac_ref.mat"))$ref
+nucl.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_5class_ref.mat"))
+nucl.5.ref = data$references
+nucl.5.prob = data$prob
+nucl.5.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_5class_open_read_atac_ref.mat"))$ref
+nucl.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_6class_ref.mat"))
+nucl.6.ref = data$references
+nucl.6.prob = data$prob
+nucl.6.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_6class_open_read_atac_ref.mat"))$ref
+nucl.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_7class_ref.mat"))
+nucl.7.ref = data$references
+nucl.7.prob = data$prob
+nucl.7.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_7class_open_read_atac_ref.mat"))$ref
+nucl.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_8class_ref.mat"))
+nucl.8.ref = data$references
+nucl.8.prob = data$prob
+nucl.8.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_8class_open_read_atac_ref.mat"))$ref
+nucl.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_9class_ref.mat"))
+nucl.9.ref = data$references
+nucl.9.prob = data$prob
+nucl.9.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_9class_open_read_atac_ref.mat"))$ref
+nucl.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_10class_ref.mat"))
+nucl.10.ref = data$references
+nucl.10.prob = data$prob
+nucl.10.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_10class_open_read_atac_ref.mat"))$ref
+nucl.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_1nucl_bin1bp_fragment_center_10class_aic.txt")))
+
+data = NULL
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_1", "sp1_motifs_10e-7_classification_1nucl_bin1bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(nucl.10.prob, decreasing=T)
+ ref.nucl = nucl.10.ref[ord,]
+ ref.open = nucl.10.ref.open[ord,]
+ prob = nucl.10.prob[ord]
+ class = c(1:nrow(ref.nucl))[ord]
+ for(i in 1:nrow(ref.nucl))
+ {
+ plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+ lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+ }
+dev.off()
+
+
+# plot all classes
+ref = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref,
+ nucl.5.ref, nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+ nucl.5.prob, nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic,
+ nucl.5.aic, nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_1",
+ "sp1_motifs_10e-7_classification_1nucl_bin1bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh
new file mode 100755
index 0000000..49318bc
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_1/classification_sp1_motif.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+#
+# Runs bin/ChIPPartitioning with 1 to 10 classes on the open chromatin matrix
+# and on the 1nucl matrix of the sp1 motifs, then runs bin/probToRef on both
+# matrices with the probabilities of each run.
+# All the paths are relative : run this script from the directory containing
+# bin/ and results/.
+#
+# usage : classification_sp1_motif.sh [seed_length]
+#   seed_length  the number of characters of each random seed (default 15)
+
+# some paths
+## directories
+results_dir='results/10xgenomics_PBMC_5k_classification_1'
+data_dir='results/10xgenomics_PBMC_5k'
+## input
+# NOTE(review): the matrices are read from $results_dir and $data_dir is never
+# used, whereas classification_ctcf_motif.sh reads its matrices from
+# $data_dir. Confirm where the sp1 matrices are stored.
+file_mat_open="$results_dir/sp1_motifs_10e-7_open_bin1bp_read_atac.mat"
+file_mat_1nucl="$results_dir/sp1_motifs_10e-7_1nucl_bin1bp_fragment_center.mat"
+## file with seeds
+file_seed="$results_dir/sp1_motifs_10e-7_seed.txt"
+
+mkdir -p "$results_dir"
+touch "$file_seed"
+
+# parameters
+n_iter='20'
+n_shift='21'
+seeding='random'
+n_core=3
+## the seed length was implicitly read from the 1st argument, name it
+seed_length="${1:-15}"
+
+# Runs ChIPPartitioning on a matrix with 1 to 10 classes and, for each run,
+# runs probToRef on this matrix and on a second matrix with the probability
+# file of the run.
+#   $1  the matrix given to ChIPPartitioning
+#   $2  the second matrix given to probToRef
+#   $3  the prefix of the output file names, inside $results_dir
+#   $4  the tag of the second matrix in the name of its reference file
+classify()
+{   local file_mat="$1"
+    local file_mat_other="$2"
+    local prefix="$results_dir/$3"
+    local tag_other="$4"
+    local k seed file_prob file_ref1 file_ref2 file_aic
+
+    for k in 1 2 3 4 5 6 7 8 9 10
+    do
+        # one random seed per run, recorded in $file_seed with the output file
+        seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c"$seed_length";echo)
+        file_prob="${prefix}_${k}class_prob.mat4d"
+        file_ref1="${prefix}_${k}class_ref.mat"
+        file_ref2="${prefix}_${k}class_${tag_other}_ref.mat"
+        file_aic="${prefix}_${k}class_aic.txt"
+        echo "$file_prob $seed" >> "$file_seed"
+        bin/ChIPPartitioning --data "$file_mat" --class "$k" --shift "$n_shift" --flip --iter "$n_iter" --seeding "$seeding" --seed "$seed" --parallel "$n_core" > "$file_prob"
+        # stdout goes to the reference file, stderr to the aic file
+        bin/probToRef --data "$file_mat" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref1" 2> "$file_aic"
+        # stderr is discarded for the second matrix
+        bin/probToRef --data "$file_mat_other" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref2" 2> /dev/null
+    done
+}
+
+# open chromatin
+classify "$file_mat_open" "$file_mat_1nucl" 'sp1_motifs_10e-7_open_bin1bp_read_atac' '1nucl_fragment_center'
+
+# 1nucl chromatin
+classify "$file_mat_1nucl" "$file_mat_open" 'sp1_motifs_10e-7_1nucl_bin1bp_fragment_center' 'open_read_atac'
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R
new file mode 100644
index 0000000..f3ec785
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.R
@@ -0,0 +1,174 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+# directory from which the results are read and in which the figures are written
+dir.results = file.path("results", "10xgenomics_PBMC_5k_classification_2")
+
+# Loads the results of the runs with 1 to 10 classes for one type of data.
+# Every file name is derived from the number of classes of the run, such that
+# the references, the probabilities and the aic of a run cannot be read from
+# the files of another run (the 5 class open chromatin run was loaded from the
+# 4 class reference file).
+# prefix : the part of the file names preceding "_<k>class"
+# other  : the tag of the other type of data in the file names
+#          "<prefix>_<k>class_<other>_ref.mat"
+# Returns a list of which the k-th element contains the results of the run
+# with k classes, as a list with the fields ref, prob, ref.other and aic.
+load.runs = function(prefix, other)
+{ lapply(1:10, function(k)
+  { path = function(suffix)
+    { file.path(dir.results, sprintf("%s_%dclass_%s", prefix, k, suffix)) }
+    data = read.references(path("ref.mat"))
+    list(ref       = data$references,
+         prob      = data$prob,
+         ref.other = read.references(path(paste(other, "ref.mat", sep="_")))$ref,
+         aic       = as.matrix(read.table(path("aic.txt"))))
+  })
+}
+
+
+################## open chromatin patterns around ctcf motifs ##################
+
+# open chromatin
+open.runs = load.runs("ctcf_motifs_10e-6_open_bin2bp_read_atac",
+                      "1nucl_fragment_center")
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path(dir.results, "ctcf_motifs_10e-6_classification_open_bin2bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+  m = matrix(1:10, nrow=5, ncol=2, byrow=F)
+  layout(m)
+  # order from most to least probable class
+  ord = order(open.runs[[10]]$prob, decreasing=T)
+  ref.open = open.runs[[10]]$ref[ord,]
+  ref.nucl = open.runs[[10]]$ref.other[ord,]
+  prob = open.runs[[10]]$prob[ord]
+  class = c(1:nrow(ref.open))[ord]
+  for(i in 1:nrow(ref.open))
+  {
+    plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+         main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+    lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+  }
+dev.off()
+
+
+# plot all classes, from the run with 10 classes to the run with 1 class
+ref  = lapply(rev(open.runs), function(run) run$ref)
+prob = lapply(rev(open.runs), function(run) run$prob)
+aic  = do.call(c, lapply(rev(open.runs), function(run) run$aic))
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+  for(j in 1:nrow(ref[[i]]))
+  { references[k,] = ref[[i]][j,]
+    probabilities[k] = prob[[i]][j]
+    run_value[k] = i
+    k_value[k] = j
+    k = k + 1
+  }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path(dir.results,
+                          "ctcf_motifs_10e-6_classification_open_bin2bp_classes.png"),
+                references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+################## nucleosome patterns around ctcf motifs ##################
+
+# nucleosomes
+nucl.runs = load.runs("ctcf_motifs_10e-6_1nucl_bin2bp_fragment_center",
+                      "open_read_atac")
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path(dir.results, "ctcf_motifs_10e-6_classification_1nucl_bin2bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+  m = matrix(1:10, nrow=5, ncol=2, byrow=F)
+  layout(m)
+  # order from most to least probable class
+  ord = order(nucl.runs[[10]]$prob, decreasing=T)
+  ref.nucl = nucl.runs[[10]]$ref[ord,]
+  ref.open = nucl.runs[[10]]$ref.other[ord,]
+  prob = nucl.runs[[10]]$prob[ord]
+  class = c(1:nrow(ref.nucl))[ord]
+  for(i in 1:nrow(ref.nucl))
+  {
+    plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+         main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+    lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+  }
+dev.off()
+
+
+# plot all classes, from the run with 10 classes to the run with 1 class
+ref  = lapply(rev(nucl.runs), function(run) run$ref)
+prob = lapply(rev(nucl.runs), function(run) run$prob)
+aic  = do.call(c, lapply(rev(nucl.runs), function(run) run$aic))
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+  for(j in 1:nrow(ref[[i]]))
+  { references[k,] = ref[[i]][j,]
+    probabilities[k] = prob[[i]][j]
+    run_value[k] = i
+    k_value[k] = j
+    k = k + 1
+  }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path(dir.results,
+                          "ctcf_motifs_10e-6_classification_1nucl_bin2bp_classes.png"),
+                references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh
new file mode 100755
index 0000000..15f2219
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ctcf_motif.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+#
+# Runs bin/ChIPPartitioning with 1 to 10 classes on the open chromatin matrix
+# and on the 1nucl matrix of the ctcf motifs, then runs bin/probToRef on both
+# matrices with the probabilities of each run.
+# All the paths are relative : run this script from the directory containing
+# bin/ and results/.
+#
+# usage : classification_ctcf_motif.sh [seed_length]
+#   seed_length  the number of characters of each random seed (default 15)
+
+# some paths
+## directories
+data_dir='results/10xgenomics_PBMC_5k'
+results_dir='results/10xgenomics_PBMC_5k_classification_2'
+## input
+file_mat_open="$data_dir/ctcf_motifs_10e-6_open_bin2bp_read_atac.mat"
+file_mat_1nucl="$data_dir/ctcf_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"
+## file with seeds
+file_seed="$results_dir/ctcf_motifs_10e-6_seed.txt"
+
+mkdir -p "$results_dir"
+touch "$file_seed"
+
+# parameters
+n_iter='20'
+n_shift='7'
+seeding='random'
+n_core=3
+## the seed length was implicitly read from the 1st argument, name it
+seed_length="${1:-15}"
+
+# Runs ChIPPartitioning on a matrix with 1 to 10 classes and, for each run,
+# runs probToRef on this matrix and on a second matrix with the probability
+# file of the run.
+#   $1  the matrix given to ChIPPartitioning
+#   $2  the second matrix given to probToRef
+#   $3  the prefix of the output file names, inside $results_dir
+#   $4  the tag of the second matrix in the name of its reference file
+classify()
+{   local file_mat="$1"
+    local file_mat_other="$2"
+    local prefix="$results_dir/$3"
+    local tag_other="$4"
+    local k seed file_prob file_ref1 file_ref2 file_aic
+
+    for k in 1 2 3 4 5 6 7 8 9 10
+    do
+        # one random seed per run, recorded in $file_seed with the output file
+        seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c"$seed_length";echo)
+        file_prob="${prefix}_${k}class_prob.mat4d"
+        file_ref1="${prefix}_${k}class_ref.mat"
+        file_ref2="${prefix}_${k}class_${tag_other}_ref.mat"
+        file_aic="${prefix}_${k}class_aic.txt"
+        echo "$file_prob $seed" >> "$file_seed"
+        bin/ChIPPartitioning --data "$file_mat" --class "$k" --shift "$n_shift" --flip --iter "$n_iter" --seeding "$seeding" --seed "$seed" --parallel "$n_core" > "$file_prob"
+        # stdout goes to the reference file, stderr to the aic file
+        bin/probToRef --data "$file_mat" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref1" 2> "$file_aic"
+        # stderr is discarded for the second matrix
+        bin/probToRef --data "$file_mat_other" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref2" 2> /dev/null
+    done
+}
+
+# open chromatin
+classify "$file_mat_open" "$file_mat_1nucl" 'ctcf_motifs_10e-6_open_bin2bp_read_atac' '1nucl_fragment_center'
+
+# 1nucl chromatin
+classify "$file_mat_1nucl" "$file_mat_open" 'ctcf_motifs_10e-6_1nucl_bin2bp_fragment_center' 'open_read_atac'
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R
new file mode 100644
index 0000000..0825159
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around ebf1 motifs ##################
+
+# open chromatin
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_1class_ref.mat"))
+open.1.ref = data$references
+open.1.prob = data$prob
+open.1.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_1class_1nucl_fragment_center_ref.mat"))$ref
+open.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_2class_ref.mat"))
+open.2.ref = data$references
+open.2.prob = data$prob
+open.2.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_2class_1nucl_fragment_center_ref.mat"))$ref
+open.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_3class_ref.mat"))
+open.3.ref = data$references
+open.3.prob = data$prob
+open.3.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_3class_1nucl_fragment_center_ref.mat"))$ref
+open.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_4class_ref.mat"))
+open.4.ref = data$references
+open.4.prob = data$prob
+open.4.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_4class_1nucl_fragment_center_ref.mat"))$ref
+open.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_5class_ref.mat")) # K=5 run: must read the 5class file, like the nucl/aic lines below
+open.5.ref = data$references # class references of the K=5 partition
+open.5.prob = data$prob # class probabilities of the K=5 partition
+open.5.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_5class_1nucl_fragment_center_ref.mat"))$ref # presumably partial-matches the `references` element - confirm in functions.R
+open.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_6class_ref.mat"))
+open.6.ref = data$references
+open.6.prob = data$prob
+open.6.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_6class_1nucl_fragment_center_ref.mat"))$ref
+open.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_7class_ref.mat"))
+open.7.ref = data$references
+open.7.prob = data$prob
+open.7.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_7class_1nucl_fragment_center_ref.mat"))$ref
+open.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_8class_ref.mat"))
+open.8.ref = data$references
+open.8.prob = data$prob
+open.8.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_8class_1nucl_fragment_center_ref.mat"))$ref
+open.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_9class_ref.mat"))
+open.9.ref = data$references
+open.9.prob = data$prob
+open.9.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_9class_1nucl_fragment_center_ref.mat"))$ref
+open.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_10class_ref.mat"))
+open.10.ref = data$references
+open.10.prob = data$prob
+open.10.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_open_bin2bp_read_atac_10class_1nucl_fragment_center_ref.mat"))$ref
+open.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_open_bin2bp_read_atac_10class_aic.txt")))
+
+data = NULL
+
+
+# plot the 10-class partition: one panel per class, open-chromatin reference with the nucleosome reference overlaid
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_classification_open_bin2bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F) # 5x2 panel grid, filled column by column
+ layout(m)
+ # reorder the rows so that panels go from the most to the least probable class
+ ord = order(open.10.prob, decreasing=T)
+ ref.open = open.10.ref[ord,]
+ ref.nucl = open.10.ref.nucl[ord,]
+ prob = open.10.prob[ord]
+ class = c(1:nrow(ref.open))[ord] # original class index of each reordered row, shown in the panel title
+ for(i in 1:nrow(ref.open))
+ {
+ plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1), # each profile is divided by its own maximum so it fits ylim
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+ lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2]) # nucleosome reference of the same class, second palette colour
+ }
+dev.off()
+
+
+# plot all classes: gather the references of every run (K=10 down to K=1) into one matrix and plot them together
+ref = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref,
+ open.5.ref, open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+ open.5.prob, open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic,
+ open.5.aic, open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs (one list entry per value of K)
+n_run = length(ref)
+# number of different classes overall (sum of the row counts of all reference matrices)
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K (largest row count among the reference matrices)
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# a single colour (first of Set1) repeated once per class of the largest run
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows; k is the running output row
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i # index of the run (position in `ref`) this row comes from
+ k_value[k] = j # class index within that run; filled here but not passed to plot.references below
+ k = k + 1
+ }
+}
+
+# distance matrix between all references (distance.ref comes from scripts/functions.R), labelled by row number
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot everything into one png through plot.references (defined in scripts/functions.R)
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_classification_open_bin2bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around ebf1 motifs ##################
+
+# nucleosomes
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_1class_ref.mat"))
+nucl.1.ref = data$references
+nucl.1.prob = data$prob
+nucl.1.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_1class_open_read_atac_ref.mat"))$ref
+nucl.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_2class_ref.mat"))
+nucl.2.ref = data$references
+nucl.2.prob = data$prob
+nucl.2.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_2class_open_read_atac_ref.mat"))$ref
+nucl.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_3class_ref.mat"))
+nucl.3.ref = data$references
+nucl.3.prob = data$prob
+nucl.3.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_3class_open_read_atac_ref.mat"))$ref
+nucl.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_4class_ref.mat"))
+nucl.4.ref = data$references
+nucl.4.prob = data$prob
+nucl.4.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_4class_open_read_atac_ref.mat"))$ref
+nucl.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_5class_ref.mat"))
+nucl.5.ref = data$references
+nucl.5.prob = data$prob
+nucl.5.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_5class_open_read_atac_ref.mat"))$ref
+nucl.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_6class_ref.mat"))
+nucl.6.ref = data$references
+nucl.6.prob = data$prob
+nucl.6.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_6class_open_read_atac_ref.mat"))$ref
+nucl.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_7class_ref.mat"))
+nucl.7.ref = data$references
+nucl.7.prob = data$prob
+nucl.7.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_7class_open_read_atac_ref.mat"))$ref
+nucl.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_8class_ref.mat"))
+nucl.8.ref = data$references
+nucl.8.prob = data$prob
+nucl.8.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_8class_open_read_atac_ref.mat"))$ref
+nucl.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_9class_ref.mat"))
+nucl.9.ref = data$references
+nucl.9.prob = data$prob
+nucl.9.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_9class_open_read_atac_ref.mat"))$ref
+nucl.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_10class_ref.mat"))
+nucl.10.ref = data$references
+nucl.10.prob = data$prob
+nucl.10.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_10class_open_read_atac_ref.mat"))$ref
+nucl.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_10class_aic.txt")))
+
+data = NULL
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", "ebf1_motifs_10e-6_classification_1nucl_bin2bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(nucl.10.prob, decreasing=T)
+ ref.nucl = nucl.10.ref[ord,]
+ ref.open = nucl.10.ref.open[ord,]
+ prob = nucl.10.prob[ord]
+ class = c(1:nrow(ref.nucl))[ord]
+ for(i in 1:nrow(ref.nucl))
+ {
+ plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+ lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+ }
+dev.off()
+
+
+# plot all classes
+ref = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref,
+ nucl.5.ref, nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+ nucl.5.prob, nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic,
+ nucl.5.aic, nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_2",
+ "ebf1_motifs_10e-6_classification_1nucl_bin2bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh
new file mode 100755
index 0000000..0b2e0d9
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_ebf1_motif.sh
@@ -0,0 +1,47 @@
+# some paths
+## directories
+data_dir='results/10xgenomics_PBMC_5k'
+results_dir='results/10xgenomics_PBMC_5k_classification_2'
+## input
+file_mat_open="$data_dir/ebf1_motifs_10e-6_open_bin2bp_read_atac.mat"
+file_mat_1nucl="$data_dir/ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"
+## file with seeds
+file_seed=$results_dir'/ebf1_motifs_10e-6_seed.txt'
+
+mkdir -p $results_dir
+touch $file_seed
+
+# parameters
+n_iter='20'
+n_shift='7'
+seeding='random'
+n_core=3
+
+# open chromatin: for K=1..10, partition the open-chromatin matrix, then build references from the resulting probabilities
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc '_A-Za-z0-9' | head -c"${1:-15}"; echo) # no '-' in the alphabet so --seed never receives a value starting with '-'; length = script's 1st argument, default 15
+ file_prob="$results_dir/ebf1_motifs_10e-6_open_bin2bp_read_atac_${k}class_prob.mat4d"
+ file_ref1="$results_dir/ebf1_motifs_10e-6_open_bin2bp_read_atac_${k}class_ref.mat"
+ file_ref2="$results_dir/ebf1_motifs_10e-6_open_bin2bp_read_atac_${k}class_1nucl_fragment_center_ref.mat"
+ file_aic="$results_dir/ebf1_motifs_10e-6_open_bin2bp_read_atac_${k}class_aic.txt"
+ echo "$file_prob $seed" >> "$file_seed" # log which seed produced which probability file
+ bin/ChIPPartitioning --data "$file_mat_open" --class "$k" --shift "$n_shift" --flip --iter "$n_iter" --seeding "$seeding" --seed "$seed" --parallel "$n_core" > "$file_prob"
+ bin/probToRef --data "$file_mat_open" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref1" 2> "$file_aic" # stderr is captured as the AIC file
+ bin/probToRef --data "$file_mat_1nucl" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref2" 2> /dev/null # same probabilities applied to the nucleosome matrix
+done
+
+# 1nucl chromatin: for K=1..10, partition the nucleosome matrix, then build references from the resulting probabilities
+for k in 1 2 3 4 5 6 7 8 9 10
+do
+ seed=$(< /dev/urandom tr -dc '_A-Za-z0-9' | head -c"${1:-15}"; echo) # no '-' in the alphabet so --seed never receives a value starting with '-'; length = script's 1st argument, default 15
+ file_prob="$results_dir/ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_${k}class_prob.mat4d"
+ file_ref1="$results_dir/ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_${k}class_ref.mat"
+ file_ref2="$results_dir/ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_${k}class_open_read_atac_ref.mat"
+ file_aic="$results_dir/ebf1_motifs_10e-6_1nucl_bin2bp_fragment_center_${k}class_aic.txt"
+ echo "$file_prob $seed" >> "$file_seed" # log which seed produced which probability file
+ bin/ChIPPartitioning --data "$file_mat_1nucl" --class "$k" --shift "$n_shift" --flip --iter "$n_iter" --seeding "$seeding" --seed "$seed" --parallel "$n_core" > "$file_prob"
+ bin/probToRef --data "$file_mat_1nucl" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref1" 2> "$file_aic" # stderr is captured as the AIC file
+ bin/probToRef --data "$file_mat_open" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref2" 2> /dev/null # same probabilities applied to the open-chromatin matrix
+done
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R
new file mode 100644
index 0000000..0cf8084
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around myc motifs ##################
+
+# open chromatin
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_1class_ref.mat"))
+open.1.ref = data$references
+open.1.prob = data$prob
+open.1.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_1class_1nucl_fragment_center_ref.mat"))$ref
+open.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_2class_ref.mat"))
+open.2.ref = data$references
+open.2.prob = data$prob
+open.2.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_2class_1nucl_fragment_center_ref.mat"))$ref
+open.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_3class_ref.mat"))
+open.3.ref = data$references
+open.3.prob = data$prob
+open.3.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_3class_1nucl_fragment_center_ref.mat"))$ref
+open.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_4class_ref.mat"))
+open.4.ref = data$references
+open.4.prob = data$prob
+open.4.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_4class_1nucl_fragment_center_ref.mat"))$ref
+open.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_5class_ref.mat")) # K=5 run: must read the 5class file, like the nucl/aic lines below
+open.5.ref = data$references # class references of the K=5 partition
+open.5.prob = data$prob # class probabilities of the K=5 partition
+open.5.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_5class_1nucl_fragment_center_ref.mat"))$ref # presumably partial-matches the `references` element - confirm in functions.R
+open.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_6class_ref.mat"))
+open.6.ref = data$references
+open.6.prob = data$prob
+open.6.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_6class_1nucl_fragment_center_ref.mat"))$ref
+open.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_7class_ref.mat"))
+open.7.ref = data$references
+open.7.prob = data$prob
+open.7.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_7class_1nucl_fragment_center_ref.mat"))$ref
+open.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_8class_ref.mat"))
+open.8.ref = data$references
+open.8.prob = data$prob
+open.8.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_8class_1nucl_fragment_center_ref.mat"))$ref
+open.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_9class_ref.mat"))
+open.9.ref = data$references
+open.9.prob = data$prob
+open.9.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_9class_1nucl_fragment_center_ref.mat"))$ref
+open.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_10class_ref.mat"))
+open.10.ref = data$references
+open.10.prob = data$prob
+open.10.ref.nucl = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_open_bin2bp_read_atac_10class_1nucl_fragment_center_ref.mat"))$ref
+open.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_open_bin2bp_read_atac_10class_aic.txt")))
+
+data = NULL
+
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_classification_open_bin2bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(open.10.prob, decreasing=T)
+ ref.open = open.10.ref[ord,]
+ ref.nucl = open.10.ref.nucl[ord,]
+ prob = open.10.prob[ord]
+ class = c(1:nrow(ref.open))[ord]
+ for(i in 1:nrow(ref.open))
+ {
+ plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[1])
+ lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+ }
+dev.off()
+
+
+# plot all classes
+ref = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref,
+ open.5.ref, open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+ open.5.prob, open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic,
+ open.5.aic, open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_classification_open_bin2bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around myc motifs ##################
+
+# nucleosomes
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_1class_ref.mat"))
+nucl.1.ref = data$references
+nucl.1.prob = data$prob
+nucl.1.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_1class_open_read_atac_ref.mat"))$ref
+nucl.1.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_1class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_2class_ref.mat"))
+nucl.2.ref = data$references
+nucl.2.prob = data$prob
+nucl.2.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_2class_open_read_atac_ref.mat"))$ref
+nucl.2.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_2class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_3class_ref.mat"))
+nucl.3.ref = data$references
+nucl.3.prob = data$prob
+nucl.3.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_3class_open_read_atac_ref.mat"))$ref
+nucl.3.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_3class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_4class_ref.mat"))
+nucl.4.ref = data$references
+nucl.4.prob = data$prob
+nucl.4.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_4class_open_read_atac_ref.mat"))$ref
+nucl.4.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_4class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_5class_ref.mat"))
+nucl.5.ref = data$references
+nucl.5.prob = data$prob
+nucl.5.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_5class_open_read_atac_ref.mat"))$ref
+nucl.5.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_5class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_6class_ref.mat"))
+nucl.6.ref = data$references
+nucl.6.prob = data$prob
+nucl.6.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_6class_open_read_atac_ref.mat"))$ref
+nucl.6.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_6class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_7class_ref.mat"))
+nucl.7.ref = data$references
+nucl.7.prob = data$prob
+nucl.7.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_7class_open_read_atac_ref.mat"))$ref
+nucl.7.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_7class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_8class_ref.mat"))
+nucl.8.ref = data$references
+nucl.8.prob = data$prob
+nucl.8.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_8class_open_read_atac_ref.mat"))$ref
+nucl.8.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_8class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_9class_ref.mat"))
+nucl.9.ref = data$references
+nucl.9.prob = data$prob
+nucl.9.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_9class_open_read_atac_ref.mat"))$ref
+nucl.9.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_9class_aic.txt")))
+
+data = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_10class_ref.mat"))
+nucl.10.ref = data$references
+nucl.10.prob = data$prob
+nucl.10.ref.open = read.references(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_10class_open_read_atac_ref.mat"))$ref
+nucl.10.aic = as.matrix(read.table(file.path("results", "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_1nucl_bin2bp_fragment_center_10class_aic.txt")))
+
+data = NULL
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2", "myc_motifs_10e-6_classification_1nucl_bin2bp_10class.png"),
+ units="in", res=720, width=8, height=12)
+ m = matrix(1:10, nrow=5, ncol=2, byrow=F)
+ layout(m)
+ # order from most to least probable class
+ ord = order(nucl.10.prob, decreasing=T)
+ ref.nucl = nucl.10.ref[ord,]
+ ref.open = nucl.10.ref.open[ord,]
+ prob = nucl.10.prob[ord]
+ class = c(1:nrow(ref.nucl))[ord]
+ for(i in 1:nrow(ref.nucl))
+ {
+ plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+ main=sprintf("class %d (p=%.2f)", class[i], prob[i]), col=col[2])
+ lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+ }
+dev.off()
+
+
+# plot all classes
+ref = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref,
+ nucl.5.ref, nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+ nucl.5.prob, nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic,
+ nucl.5.aic, nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
+ run_value[k] = i
+ k_value[k] = j
+ k = k + 1
+ }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+ "10xgenomics_PBMC_5k_classification_2",
+ "myc_motifs_10e-6_classification_1nucl_bin2bp_classes.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh
new file mode 100755
index 0000000..dfc6bd6
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_myc_motif.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Runs bin/ChIPPartitioning on the MYC motif matrices for K = 1..10 classes,
+# once on the open chromatin matrix and once on the 1nucl matrix, then runs
+# bin/probToRef on both matrices with each resulting probability file.
+# usage: classification_myc_motif.sh [seed_length]   (seed_length defaults to 15)
+
+# some paths
+## directories
+data_dir='results/10xgenomics_PBMC_5k'
+results_dir='results/10xgenomics_PBMC_5k_classification_2'
+## input
+file_mat_open="$data_dir/myc_motifs_10e-6_open_bin2bp_read_atac.mat"
+file_mat_1nucl="$data_dir/myc_motifs_10e-6_1nucl_bin2bp_fragment_center.mat"
+## file with seeds
+file_seed="$results_dir/myc_motifs_10e-6_seed.txt"
+
+mkdir -p "$results_dir"
+touch "$file_seed"
+
+# parameters
+n_iter='20'
+n_shift='7'
+seeding='random'
+n_core=3
+## number of characters of each random seed; read here because "$1" refers to
+## the function arguments inside classify()
+seed_length="${1:-15}"
+case "$seed_length" in
+    ''|*[!0-9]*)
+        echo "Error! seed length should be a positive integer : $seed_length" >&2
+        exit 1
+        ;;
+esac
+
+# classify <main matrix> <other matrix> <output file prefix> <other matrix label>
+# For each K, partitions the main matrix, appends the probability file name and
+# the seed to $file_seed, then calls probToRef on the main and on the other
+# matrix with the probabilities that were just computed.
+classify()
+{   local mat_main="$1"
+    local mat_other="$2"
+    local prefix="$results_dir/$3"
+    local label_other="$4"
+    local k seed file_prob file_ref1 file_ref2 file_aic
+
+    for k in 1 2 3 4 5 6 7 8 9 10
+    do
+        # alphanumeric characters only : a seed starting with '-' could be taken
+        # for an option by the program receiving it
+        seed=$(LC_ALL=C tr -dc 'A-Za-z0-9' < /dev/urandom | head -c "$seed_length")
+        file_prob="${prefix}_${k}class_prob.mat4d"
+        file_ref1="${prefix}_${k}class_ref.mat"
+        file_ref2="${prefix}_${k}class_${label_other}_ref.mat"
+        file_aic="${prefix}_${k}class_aic.txt"
+        echo "$file_prob $seed" >> "$file_seed"
+        bin/ChIPPartitioning --data "$mat_main" --class "$k" --shift "$n_shift" --flip --iter "$n_iter" --seeding "$seeding" --seed "$seed" --parallel "$n_core" > "$file_prob"
+        # stdout is stored as the reference file, stderr as the aic file
+        bin/probToRef --data "$mat_main" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref1" 2> "$file_aic"
+        # for the other matrix only the references are kept
+        bin/probToRef --data "$mat_other" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref2" 2> /dev/null
+    done
+}
+
+# open chromatin
+classify "$file_mat_open" "$file_mat_1nucl" 'myc_motifs_10e-6_open_bin2bp_read_atac' '1nucl_fragment_center'
+
+# 1nucl chromatin
+classify "$file_mat_1nucl" "$file_mat_open" 'myc_motifs_10e-6_1nucl_bin2bp_fragment_center' 'open_read_atac'
+
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R
new file mode 100644
index 0000000..d096bc7
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.R
@@ -0,0 +1,323 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+source(file.path("scripts", "functions.R"))
+
+
+################## open chromatin patterns around sp1 motifs ##################
+
+# open chromatin
+# For each number of classes K = 1..10, load the results of the classification
+# run on the open chromatin data and store them in the variables open.K.ref,
+# open.K.prob, open.K.ref.nucl and open.K.aic.
+# A loop replaces ten copy-pasted stanzas; the K = 5 stanza used to read the
+# 4-class "_ref.mat" file, so open.5.ref and open.5.prob held the 4-class results.
+dir.class = file.path("results", "10xgenomics_PBMC_5k_classification_2")
+for(n.class in 1:10)
+{   prefix = sprintf("sp1_motifs_10e-7_open_bin2bp_read_atac_%dclass", n.class)
+
+    # references and class probabilities
+    data = read.references(file.path(dir.class, paste0(prefix, "_ref.mat")))
+    assign(sprintf("open.%d.ref",  n.class), data$references)
+    assign(sprintf("open.%d.prob", n.class), data$prob)
+
+    # references read from the "1nucl_fragment_center" file of the same run
+    assign(sprintf("open.%d.ref.nucl", n.class),
+           read.references(file.path(dir.class,
+                                     paste0(prefix, "_1nucl_fragment_center_ref.mat")))$ref)
+
+    # content of the aic file of the run, as a matrix
+    assign(sprintf("open.%d.aic", n.class),
+           as.matrix(read.table(file.path(dir.class, paste0(prefix, "_aic.txt")))))
+}
+# drop the loop helpers so that only the open.K.* variables remain
+rm(dir.class, n.class, prefix)
+
+data = NULL
+
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2",
+                       "sp1_motifs_10e-7_classification_open_bin2bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+    # 5 x 2 grid of panels, filled column by column
+    m = matrix(1:10, nrow=5, ncol=2, byrow=FALSE)
+    layout(m)
+    # order from most to least probable class
+    ord      = order(open.10.prob, decreasing=TRUE)
+    prob     = open.10.prob[ord]
+    ref.open = open.10.ref[ord,]
+    ref.nucl = open.10.ref.nucl[ord,]
+    # index that each class had before the reordering
+    class    = c(1:nrow(ref.open))[ord]
+    for(i in 1:nrow(ref.open))
+    {   main.title = sprintf("class %d (p=%.2f)", class[i], prob[i])
+        # open chromatin reference, scaled to a maximum of 1
+        plot(ref.open[i,] / max(ref.open[i,]), type='l', lwd=2, ylim=c(0,1),
+             main=main.title, col=col[1])
+        # nucleosome reference of the same class, scaled the same way
+        lines(ref.nucl[i,] / max(ref.nucl[i,]), lwd=2, col=col[2])
+    }
+dev.off()
+
+
+# plot all classes
+# Results of every run, listed from the 10-class run down to the 1-class run.
+ref  = list(open.10.ref, open.9.ref, open.8.ref, open.7.ref, open.6.ref,
+            open.5.ref,  open.4.ref, open.3.ref, open.2.ref, open.1.ref)
+prob = list(open.10.prob, open.9.prob, open.8.prob, open.7.prob, open.6.prob,
+            open.5.prob,  open.4.prob, open.3.prob, open.2.prob, open.1.prob)
+aic  = c(open.10.aic, open.9.aic, open.8.aic, open.7.aic, open.6.aic,
+         open.5.aic,  open.4.aic, open.3.aic, open.2.aic, open.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of classes (reference rows) found by each run
+n_class_run = unlist(lapply(ref, nrow))
+# number of different classes overall
+n_class_tot = sum(n_class_run)
+# max value of K
+n_class_max = max(n_class_run)
+
+# some colors : the first "Set1" color, repeated once per possible class
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows; row k comes
+# from class k_value[k] of run run_value[k] and has probability probabilities[k]
+references    = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+probabilities = vector(length=n_class_tot)
+run_value     = vector(length=n_class_tot)
+k_value       = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{   ref_i  = ref[[i]]
+    prob_i = prob[[i]]
+    for(j in 1:nrow(ref_i))
+    {   references[k,]   = ref_i[j,]
+        probabilities[k] = prob_i[j]
+        run_value[k]     = i
+        k_value[k]       = j
+        k = k + 1
+    }
+}
+
+# distance matrix between all references, rows and columns labelled 1..n
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+                          "10xgenomics_PBMC_5k_classification_2",
+                          "sp1_motifs_10e-7_classification_open_bin2bp_classes.png"),
+                references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
+
+
+
+
+
+
+################## nucleosome patterns around sp1 motifs ##################
+
+# nucleosomes
+# For each number of classes K = 1..10, load the results of the classification
+# run on the nucleosome data and store them in the variables nucl.K.ref,
+# nucl.K.prob, nucl.K.ref.open and nucl.K.aic.
+dir.class = file.path("results", "10xgenomics_PBMC_5k_classification_2")
+for(n.class in 1:10)
+{   prefix = sprintf("sp1_motifs_10e-7_1nucl_bin2bp_fragment_center_%dclass", n.class)
+
+    # references and class probabilities
+    data = read.references(file.path(dir.class, paste0(prefix, "_ref.mat")))
+    assign(sprintf("nucl.%d.ref",  n.class), data$references)
+    assign(sprintf("nucl.%d.prob", n.class), data$prob)
+
+    # references read from the "open_read_atac" file of the same run
+    assign(sprintf("nucl.%d.ref.open", n.class),
+           read.references(file.path(dir.class,
+                                     paste0(prefix, "_open_read_atac_ref.mat")))$ref)
+
+    # content of the aic file of the run, as a matrix
+    assign(sprintf("nucl.%d.aic", n.class),
+           as.matrix(read.table(file.path(dir.class, paste0(prefix, "_aic.txt")))))
+}
+# drop the loop helpers so that only the nucl.K.* variables remain
+rm(dir.class, n.class, prefix)
+
+data = NULL
+
+# plot 10 classes
+col = brewer.pal(3, "Set1")
+
+# X11(width=8, height=12)
+png(filename=file.path("results", "10xgenomics_PBMC_5k_classification_2",
+                       "sp1_motifs_10e-7_classification_1nucl_bin2bp_10class.png"),
+    units="in", res=720, width=8, height=12)
+    # 5 x 2 grid of panels, filled column by column
+    m = matrix(1:10, nrow=5, ncol=2, byrow=FALSE)
+    layout(m)
+    # order from most to least probable class
+    ord      = order(nucl.10.prob, decreasing=TRUE)
+    prob     = nucl.10.prob[ord]
+    ref.nucl = nucl.10.ref[ord,]
+    ref.open = nucl.10.ref.open[ord,]
+    # index that each class had before the reordering
+    class    = c(1:nrow(ref.nucl))[ord]
+    for(i in 1:nrow(ref.nucl))
+    {   main.title = sprintf("class %d (p=%.2f)", class[i], prob[i])
+        # nucleosome reference, scaled to a maximum of 1
+        plot(ref.nucl[i,] / max(ref.nucl[i,]), type='l', lwd=2, ylim=c(0,1),
+             main=main.title, col=col[2])
+        # open chromatin reference of the same class, scaled the same way
+        lines(ref.open[i,] / max(ref.open[i,]), lwd=2, col=col[1])
+    }
+dev.off()
+
+
+# plot all classes
+# Results of every run, listed from the 10-class run down to the 1-class run.
+ref  = list(nucl.10.ref, nucl.9.ref, nucl.8.ref, nucl.7.ref, nucl.6.ref,
+            nucl.5.ref,  nucl.4.ref, nucl.3.ref, nucl.2.ref, nucl.1.ref)
+prob = list(nucl.10.prob, nucl.9.prob, nucl.8.prob, nucl.7.prob, nucl.6.prob,
+            nucl.5.prob,  nucl.4.prob, nucl.3.prob, nucl.2.prob, nucl.1.prob)
+aic  = c(nucl.10.aic, nucl.9.aic, nucl.8.aic, nucl.7.aic, nucl.6.aic,
+         nucl.5.aic,  nucl.4.aic, nucl.3.aic, nucl.2.aic, nucl.1.aic)
+
+# number of runs
+n_run = length(ref)
+# number of classes (reference rows) found by each run
+n_class_run = unlist(lapply(ref, nrow))
+# number of different classes overall
+n_class_tot = sum(n_class_run)
+# max value of K
+n_class_max = max(n_class_run)
+
+# some colors : the first "Set1" color, repeated once per possible class
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows; row k comes
+# from class k_value[k] of run run_value[k] and has probability probabilities[k]
+references    = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+probabilities = vector(length=n_class_tot)
+run_value     = vector(length=n_class_tot)
+k_value       = vector(length=n_class_tot)
+k = 1
+for(i in 1:n_run)
+{   ref_i  = ref[[i]]
+    prob_i = prob[[i]]
+    for(j in 1:nrow(ref_i))
+    {   references[k,]   = ref_i[j,]
+        probabilities[k] = prob_i[j]
+        run_value[k]     = i
+        k_value[k]       = j
+        k = k + 1
+    }
+}
+
+# distance matrix between all references, rows and columns labelled 1..n
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+# plot
+plot.references(file.path("results",
+                          "10xgenomics_PBMC_5k_classification_2",
+                          "sp1_motifs_10e-7_classification_1nucl_bin2bp_classes.png"),
+                references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh
new file mode 100755
index 0000000..53674d5
--- /dev/null
+++ b/scripts/10xgenomics_PBMC_5k_classification_2/classification_sp1_motif.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Runs bin/ChIPPartitioning on the SP1 motif matrices for K = 1..10 classes,
+# once on the open chromatin matrix and once on the 1nucl matrix, then runs
+# bin/probToRef on both matrices with each resulting probability file.
+# usage: classification_sp1_motif.sh [seed_length]   (seed_length defaults to 15)
+
+# some paths
+## directories
+data_dir='results/10xgenomics_PBMC_5k'
+results_dir='results/10xgenomics_PBMC_5k_classification_2'
+## input
+file_mat_open="$data_dir/sp1_motifs_10e-7_open_bin2bp_read_atac.mat"
+file_mat_1nucl="$data_dir/sp1_motifs_10e-7_1nucl_bin2bp_fragment_center.mat"
+## file with seeds
+file_seed="$results_dir/sp1_motifs_10e-7_seed.txt"
+
+mkdir -p "$results_dir"
+touch "$file_seed"
+
+# parameters
+n_iter='20'
+n_shift='7'
+seeding='random'
+n_core=3
+## number of characters of each random seed; read here because "$1" refers to
+## the function arguments inside classify()
+seed_length="${1:-15}"
+case "$seed_length" in
+    ''|*[!0-9]*)
+        echo "Error! seed length should be a positive integer : $seed_length" >&2
+        exit 1
+        ;;
+esac
+
+# classify <main matrix> <other matrix> <output file prefix> <other matrix label>
+# For each K, partitions the main matrix, appends the probability file name and
+# the seed to $file_seed, then calls probToRef on the main and on the other
+# matrix with the probabilities that were just computed.
+classify()
+{   local mat_main="$1"
+    local mat_other="$2"
+    local prefix="$results_dir/$3"
+    local label_other="$4"
+    local k seed file_prob file_ref1 file_ref2 file_aic
+
+    for k in 1 2 3 4 5 6 7 8 9 10
+    do
+        # alphanumeric characters only : a seed starting with '-' could be taken
+        # for an option by the program receiving it
+        seed=$(LC_ALL=C tr -dc 'A-Za-z0-9' < /dev/urandom | head -c "$seed_length")
+        file_prob="${prefix}_${k}class_prob.mat4d"
+        file_ref1="${prefix}_${k}class_ref.mat"
+        file_ref2="${prefix}_${k}class_${label_other}_ref.mat"
+        file_aic="${prefix}_${k}class_aic.txt"
+        echo "$file_prob $seed" >> "$file_seed"
+        bin/ChIPPartitioning --data "$mat_main" --class "$k" --shift "$n_shift" --flip --iter "$n_iter" --seeding "$seeding" --seed "$seed" --parallel "$n_core" > "$file_prob"
+        # stdout is stored as the reference file, stderr as the aic file
+        bin/probToRef --data "$mat_main" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref1" 2> "$file_aic"
+        # for the other matrix only the references are kept
+        bin/probToRef --data "$mat_other" --prob "$file_prob" --parallel "$n_core" 1> "$file_ref2" 2> /dev/null
+    done
+}
+
+# open chromatin
+classify "$file_mat_open" "$file_mat_1nucl" 'sp1_motifs_10e-7_open_bin2bp_read_atac' '1nucl_fragment_center'
+
+# 1nucl chromatin
+classify "$file_mat_1nucl" "$file_mat_open" 'sp1_motifs_10e-7_1nucl_bin2bp_fragment_center' 'open_read_atac'
+
diff --git a/scripts/bam_tools/.idea/bam_tools.iml b/scripts/bam_tools/.idea/bam_tools.iml
new file mode 100644
index 0000000..3a4807d
--- /dev/null
+++ b/scripts/bam_tools/.idea/bam_tools.iml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/bam_tools/.idea/encodings.xml b/scripts/bam_tools/.idea/encodings.xml
new file mode 100644
index 0000000..15a15b2
--- /dev/null
+++ b/scripts/bam_tools/.idea/encodings.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/scripts/bam_tools/.idea/libraries/R_User_Library.xml b/scripts/bam_tools/.idea/libraries/R_User_Library.xml
new file mode 100644
index 0000000..71f5ff7
--- /dev/null
+++ b/scripts/bam_tools/.idea/libraries/R_User_Library.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/bam_tools/.idea/misc.xml b/scripts/bam_tools/.idea/misc.xml
new file mode 100644
index 0000000..65531ca
--- /dev/null
+++ b/scripts/bam_tools/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/scripts/bam_tools/.idea/modules.xml b/scripts/bam_tools/.idea/modules.xml
new file mode 100644
index 0000000..7bb896e
--- /dev/null
+++ b/scripts/bam_tools/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/bam_tools/.idea/workspace.xml b/scripts/bam_tools/.idea/workspace.xml
new file mode 100644
index 0000000..221e41b
--- /dev/null
+++ b/scripts/bam_tools/.idea/workspace.xml
@@ -0,0 +1,183 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ file_bam
+ tuple_lengths
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1549034112423
+
+
+ 1549034112423
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/scripts/bam_tools/filter_bam.py b/scripts/bam_tools/filter_bam.py
new file mode 100644
index 0000000..b6a1dba
--- /dev/null
+++ b/scripts/bam_tools/filter_bam.py
@@ -0,0 +1,83 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+def construct_value_set(file: str) -> tp.Set[str]:
+    """
+    Reads a file listing one value per line and gathers the values in a set.
+    Trailing whitespace (including the end of line) is removed from each
+    line before it is stored.
+    :param file: the address of the file to read.
+    :return: the set of distinct values found in the file.
+    """
+    with open(file, "rt") as f_values:
+        return {row.rstrip() for row in f_values}
+
+def filter_bam(file_in: str, file_out: str, tag: str, values: tp.Set[str]) -> None:
+    """
+    Filters the reads in the bam file. Only the reads that carry the given tag
+    with a value contained in the given set are written to the output file.
+    :param file_in: the address of the bam file to filter.
+    :param file_out: the address of the bam file to write. It is created using
+    the input file as template.
+    :param tag: the tag to look for in each read.
+    :param values: the accepted values for the tag.
+    :return: nothing.
+    """
+    # the context managers close both files, even if an error occurs half-way
+    with pysam.AlignmentFile(file_in) as bam_in, \
+            pysam.AlignmentFile(file_out, template=bam_in, mode="wb") as bam_out:
+        for read in bam_in:
+            # reads without the tag, or with a value not listed, are dropped
+            if read.has_tag(tag) and read.get_tag(tag) in values:
+                bam_out.write(read)
+
+if __name__ == "__main__":
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and filters out any read that is not associated with one of the given " \
+             "tag values.\n" \
+             "Written by Romain Groux, February 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="The addresse of the bam file to filter.")
+    parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+                      help="The addresse of the output file.")
+    parser.add_option("--values", dest="file_values", default=None, type="string", action="store",
+                      help="The addresse of the file listing the accepted tag values, one value per line.")
+    parser.add_option("--tag", dest="tag", default=None, type="string", action="store",
+                      help="The tag which values will be used for the filtering.")
+    (options, args) = parser.parse_args()
+
+    file_in = options.file_in
+    file_out = options.file_out
+    file_values = options.file_values
+    tag = options.tag
+
+    # check options, the first problem found is reported on stderr and the
+    # program stops with exit code 1
+    error = None
+    if file_in is None:
+        error = "Error! No input file given (-i)!"
+    elif not os.path.isfile(file_in):
+        error = "Error! %s does not exist!" % file_in
+    elif file_out is None:
+        error = "Error! no output file given (-o)!"
+    elif file_values is None:
+        error = "Error! No value file given (--values)!"
+    elif not os.path.isfile(file_values):
+        error = "Error! %s does not exist!" % file_values
+    elif tag is None:
+        error = "Error! no tag was given (--tag)!"
+    if error is not None:
+        print(error, file=sys.stderr)
+        sys.exit(1)
+
+    value_set = construct_value_set(file_values)
+    filter_bam(file_in, file_out, tag, value_set)
+
diff --git a/scripts/bam_tools/head_bam.py b/scripts/bam_tools/head_bam.py
new file mode 100644
index 0000000..7febfcb
--- /dev/null
+++ b/scripts/bam_tools/head_bam.py
@@ -0,0 +1,54 @@
+
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+
+def get_file_subset(file_in: str, file_out:str, n: int) -> None:
+    """
+    Writes the n first reads of a bam file to another bam file.
+    :param file_in: the address of the bam file to read.
+    :param file_out: the address of the bam file to write. It is created using
+    the input file as template.
+    :param n: the number of reads to write. If the input file contains less
+    reads, all of them are written. Nothing is written if n <= 0.
+    :return: nothing.
+    """
+    from itertools import islice
+
+    # the context managers close both files, even if an error occurs half-way
+    with pysam.AlignmentFile(file_in) as f_in, \
+            pysam.AlignmentFile(file_out, template=f_in, mode="wb") as f_out:
+        # islice() stops after n reads; it refuses negative counts, hence max()
+        for read in islice(f_in, max(n, 0)):
+            f_out.write(read)
+
+
+if __name__ == "__main__":
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and writes the first reads to another file.\n" \
+             "Written by Romain Groux, February 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="the addresse of the bam file to split.")
+    parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+                      help="the addresse of the output file.")
+    parser.add_option("-n", "--nlines", dest="nlines", default=10, type="int", action="store",
+                      help="the number of reads to write to the output file (default 10).")
+
+    (options, args) = parser.parse_args()
+    file_in = options.file_in
+    file_out = options.file_out
+    nlines = options.nlines
+
+    # check options, the first problem found is reported on stderr and the
+    # program stops with exit code 1
+    error = None
+    if file_in is None:
+        error = "Error! No input file given (-i)!"
+    elif not os.path.isfile(file_in):
+        error = "Error! %s does not exist!" % file_in
+    elif file_out is None:
+        error = "Error! No output file given (-o)!"
+    elif nlines <= 0:
+        error = "Error! number of lines <= 0 (-n)!"
+    if error is not None:
+        print(error, file=sys.stderr)
+        sys.exit(1)
+
+    get_file_subset(file_in, file_out, nlines)
diff --git a/scripts/bam_tools/split_bam.py b/scripts/bam_tools/split_bam.py
new file mode 100644
index 0000000..8719a90
--- /dev/null
+++ b/scripts/bam_tools/split_bam.py
@@ -0,0 +1,108 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+
+def construct_dict_files(f_values: str, f_prefix: str) -> tp.Dict[str, str]:
+    """
+    Builds the dictionary used to dispatch the reads: each tag value listed in the
+    given file (one value per line) is associated with the address of a sam file
+    named <prefix><value>.sam. A value listed several times gives a single entry.
+    :param f_values: the address of the file containing the tag values
+    :param f_prefix: a common prefix for all the file addresses in the dictionary.
+    :return: a dictionary with the tag values (keys) and their associated file
+    addresses (values).
+    """
+    files = dict()
+    with open(f_values, "rt") as f:
+        for row in f:
+            tag_value = row.rstrip()
+            # keeps the entry already present if the value has been seen before
+            files.setdefault(tag_value, "%s%s.sam" % (f_prefix, tag_value))
+    return files
+
+
+def split_bam(f_bam: str, tag: str, d_files: tp.Dict[str, str]):
+    """
+    Splits the bam file according to the given tag values.
+    The bam file is read and each read is checked for the given tag value. If the value is listed in the given file
+    dictionary, then the read is written, in sam format, to the corresponding file.
+    :param f_bam: the address of the bam file to split.
+    :param tag: the tag which should be used for splitting.
+    :param d_files: a dictionary containing the accepted values for sorting (key) and the addresses of the
+    corresponding files in which the reads should be dispatched.
+    """
+
+    # Create (or empty) all the output files. Don't write the sam headers now : a file that is given no read
+    # stays empty instead of only containing a header.
+    for f_reads in d_files.values():
+        with open(f_reads, "wt"):
+            pass
+    # the values whose file has already been given its header
+    headers_written = set()
+
+    # read bam file and dispatch the reads, the context manager closes the bam file even if an error occurs
+    with pysam.AlignmentFile(f_bam) as bam:
+        for read in bam:
+            if not read.has_tag(tag):
+                continue
+            value = read.get_tag(tag)
+            # only treat value present in the list
+            f_reads = d_files.get(value, None)
+            if f_reads is None:
+                continue
+            # cannot keep all files open, raises an OS Error if too many are open at the same time
+            with open(f_reads, "at") as f:
+                # write header if file has not been written before
+                if value not in headers_written:
+                    f.write(str(read.header))
+                    headers_written.add(value)
+                f.write("%s\n" % read.to_string())
+
+
+if __name__ == "__main__":
+    # parse options
+    usage = "usage: %s [options]" % os.path.basename(__file__)
+    epilog = "This program reads a bam file and dispatches the reads into separated sam files according to the " \
+             "values associated with a specified tag. The accepted values should be listed into a text file. The " \
+             "output files will be located in the current directory.\n" \
+             "Written by Romain Groux, January 2019"
+    parser = optparse.OptionParser(usage=usage, epilog=epilog)
+    parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+                      help="the addresse of the bam file to split.")
+    parser.add_option("-p", "--prefix", dest="prefix", default="", type="string", action="store",
+                      help="a name prefix for the files in which the reads will be dispatched.")
+    parser.add_option("--values", dest="file_values", default=None, type="string", action="store",
+                      help="the address of the file containing the associated tag values relevant for the splitting.")
+    parser.add_option("--tag", dest="tag", default=None, type="string", action="store",
+                      help="The tag which values will be used for the splitting.")
+    (options, args) = parser.parse_args()
+    file_in = options.file_in
+    file_values = options.file_values
+    prefix = options.prefix
+    tag = options.tag
+
+    # check options, the first problem found is reported on stderr and the
+    # program stops with exit code 1
+    error = None
+    if file_in is None:
+        error = "Error! No input file given (-i)!"
+    elif not os.path.isfile(file_in):
+        error = "Error! %s does not exist!" % file_in
+    elif file_values is None:
+        error = "Error! No value file given (--values)!"
+    elif not os.path.isfile(file_values):
+        error = "Error! %s does not exist!" % file_values
+    elif tag is None:
+        error = "Error! no tag was given (--tag)!"
+    if error is not None:
+        print(error, file=sys.stderr)
+        sys.exit(1)
+
+    # a non-empty prefix is separated from the tag value by an underscore
+    if prefix != "":
+        prefix = "%s_" % prefix
+
+    # split bam file
+    dict_files = construct_dict_files(file_values, prefix)
+    split_bam(file_in, tag, dict_files)
diff --git a/scripts/bam_tools/split_by_length.py b/scripts/bam_tools/split_by_length.py
new file mode 100644
index 0000000..216b3ab
--- /dev/null
+++ b/scripts/bam_tools/split_by_length.py
@@ -0,0 +1,80 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+
+def parse_lengths(lengths_str: str) -> tp.Tuple[int, int]:
+
+ tuple_lengths = ()
+
+ try:
+ if '-' not in lengths_str:
+ raise RuntimeError("invalid fragment lengths : %s" % lengths_str)
+ else:
+ duo = lengths_str.split('-')
+ # not two values
+ if len(duo) != 2:
+ raise RuntimeError("invalid list of fragment lengths : %s" % lengths_str)
+ duo = (int(duo[0]), int(duo[1]))
+ # to <= from
+ if duo[1] <= duo[0]:
+ raise RuntimeError("invalid list of fragment lengths : %s" % lengths_str)
+
+ except Exception as e:
+ print(e, sys.stderr)
+ raise RuntimeError("invalid list of fragment lengths : %s" % lengths_str)
+
+ return (duo[0], duo[1])
+
+
+def split_bam(file_bam, file_out, lengths):
+
+ bam_in = pysam.AlignmentFile(file_in)
+ bam_out = pysam.AlignmentFile(file_out, template=bam_in, mode="wb")
+
+ for read in bam_in:
+ # don't know how to get fragment length from bam so convert the fragment to
+ # sam and parse the sam representation
+ # frag. with 1st read on reverse have negative length
+ read_l = abs(int(read.to_string().split('\t')[8]))
+ if read_l >= lengths[0] and read_l <= lengths[1]:
+ bam_out.write(read)
+
+ bam_in.close()
+ bam_out.close()
+
+
+if __name__ == "__main__":
+
+ # parse options
+ usage = "usage: %s [options]" % os.path.basename(__file__)
+ epilog = "This program reads a bam file and filters out any read that is not associated with one of the given " \
+ "tag values." \
+ "Written by Romain Groux, February 2019"
+ parser = optparse.OptionParser(usage=usage, epilog=epilog)
+ parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+ help="the addresse of the bam file to filter.")
+ parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+ help="The addresse of the output file.")
+ parser.add_option("--length", dest="lengths", default=None, type="string", action="store",
+ help="A pair of non-overlapping [from,to] values that will be used to "
+ "filter (including the boundaries) the fragments, for instance --length 1-200.")
+ (options, args) = parser.parse_args()
+
+ file_in = options.file_in
+ file_out = options.file_out
+ from_to = parse_lengths(options.lengths)
+
+ # check options
+ if file_in is None:
+ print("Error! No input file given (-i)!", sys.stderr)
+ exit(1)
+ elif not os.path.isfile(file_in):
+ print("Error! %s does not exist!" % file_in)
+ exit(1)
+ elif file_out is None:
+ print("Error! no output file given (-o)!")
+
+ split_bam(file_in, file_out, from_to)
diff --git a/scripts/bam_tools/split_in_two.py b/scripts/bam_tools/split_in_two.py
new file mode 100644
index 0000000..8ae2c37
--- /dev/null
+++ b/scripts/bam_tools/split_in_two.py
@@ -0,0 +1,275 @@
+import optparse
+import sys
+import os
+import typing as tp
+import pysam
+
+def split_bam(file_in, file_out):
+
+ bam_in = pysam.AlignmentFile(file_in)
+ bam_out = pysam.AlignmentFile(file_out, template=bam_in, mode="wb")
+
+ for read in bam_in:
+ # check whether there is a pair
+ read_start = read.reference_start
+ read_flags = list(str(bin(read.flag))[2:][::-1])
+ read_is_paired = read_flags[0] == '1'
+ read_is_rev = read_flags[4] == '1'
+ read_is_first_in_pair = read_flags[6] == '1'
+ read_name = read.query_name
+
+ mate_start = read.next_reference_start
+ mate_is_rev = read_flags[5] == '1'
+
+ # check that read and fragment are OK
+ # qc
+ if read.is_qcfail:
+ continue
+ # check pair
+ elif not read_is_paired:
+ continue
+ # --> -->
+ elif(not read_is_rev and not mate_is_rev):
+ continue
+ # <-- <--
+ elif (read_is_rev and read_is_rev):
+ continue
+ # <-- -->
+ elif (read_is_rev and not mate_is_rev) and (read_start < mate_read_start):
+ continue
+ # <-- -->
+ elif (not read_is_rev and mate_is_rev) and (read_start > mate_start):
+ continue
+
+ # Split the fragment in two equally long fragments.
+ # Each read has a length of 1.
+ # Reads that do not create a proper fragment
+ # with their pair read are filtered (can only
+ # create 2 fragments from a fragment!).
+ if read_is_first_in_pair:
+
+ # strand related parameters
+ # r1 is fw
+ # r1 r3
+ # ---> --->
+ # |---------|----------|
+ # <--- <---
+ # r4 r2
+ if not read_is_rev:
+
+ frag_ref_id = read.reference_id
+ frag_start = read.reference_start
+ frag_len = read.template_length
+ frag_end = frag_start + frag_len - 1
+ frag_mid = frag_start + (frag_len // 2)
+ frag_cell = read.get_tag("CB")
+
+ # read 1
+ r1_ref_id = frag_ref_id
+ r1_next_ref_id = r1_ref_id
+ r1_start = frag_start # read start inclusive
+ r1_flags = ['0' for _ in range(12)]
+ r1_flags[0] = '1' # paired
+ r1_flags[1] = '1' # proper pair
+ r1_flags[5] = '1' # mate is rev
+ r1_flags[6] = '1' # 1st in pair
+ r1_name = "%s_r1" % read_name
+ r1_tags = (("CB", frag_cell),)
+ r1_query_seq = 'N'
+ # read 2
+ r2_ref_id = frag_ref_id
+ r2_next_ref_id = r2_ref_id
+ r2_start = frag_end -1 # read start inclusive
+ r2_flags = ['0' for _ in range(12)]
+ r2_flags[0] = '1' # paired
+ r2_flags[1] = '1' # proper pair
+ r2_flags[4] = '1' # read is rev
+ r2_flags[7] = '1' # mate 1st in pair
+ r2_name = "%s_r2" % read_name
+ r2_tags = (("CB", frag_cell),)
+ r2_query_seq = 'N'
+ # read 3
+ r3_ref_id = frag_ref_id
+ r3_next_ref_id = r3_ref_id
+ r3_start = frag_mid # read start inclusive
+ r3_flags = ['0' for _ in range(12)]
+ r3_flags[0] = '1' # paired
+ r3_flags[1] = '1' # proper pair
+ r3_flags[5] = '1' # mate is rev
+ r3_flags[6] = '1' # 1st in pair
+ r3_name = "%s_r3" % read_name
+ r3_tags = (("CB", frag_cell),)
+ r3_query_seq = 'N'
+ # read 4
+ r4_ref_id = frag_ref_id
+ r4_next_ref_id = r4_ref_id
+ r4_start = frag_mid - 1 # read start inclusive
+ r4_flags = ['0' for _ in range(12)]
+ r4_flags[0] = '1' # paired
+ r4_flags[1] = '1' # proper pair
+ r4_flags[4] = '1' # read is rev
+ r4_flags[7] = '1' # mate 1st in pair
+ r4_name = "%s_r4" % read_name
+ r4_tags = (("CB", frag_cell),)
+ r4_query_seq = 'N'
+ # fragment lengths
+ frag14_len = r4_start - r1_start + 1
+ frag23_len = r2_start - r3_start + 1
+ r1_tlen = frag14_len
+ r2_tlen = -frag23_len
+ r3_tlen = frag23_len
+ r4_tlen = -frag14_len
+
+ # r1 is rv
+ # r2 r4
+ # ---> --->
+ # |---------|----------|
+ # <--- <---
+ # r3 r1
+ else:
+ frag_ref_id = read.reference_id
+ frag_start = read.next_reference_start
+ frag_len = abs(read.template_length)
+ frag_end = frag_start + frag_len - 1
+ frag_mid = frag_start + (frag_len // 2)
+ frag_cell = read.get_tag("CB")
+
+ # read 1
+ r1_ref_id = frag_ref_id
+ r1_next_ref_id = r1_ref_id
+ r1_start = frag_end - 1 # read start inclusive
+ r1_flags = ['0' for _ in range(12)]
+ r1_flags[0] = '1' # paired
+ r1_flags[1] = '1' # proper pair
+ r1_flags[4] = '1' # read is rev
+ r1_flags[6] = '1' # 1st in pair
+ r1_name = "%s_r1" % read_name
+ r1_tags = (("CB", frag_cell),)
+ r1_query_seq = 'N'
+ # read 2
+ r2_ref_id = frag_ref_id
+ r2_next_ref_id = r2_ref_id
+ r2_start = frag_start # read start inclusive
+ r2_flags = ['0' for _ in range(12)]
+ r2_flags[0] = '1' # paired
+ r2_flags[1] = '1' # proper pair
+ r2_flags[5] = '1' # mate is rev
+ r2_flags[7] = '1' # mate 1st in pair
+ r2_name = "%s_r2" % read_name
+ r2_tags = (("CB", frag_cell),)
+ r2_query_seq = 'N'
+ # read 3
+ r3_ref_id = frag_ref_id
+ r3_next_ref_id = r3_ref_id
+ r3_start = frag_mid - 1 # read start inclusive
+ r3_flags = ['0' for _ in range(12)]
+ r3_flags[0] = '1' # paired
+ r3_flags[1] = '1' # proper pair
+ r3_flags[4] = '1' # read is rev
+ r3_flags[6] = '1' # 1st in pair
+ r3_name = "%s_r3" % read_name
+ r3_tags = (("CB", frag_cell),)
+ r3_query_seq = 'N'
+ # read 4
+ r4_ref_id = frag_ref_id
+ r4_next_ref_id = r4_ref_id
+ r4_start = frag_mid # read start inclusive
+ r4_flags = ['0' for _ in range(12)]
+ r4_flags[0] = '1' # paired
+ r4_flags[1] = '1' # proper pair
+ r4_flags[5] = '1' # mate is rev
+ r4_flags[7] = '1' # mate 1st in pair
+ r4_name = "%s_r4" % read_name
+ r4_tags = (("CB", frag_cell),)
+ r4_query_seq = 'N'
+ # fragment lengths
+ frag14_len = r4_start - r1_start + 1
+ frag23_len = r2_start - r3_start + 1
+ r1_tlen = -frag14_len
+ r2_tlen = frag23_len
+ r3_tlen = -frag23_len
+ r4_tlen = frag14_len
+
+ # create the reads
+ read1 = pysam.AlignedSegment()
+ read1.query_name = r1_name
+ read1.flag = int(''.join(r1_flags)[::-1], base=2)
+ read1.reference_id = frag_ref_id
+ read1.reference_start = r1_start
+ read1.next_reference_id = frag_ref_id
+ read1.next_reference_start = r4_start
+ read1.tags = r1_tags
+ read1.template_length = r1_tlen
+ read1.query_sequence = r1_query_seq
+
+ read2 = pysam.AlignedSegment()
+ read2.query_name = r2_name
+ read2.flag = int(''.join(r2_flags)[::-1], base=2)
+ read2.reference_id = frag_ref_id
+ read2.reference_start = r2_start
+ read2.next_reference_id = frag_ref_id
+ read2.next_reference_start = r3_start
+ read2.tags = r2_tags
+ read2.template_length = r2_tlen
+ read2.query_sequence = r2_query_seq
+
+ read3 = pysam.AlignedSegment()
+ read3.query_name = r3_name
+ read3.flag = int(''.join(r3_flags)[::-1], base=2)
+ read3.reference_id = frag_ref_id
+ read3.reference_start = r3_start
+ read3.next_reference_id = frag_ref_id
+ read3.next_reference_start = r2_start
+ read3.tags = r3_tags
+ read3.template_length = r3_tlen
+ read3.query_sequence = r3_query_seq
+
+ read4 = pysam.AlignedSegment()
+ read4.query_name = r4_name
+ read4.flag = int(''.join(r4_flags)[::-1], base=2)
+ read4.reference_id = frag_ref_id
+ read4.reference_start = r4_start
+ read4.next_reference_id = frag_ref_id
+ read4.next_reference_start = r1_start
+ read4.tags = r4_tags
+ read4.template_length = r4_tlen
+ read4.query_sequence = r4_query_seq
+
+ # write
+ bam_out.write(read1)
+ bam_out.write(read2)
+ bam_out.write(read3)
+ bam_out.write(read4)
+
+ bam_in.close()
+ bam_out.close()
+
+
+if __name__ == "__main__":
+
+ # parse options
+ usage = "usage: %s [options]" % os.path.basename(__file__)
+ epilog = "This program reads a bam file and split the fragments within in two fragments " \
+ "of equal size." \
+ "Written by Romain Groux, June 2019"
+ parser = optparse.OptionParser(usage=usage, epilog=epilog)
+ parser.add_option("-i", "--input", dest="file_in", default=None, type="string", action="store",
+ help="the addresse of the bam file to filter.")
+ parser.add_option("-o", "--output", dest="file_out", default=None, type="string", action="store",
+ help="The addresse of the output file.")
+ (options, args) = parser.parse_args()
+
+ file_in = options.file_in
+ file_out = options.file_out
+
+ # check options
+ if file_in is None:
+ print("Error! No input file given (-i)!", sys.stderr)
+ exit(1)
+ elif not os.path.isfile(file_in):
+ print("Error! %s does not exist!" % file_in)
+ exit(1)
+ elif file_out is None:
+ print("Error! no output file given (-o)!")
+
+ split_bam(file_in, file_out)
diff --git a/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R b/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R
new file mode 100755
index 0000000..7377bed
--- /dev/null
+++ b/scripts/bulk_sequencing/analysis_cluster_ctcf_dnase_k562.R
@@ -0,0 +1,62 @@
+setwd(file.path("/", "local", "groux", "scATAC-seq"))
+
+# libraries
+library(RColorBrewer)
+
+# functions
+# NOTE(review): the helpers seem to live in scripts/bulk_sequencing/functions.R,
+# confirm that scripts/functions.R is the file meant here
+source(file.path("scripts", "functions.R"))
+
+# data
+# the partitions were run with K = 1, 2, ..., 15 classes ; the results are
+# stored by decreasing K, the run with the most classes being the 1st run
+dir.results = file.path("results", "bulk_sequencing")
+n_classes   = 15:1
+ref         = vector(mode="list", length=length(n_classes))
+prob        = vector(mode="list", length=length(n_classes))
+aic         = vector(mode="list", length=length(n_classes))
+for(i in seq_along(n_classes))
+{ prefix    = sprintf("ctcf_dnase_k562_%dclass", n_classes[i])
+  run.data  = read.references(file.path(dir.results, paste0(prefix, "_ref.mat")))
+  ref[[i]]  = run.data$references
+  prob[[i]] = run.data$prob
+  aic[[i]]  = as.matrix(read.table(file.path(dir.results, paste0(prefix, "_aic.txt"))))
+}
+# one vector with the AIC values of all the runs
+aic = do.call(c, aic)
+
+# number of runs
+n_run = length(ref)
+# number of different classes overall
+n_class_tot = sum(unlist(lapply(ref, nrow)))
+# max value of K
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[1], n_class_max)
+
+# construct a matrix with all discovered references on the rows
+references    = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value     = vector(length=n_class_tot)
+k_value       = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
+k = 1
+for(i in seq_len(n_run))
+{
+  for(j in seq_len(nrow(ref[[i]])))
+  { references[k,]   = ref[[i]][j,]
+    probabilities[k] = prob[[i]][j]
+    run_value[k]     = i
+    k_value[k]       = j
+    k = k + 1
+  }
+}
+
+# distance matrix between all references
+distances = distance.ref(references)
+rownames(distances) = 1:nrow(distances)
+colnames(distances) = 1:ncol(distances)
+
+plot.references(file.path("results","bulk_sequencing", "ctcf_dnase.png"),
+                references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R b/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R
index bb60a9b..20bc1dd 100755
--- a/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R
+++ b/scripts/bulk_sequencing/analysis_cluster_ctcf_mnase_k562.R
@@ -1,197 +1,138 @@
-
-
-# functions
-
-#' Compute the euclidean distance between two references.
-#' It also check if a reference is in reverse orientation
-#' and returns the smallest distance value.
-#' \param ref1 a vector containing the first reference.
-#' \param ref2 a vector containing the second reference.
-#' \return the euclidean distance.
-eucl.dist.ref = function(ref1, ref2)
-{
- return(min(sqrt(sum(((ref1 - ref2 ) ^ 2))),
- sqrt(sum(((ref1 - rev(ref2)) ^ 2)))))
-}
-
-
-#' Compute the correlation distance between two references.
-#' It also check if a reference is in reverse orientation
-#' and returns the smallest distance value.
-#' \param ref1 a vector containing the first reference.
-#' \param ref2 a vector containing the second reference.
-#' \return the euclidean distance.
-cor.dist.ref= function(ref1, ref2)
-{
- return(1 - min(cor(ref1, ref2 ),
- cor(ref1, rev(ref2))))
-}
-
-
-#' Computes the distance matrix, using the euclidean distance, for all
-#' the references aggregations given. As some references may be in reverse
-#' orientation compared to others, the distance in both orientation is
-#' computed, for each pair, and the best is returned.
-distance.ref = function(references)
-{ n = nrow(references)
- d = matrix(nrow=n, ncol=n, data=0)
-
- for(i in 1:n)
- { for(j in 1:i)
- { x = eucl.dist.ref(references[i,], references[j,])
- d[i,j] = x
- d[j,i] = x
- }
- }
- return(d)
-}
-
-
-get_matches = function(distances, run_value)
-{
- matches = matrix(nrow=0, ncol=4)
-
- # references of run i on the row -> y coord
- # references of run j on the col -> x coord
-
- # run labels
- run_i = 1
- # run_j = 2
-
- for(run_j in setdiff(unique(run_value), run_i))
- {
- # number of references in each run
- n_i = length(which(run_value == run_i))
- n_j = length(which(run_value == run_j))
-
- index_i = which(run_value == run_i) # rows of run i
- index_j = which(run_value == run_j) # columns of run j
-
- i_taken = c() # classes of i already plotted -> rows to ignore
- j_taken = c() # classes of j already plotted -> columns to ignore
-
- # while not all classes in j have been plotted
- row_n = 1
- while(length(j_taken) < n_j)
- { if(length(i_taken) == 0 &&
- length(j_taken) == 0)
- { distances_tmp = distances[index_i, index_j]
- coord = which(distances_tmp == min(distances_tmp), arr.ind=T)
- coord_i = as.numeric(rownames(distances_tmp)[coord[1]])
- coord_j = as.numeric(colnames(distances_tmp)[coord[2]])
- coord = c(coord_i, coord_j)
- } else {
- rows = setdiff(index_i, i_taken)
- cols = setdiff(index_j, j_taken)
- distances_tmp = distances[rows, cols, drop=F]
- coord = which(distances_tmp == min(distances_tmp), arr.ind=T)
- coord_i = as.numeric(rownames(distances_tmp)[coord[1]])
- coord_j = as.numeric(colnames(distances_tmp)[coord[2]])
- coord = c(coord_i, coord_j)
- }
- coord = c(coord, row_n, run_j)
- i_taken = c(i_taken, coord[1])
- j_taken = c(j_taken, coord[2])
- matches = rbind(matches, coord)
- row_n = row_n + 1
- }
- }
- return(matches)
-}
-
-
-plot.references = function(references, distances, n_run, run_value, n_class_max)
-{
- colors = brewer.pal(6, "Set1")
-
- # compute the best matches between all references to 1st run references
- matches = get_matches(distances, run_value)
-
- # make a matrix for layout with good plot numbers
- plots.lab = matrix(nrow=n_class_max, ncol=n_run)
- plots.lab[,1] = 1:n_class_max # for run with max number of classes
- z = n_class_max + 1
- for(i in 1:nrow(matches))
- { coord = matches[i,]
- # plots.lab[coord[3], coord[4]] = z
- plots.lab[coord[1], coord[4]] = z
- z = z + 1
- }
- # these will be the empty plots
- for(i in 1:nrow(plots.lab))
- { for(j in 1:ncol(plots.lab))
- { if(is.na(plots.lab[i,j]))
- { plots.lab[i,j] = z
- z = z + 1
- }
- }
- }
-
- # plot
- X11(height=12, width=10)
- # a grid
- m = layout(mat = plots.lab)
- # layout.show(m)
- x = 1:ncol(references)
-
- # plot run 1 references
- for(i in 1:n_class_max)
- { plot(x=x, y=references[i,], lwd=3, type='l', col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads") }
-
- # plot others
- for(i in 1:nrow(matches))
- { ref_index = matches[i,2]
- col_index = matches[i,3]
- plot(x=x, y=references[ref_index,], lwd=3, type='l', col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads")
- }
-}
-
-
-
-
-
-library(RColorBrewer)
-
setwd(file.path("/", "local", "groux", "scATAC-seq"))
-data.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_2class.mat")))
-data.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_3class.mat")))
-data.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_4class.mat")))
-data.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_5class.mat")))
-data.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_dnase_k562_6class.mat")))
-data = list(data.6, data.5, data.4, data.3, data.2)
+# libraries
+library(RColorBrewer)
-# some colors
-colors = brewer.pal(6, "Set1")
+# functions
+source(file.path("scripts", "functions.R"))
+
+# data
+data.1 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_ref.mat"))
+ref.1 = data.1$references
+prob.1 = data.1$prob
+aic.1 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_1class_aic.txt")))
+data.1 = NULL
+
+data.2 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_ref.mat"))
+ref.2 = data.2$references
+prob.2 = data.2$prob
+aic.2 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_2class_aic.txt")))
+data.2 = NULL
+
+data.3 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_ref.mat"))
+ref.3 = data.3$references
+prob.3 = data.3$prob
+aic.3 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_3class_aic.txt")))
+data.3 = NULL
+
+data.4 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_ref.mat"))
+ref.4 = data.4$references
+prob.4 = data.4$prob
+aic.4 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_4class_aic.txt")))
+data.4 = NULL
+
+data.5 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_ref.mat"))
+ref.5 = data.5$references
+prob.5 = data.5$prob
+aic.5 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_5class_aic.txt")))
+data.5 = NULL
+
+data.6 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_ref.mat"))
+ref.6 = data.6$references
+prob.6 = data.6$prob
+aic.6 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_6class_aic.txt")))
+data.6 = NULL
+
+data.7 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_ref.mat"))
+ref.7 = data.7$references
+prob.7 = data.7$prob
+aic.7 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_7class_aic.txt")))
+data.7 = NULL
+
+data.8 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_ref.mat"))
+ref.8 = data.8$references
+prob.8 = data.8$prob
+aic.8 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_8class_aic.txt")))
+data.8 = NULL
+
+data.9 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_ref.mat"))
+ref.9 = data.9$references
+prob.9 = data.9$prob
+aic.9 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_9class_aic.txt")))
+data.9 = NULL
+
+data.10 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_ref.mat"))
+ref.10 = data.10$references
+prob.10 = data.10$prob
+aic.10 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_10class_aic.txt")))
+data.10 = NULL
+
+data.11 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_11class_ref.mat"))
+ref.11 = data.11$references
+prob.11 = data.11$prob
+aic.11 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_11class_aic.txt")))
+data.11 = NULL
+
+data.12 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_12class_ref.mat"))
+ref.12 = data.12$references
+prob.12 = data.12$prob
+aic.12 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_12class_aic.txt")))
+data.12 = NULL
+
+data.13 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_13class_ref.mat"))
+ref.13 = data.13$references
+prob.13 = data.13$prob
+aic.13 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_13class_aic.txt")))
+data.13 = NULL
+
+data.14 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_14class_ref.mat"))
+ref.14 = data.14$references
+prob.14 = data.14$prob
+aic.14 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_14class_aic.txt")))
+data.14 = NULL
+
+data.15 = read.references(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_15class_ref.mat"))
+ref.15 = data.15$references
+prob.15 = data.15$prob
+aic.15 = as.matrix(read.table(file.path("results", "bulk_sequencing", "ctcf_mnase_k562_15class_aic.txt")))
+data.15 = NULL
+
+ref = list(ref.15, ref.14, ref.13, ref.12, ref.11, ref.10, ref.9, ref.8, ref.7, ref.6, ref.5, ref.4, ref.3, ref.2, ref.1)
+prob = list(prob.15, prob.14, prob.13, prob.12, prob.11, prob.10, prob.9, prob.8, prob.7, prob.6, prob.5, prob.4, prob.3, prob.2,prob.1)
+aic = c(aic.15, aic.14, aic.13, aic.12, aic.11, aic.10, aic.9, aic.8, aic.7, aic.6, aic.5, aic.4, aic.3, aic.2, aic.1)
# number of runs
-n_run = length(data)
+n_run = length(ref)
# number of different classes overall
-n_class_tot = sum(unlist(lapply(data, nrow)))
+n_class_tot = sum(unlist(lapply(ref, nrow)))
# max value of K
-n_class_max = max(unlist(lapply(data, nrow)))
+n_class_max = max(unlist(lapply(ref, nrow)))
+
+# some colors
+colors = rep(brewer.pal(9, "Set1")[2], n_class_max)
# construct a matrix with all discovered references on the rows
-references = matrix(nrow=n_class_tot, ncol=ncol(data[[1]]))
-run_value = vector(length=n_class_tot)
-k_value = vector(length=n_class_tot)
+references = matrix(nrow=n_class_tot, ncol=ncol(ref[[1]]))
+run_value = vector(length=n_class_tot)
+k_value = vector(length=n_class_tot)
+probabilities = vector(length=n_class_tot)
k = 1
for(i in 1:n_run)
-{ for(j in 1:nrow(data[[i]]))
- { references[k,] = data[[i]][j,]
+{
+ for(j in 1:nrow(ref[[i]]))
+ { references[k,] = ref[[i]][j,]
+ probabilities[k] = prob[[i]][j]
run_value[k] = i
k_value[k] = j
k = k + 1
}
}
# distance matrix between all references
-distances = distance.ref(references)
+distances = distance.ref(references)
rownames(distances) = 1:nrow(distances)
colnames(distances) = 1:ncol(distances)
-
-plot.references(references, distances, n_run, run_value, n_class_max)
-savePlot("tmp_dnase.png")
-
+plot.references(file.path("results","bulk_sequencing", "ctcf_mnase.png"),
+ references, probabilities, colors, aic, distances, n_run, run_value, n_class_max)
diff --git a/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh b/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh
index 7c28f17..4414100 100755
--- a/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh
+++ b/scripts/bulk_sequencing/cluster_ctcf_dnase_k562.sh
@@ -1,20 +1,23 @@
results_dir='results/bulk_sequencing'
data_dir='data/bulk_sequencing/'
mkdir -p $results_dir
file_mnase=$data_dir'/ctcf_dnase_k562.mat'
+file_seed=$results_dir'/ctcf_dnase_k562_seed.txt'
n_iter='20'
n_shift='21'
-seed='12345678'
seeding='random'
+n_core=5
-for k in 2 3 4 5 6
+for k in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_dnase_k562_'$k'class_prob.mat4d'
file_ref=$results_dir/'ctcf_dnase_k562_'$k'class_ref.mat'
- file_aic=$results_dir/'ctcf_dnase_k562_'$k'class_aic.mat'
- bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed -p 6 > $file_prob
- bin/probToRef --data $file_mnase --prob $file_prob 1> $file_ref 2> $file_aic
+ file_aic=$results_dir/'ctcf_dnase_k562_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mnase --prob $file_prob --parallel $n_core 1> $file_ref 2> $file_aic
done
diff --git a/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh b/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh
index 1f3a3fd..29779c0 100755
--- a/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh
+++ b/scripts/bulk_sequencing/cluster_ctcf_mnase_k562.sh
@@ -1,20 +1,23 @@
results_dir='results/bulk_sequencing'
data_dir='data/bulk_sequencing/'
mkdir -p $results_dir
file_mnase=$data_dir'/ctcf_mnase_k562.mat'
+file_seed=$results_dir'/ctcf_mnase_k562_seed.txt'
n_iter='20'
n_shift='21'
-seed='12345678'
seeding='random'
+n_core=5
-for k in 2 3 4 5 6
+for k in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
do
+ seed=$(< /dev/urandom tr -dc _A-Z-a-z-0-9 | head -c${1:-15};echo)
file_prob=$results_dir/'ctcf_mnase_k562_'$k'class_prob.mat4d'
file_ref=$results_dir/'ctcf_mnase_k562_'$k'class_ref.mat'
- file_aic=$results_dir/'ctcf_mnase_k562_'$k'class_aic.mat'
- bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed -p 6 > $file_prob
- bin/probToRef --data $file_mnase --prob $file_prob 1> $file_ref 2> $file_aic
+ file_aic=$results_dir/'ctcf_mnase_k562_'$k'class_aic.txt'
+ echo "$file_prob $seed" >> $file_seed
+ bin/ChIPPartitioning --data $file_mnase --class $k --shift $n_shift --flip --iter $n_iter --seeding $seeding --seed $seed --parallel $n_core > $file_prob
+ bin/probToRef --data $file_mnase --prob $file_prob --parallel $n_core 1> $file_ref 2> $file_aic
done
diff --git a/scripts/bulk_sequencing/functions.R b/scripts/bulk_sequencing/functions.R
new file mode 100644
index 0000000..453b404
--- /dev/null
+++ b/scripts/bulk_sequencing/functions.R
@@ -0,0 +1,221 @@
+
+#' Reads a reference file and returns a list
+#' with the class references and the associated
+#' class probabilities.
+#' The file is parsed with read.table(). Its first column is
+#' taken as the class probabilities and all the remaining
+#' columns as the references.
+#' \param file the path to the file of interest.
+#' \return a list of two elements : "references"
+#' a matrix with the references on each row and
+#' "prob" the associated class probabilities.
+#'
+read.references = function(file)
+{ # as.matrix() has no 'drop' argument, the one formerly given here was
+  # silently ignored
+  ref  = as.matrix(read.table(file))
+  prob = ref[,1]
+  # drop=FALSE keeps a matrix even if a single row or column is left
+  ref  = ref[,-1, drop=FALSE]
+  return(list(references=ref, prob=prob))
+}
+
+#' Euclidean distance between two references, whatever their
+#' relative orientation.
+#' The distance is measured twice, once with ref2 as given and
+#' once with ref2 reversed, and the smallest value is returned.
+#' \param ref1 a vector containing the first reference.
+#' \param ref2 a vector containing the second reference.
+#' \return the euclidean distance.
+eucl.dist.ref = function(ref1, ref2)
+{
+  delta.fw = ref1 - ref2
+  delta.rv = ref1 - rev(ref2)
+  dist.fw  = sqrt(sum(delta.fw ^ 2))
+  dist.rv  = sqrt(sum(delta.rv ^ 2))
+  return(min(dist.fw, dist.rv))
+}
+
+
+#' Compute the correlation distance (1 - correlation) between
+#' two references.
+#' It also checks if a reference is in reverse orientation
+#' and returns the smallest distance value, that is the one
+#' obtained with the highest of both correlations.
+#' \param ref1 a vector containing the first reference.
+#' \param ref2 a vector containing the second reference.
+#' \return the correlation distance.
+cor.dist.ref = function(ref1, ref2)
+{
+  cor.fw = cor(ref1, ref2)
+  cor.rv = cor(ref1, rev(ref2))
+  # the smallest distance is given by the highest correlation : taking
+  # the min() of the correlations would return the largest distance
+  return(1 - max(cor.fw, cor.rv))
+}
+
+
+#' Computes the (euclidean) distance matrix for all the given
+#' references. As some references may be in reverse
+#' orientation compared to others, the distance in both
+#' orientations is computed, for each pair, and the best is
+#' returned (see eucl.dist.ref()).
+#' \param references a matrix with the references on each row.
+#' \return a symmetric matrix containing the distances between
+#' each pair of references. It has no row nor column labels.
+distance.ref = function(references)
+{ n = nrow(references)
+  d = matrix(nrow=n, ncol=n, data=0)
+
+  # seq_len() and not 1:n : without any reference 1:0 would
+  # iterate over c(1,0)
+  for(i in seq_len(n))
+  { for(j in seq_len(i))
+    { x = eucl.dist.ref(references[i,], references[j,])
+      # the matrix is symmetric, fill both triangles at once
+      d[i,j] = x
+      d[j,i] = x
+    }
+  }
+  return(d)
+}
+
+
+#' Matches the references of each run with the references of run 1.
+#' For each run other than run 1, the pair (reference of run 1,
+#' reference of the run) with the smallest distance is searched, both
+#' references are removed from the candidates and the search is repeated
+#' until all the references of the run have been matched.
+#' \param distances a distance matrix containing the distance between all
+#' references. The row and column labels have to be the row and column
+#' number (1, 2, 3, ...)!
+#' \param run_value a vector indicating to which run each reference (row
+#' and column of distances) belongs to. The run labelled 1 is the one all
+#' the others are matched to.
+#' \return a matrix with one row per match and 4 columns : the index of
+#' the run 1 reference, the index of the matched reference, the rank of
+#' the match within its run (1 for the first pair found) and the label of
+#' the run.
+get_matches = function(distances, run_value)
+{
+  matches = matrix(nrow=0, ncol=4)
+
+  # references of run i on the row -> y coord
+  # references of run j on the col -> x coord
+
+  # label of the run all the others are matched to
+  run_i   = 1
+  index_i = which(run_value == run_i) # rows of run i
+
+  for(run_j in setdiff(unique(run_value), run_i))
+  {
+    index_j = which(run_value == run_j) # columns of run j
+    n_j     = length(index_j)
+
+    i_taken = c() # classes of i already matched -> rows to ignore
+    j_taken = c() # classes of j already matched -> columns to ignore
+
+    # while not all classes in j have been assigned a best match
+    row_n = 1
+    while(length(j_taken) < n_j)
+    { rows = setdiff(index_i, i_taken)
+      cols = setdiff(index_j, j_taken)
+      if(length(rows) == 0)
+      { stop("run ", run_j, " has more references than run ", run_i, ", cannot match all of them") }
+      # drop=FALSE : keep a matrix, with its labels, even if a single row
+      # or column is left
+      distances_tmp = distances[rows, cols, drop=FALSE]
+      hits = which(distances_tmp == min(distances_tmp), arr.ind=TRUE)
+      # hits has one row per minimum, in case of ties only the first one is
+      # used (hits[2] would be the row of the 2nd minimum, not a column)
+      coord_i = as.numeric(rownames(distances_tmp)[hits[1,1]])
+      coord_j = as.numeric(colnames(distances_tmp)[hits[1,2]])
+      coord   = c(coord_i, coord_j, row_n, run_j)
+      i_taken = c(i_taken, coord_i)
+      j_taken = c(j_taken, coord_j)
+      matches = rbind(matches, coord)
+      row_n   = row_n + 1
+    }
+  }
+  return(matches)
+}
+
+
+
+#'Creates a composite figure in which several class references from
+#'several partitions, with different numbers of classes, are plotted.
+#'The figure is composed of a matrix of n_class_max rows and n_run
+#'columns (plus a row of column titles on top) where n_class_max is the
+#'highest number of classes in all partitions and n_run the number of
+#'different partitions.
+#'The first column contains the references of the 1st partition (run 1,
+#'the first n_class_max rows of references). The next columns contain
+#'the references of the other partitions. In a given column, except the
+#'1st one, the references are ordered (over the rows) such that each
+#'reference faces the 1st column reference it has been matched with by
+#'get_matches() (smallest euclidean distance first).
+#'The figure is saved as a 20x24 inches png image. The class colors are
+#'6 "Set1" colors returned by brewer.pal(), recycled if needed.
+#'\param file the file name where the image will be saved.
+#'\param references a matrix with the different references to draw on
+#'each row.
+#'\param probabilities a vector containing the class probability (or
+#'weight) associated to each corresponding reference (row) in references.
+#'\param col.titles a vector of values that will be displayed atop of each
+#'column of plots.
+#'\param distances a distance matrix containing the distance between all
+#'references. The row and column labels have to be the row and column
+#'number (1, 2, 3, ...)!
+#'\param n_run the total number of different partitions to which all
+#'references belong.
+#'\param run_value a vector indicating to which partition each reference
+#'(row of references) belong to. It should be a simple vector of integers,
+#'for instance 1,1,1,1,2,2,2,3,3
+#'\param n_class_max the highest number of classes searched in all partitions.
+plot.references = function(file, references, probabilities, col.titles, distances, n_run, run_value, n_class_max)
+{
+  colors = brewer.pal(6, "Set1")
+
+  # compute the best matches between all references to 1st run references
+  matches = get_matches(distances, run_value)
+
+  # make a matrix for layout with good plot numbers
+  plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles)
+  plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab)
+  plots.lab[-1,1] = 1:n_class_max # for run with max number of classes
+  z = n_class_max + 1
+  # seq_len() : with a single run there is no match and 1:0 would iterate over c(1,0)
+  for(i in seq_len(nrow(matches)))
+  { coord = matches[i,]
+    plots.lab[coord[1]+1, coord[4]] = z
+    z = z + 1
+  }
+  # the remaining cells will be the empty plots, count them to draw exactly
+  # as many empty plots as needed
+  n_empty = 0
+  for(i in 1:nrow(plots.lab))
+  { for(j in 1:ncol(plots.lab))
+    { if(is.na(plots.lab[i,j]))
+      { plots.lab[i,j] = z
+        z = z + 1
+        n_empty = n_empty + 1
+      }
+    }
+  }
+
+  # plot
+  png(filename=file, width=20, height=24, units="in", res=720)
+  # a grid
+  m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) )
+  x = 1:ncol(references)
+
+  # plot references of partition with highest number of classes
+  for(i in seq_len(n_class_max))
+  { # recycle the colors if there are more classes than colors
+    col = colors[(i - 1) %% length(colors) + 1]
+    plot(x=x, y=references[i,], lwd=3, type='l', ylim=c(0, 1.2*max(references[i,])),
+         col=col, main="", xlab="pos [bp]", ylab="Nb reads")
+    # prob
+    x_ = 0.85*length(references[i,])
+    y_ = max(references[i,])
+    lab = round(probabilities[i],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # plot others
+  for(i in seq_len(nrow(matches)))
+  { ref_index = matches[i,2]
+    col_index = matches[i,3]
+    col = colors[(col_index - 1) %% length(colors) + 1]
+    plot(x=x, y=references[ref_index,], lwd=3, type='l', ylim=c(0, 1.2*max(references[ref_index,])),
+         col=col, main="", xlab="pos [bp]", ylab="Nb reads")
+    # prob
+    x_ = 0.85*length(references[ref_index,])
+    y_ = max(references[ref_index,])
+    lab = round(probabilities[ref_index],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # empty plots
+  # seq_len() : if the grid is full, (k+1):k would count backward and draw
+  # two plots in the cells reserved for the column titles
+  for(i in seq_len(n_empty))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") }
+
+  # col titles
+  p = par(mar=c(0,0,0,0))
+  for(i in seq_along(col.titles))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n")
+    text(1,1, labels=col.titles[i], cex=2)
+  }
+  par(p)
+  dev.off()
+}
+
+
diff --git a/scripts/bulk_sequencing/prepare_data.R b/scripts/bulk_sequencing/prepare_data.R
index ad4b79a..b7ee0ab 100755
--- a/scripts/bulk_sequencing/prepare_data.R
+++ b/scripts/bulk_sequencing/prepare_data.R
@@ -1,13 +1,21 @@
setwd(file.path("/", "local", "groux", "scATAC-seq"))
-dnase1 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat")))
-dnase2 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat")))
-dnase3 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep3.mat")))
+# DNaseI around CTCF
+# one matrix per replicate. The 2nd matrix used to be read from the rep1 file
+# again, which counted replicate 1 twice in the sum below.
+ctcf.dnase1 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep1.mat")))
+ctcf.dnase2 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep2.mat")))
+ctcf.dnase3 = as.matrix(read.table(file.path("data", "bulk_sequencing", "ctcf_dnase_k562_rep3.mat")))
+
+# DNaseI around TSS
+# one matrix per replicate, as above
+tss.dnase1 = as.matrix(read.table(file.path("data", "bulk_sequencing", "tss_dnase_k562_rep1.mat")))
+tss.dnase2 = as.matrix(read.table(file.path("data", "bulk_sequencing", "tss_dnase_k562_rep2.mat")))
+tss.dnase3 = as.matrix(read.table(file.path("data", "bulk_sequencing", "tss_dnase_k562_rep3.mat")))
# sum everything to increase coverage
-dnase = dnase1 + dnase2 + dnase3
+ctcf.dnase = ctcf.dnase1 + ctcf.dnase2 + ctcf.dnase3
+tss.dnase = tss.dnase1 + tss.dnase2 + tss.dnase3
# write the new tables
-write.table(dnase, file=file.path("data", "bulk_sequencing", "ctcf_dnase_k562.mat"), col.names=F, row.names=F, quote=F, eol='\n', sep='\t')
+write.table(ctcf.dnase, file=file.path("data", "bulk_sequencing", "ctcf_dnase_k562.mat"), col.names=F, row.names=F, quote=F, eol='\n', sep='\t')
+write.table(tss.dnase, file=file.path("data", "bulk_sequencing", "tss_dnase_k562.mat"), col.names=F, row.names=F, quote=F, eol='\n', sep='\t')
diff --git a/scripts/functions.R b/scripts/functions.R
new file mode 100644
index 0000000..d981bc0
--- /dev/null
+++ b/scripts/functions.R
@@ -0,0 +1,511 @@
+
+#' Reads a reference file and returns a list
+#' with the class references and the associated
+#' class probabilities.
+#' The file is read with read.table(). The values of the 1st
+#' column are the class probabilities, the values of the other
+#' columns are the references.
+#' \param file the path to the file of interest.
+#' \return a list of two elements : "references"
+#' a matrix with the references on each row and
+#' "prob" the associated class probabilities.
+#'
+read.references = function(file)
+{ # no 'drop' argument here : as.matrix() does not have one and the
+  # value formerly given was silently ignored
+  ref  = as.matrix(read.table(file))
+  prob = ref[,1]
+  # drop=FALSE : the references remain a matrix even if a single row or
+  # column is left
+  ref  = ref[,-1, drop=FALSE]
+  return(list(references=ref, prob=prob))
+}
+
+#' Orientation independent euclidean distance between two references.
+#' Two distances are computed : ref1 against ref2 and ref1 against
+#' ref2 read backward. The smallest one is returned.
+#' \param ref1 a vector containing the first reference.
+#' \param ref2 a vector containing the second reference.
+#' \return the euclidean distance.
+eucl.dist.ref = function(ref1, ref2)
+{
+  # distance with ref2 as given
+  forward  = sqrt(sum((ref1 - ref2) ^ 2))
+  # distance with ref2 in reverse orientation
+  backward = sqrt(sum((ref1 - rev(ref2)) ^ 2))
+  return(min(forward, backward))
+}
+
+
+#' Compute the correlation distance (1 - correlation) between
+#' two references.
+#' It also checks if a reference is in reverse orientation
+#' and returns the smallest distance value, that is the one
+#' corresponding to the highest of both correlations.
+#' \param ref1 a vector containing the first reference.
+#' \param ref2 a vector containing the second reference.
+#' \return the correlation distance.
+cor.dist.ref = function(ref1, ref2)
+{
+  cor.fw = cor(ref1, ref2)
+  cor.rv = cor(ref1, rev(ref2))
+  # 1 - min(correlations) would be the largest distance, the smallest
+  # one is obtained with the highest correlation
+  return(1 - max(cor.fw, cor.rv))
+}
+
+
+#' Computes the (euclidean) distance matrix for all the given
+#' references. As some references may be in reverse
+#' orientation compared to others, the distance in both
+#' orientations is computed, for each pair, and the best is
+#' returned (see eucl.dist.ref()).
+#' \param references a matrix with the references on each row.
+#' \return a symmetric matrix containing the distances between
+#' each pair of references. It has no row nor column labels.
+distance.ref = function(references)
+{ n = nrow(references)
+  d = matrix(nrow=n, ncol=n, data=0)
+
+  # seq_len() and not 1:n : 1:0 is c(1,0), which would make the loops
+  # run on an empty matrix
+  for(i in seq_len(n))
+  { for(j in seq_len(i))
+    { x = eucl.dist.ref(references[i,], references[j,])
+      # symmetric matrix, both triangles are filled at once
+      d[i,j] = x
+      d[j,i] = x
+    }
+  }
+  return(d)
+}
+
+
+#' Matches the references of each run with the references of run 1.
+#' For each run other than run 1, the pair (reference of run 1,
+#' reference of the run) with the smallest distance is searched, both
+#' references are removed from the candidates and the search is repeated
+#' until all the references of the run have been matched.
+#' \param distances a distance matrix containing the distance between all
+#' references. The row and column labels have to be the row and column
+#' number (1, 2, 3, ...)!
+#' \param run_value a vector indicating to which run each reference (row
+#' and column of distances) belongs to. The run labelled 1 is the one all
+#' the others are matched to.
+#' \return a matrix with one row per match and 4 columns : the index of
+#' the run 1 reference, the index of the matched reference, the rank of
+#' the match within its run (1 for the first pair found) and the label of
+#' the run.
+get_matches = function(distances, run_value)
+{
+  matches = matrix(nrow=0, ncol=4)
+
+  # references of run i on the row -> y coord
+  # references of run j on the col -> x coord
+
+  # label of the run all the others are matched to
+  run_i   = 1
+  index_i = which(run_value == run_i) # rows of run i
+
+  for(run_j in setdiff(unique(run_value), run_i))
+  {
+    index_j = which(run_value == run_j) # columns of run j
+    n_j     = length(index_j)
+
+    i_taken = c() # classes of i already matched -> rows to ignore
+    j_taken = c() # classes of j already matched -> columns to ignore
+
+    # while not all classes in j have been assigned a best match
+    row_n = 1
+    while(length(j_taken) < n_j)
+    { # when nothing has been matched yet, these are all the rows and columns
+      rows = setdiff(index_i, i_taken)
+      cols = setdiff(index_j, j_taken)
+      if(length(rows) == 0)
+      { stop("run ", run_j, " has more references than run ", run_i, ", cannot match all of them") }
+      distances_tmp = distances[rows, cols, drop=FALSE]
+      hits = which(distances_tmp == min(distances_tmp), arr.ind=TRUE)
+      # hits has one row per minimum, in case of ties only the first one is
+      # used (hits[2] would be the row of the 2nd minimum, not a column)
+      coord_i = as.numeric(rownames(distances_tmp)[hits[1,1]])
+      coord_j = as.numeric(colnames(distances_tmp)[hits[1,2]])
+      coord   = c(coord_i, coord_j, row_n, run_j)
+      i_taken = c(i_taken, coord_i)
+      j_taken = c(j_taken, coord_j)
+      matches = rbind(matches, coord)
+      row_n   = row_n + 1
+    }
+  }
+  return(matches)
+}
+
+
+
+#'Creates a composite figure in which several class references from
+#'several partitions, with different numbers of classes, are plotted.
+#'The figure is composed of a matrix of n_class_max rows and n_run
+#'columns (plus a row of column titles on top) where n_class_max is the
+#'highest number of classes in all partitions and n_run the number of
+#'different partitions.
+#'The first column contains the references of the 1st partition (run 1,
+#'the first n_class_max rows of references). The next columns contain
+#'the references of the other partitions. In a given column, except the
+#'1st one, the references are ordered (over the rows) such that each
+#'reference faces the 1st column reference it has been matched with by
+#'get_matches() (smallest euclidean distance first).
+#'\param file the file name where the image will be saved.
+#'\param references a matrix with the different references to draw on
+#'each row.
+#'\param probabilities a vector containing the class probability (or
+#'weight) associated to each corresponding reference (row) in references.
+#'\param colors a vector of colors to draw the class profiles. There should
+#'be n_class_max colors, they can be the same.
+#'\param col.titles a vector of values that will be displayed atop of each
+#'column of plots.
+#'\param distances a distance matrix containing the distance between all
+#'references. The row and column labels have to be the row and column
+#'number (1, 2, 3, ...)!
+#'\param n_run the total number of different partitions to which all
+#'references belong.
+#'\param run_value a vector indicating to which partition each reference
+#'(row of references) belong to. It should be a simple vector of integers,
+#'for instance 1,1,1,1,2,2,2,3,3
+#'\param n_class_max the highest number of classes searched in all partitions.
+#'\param width the width of the image, in inches.
+#'\param height the height of the image, in inches.
+plot.references = function(file,
+                           references,
+                           probabilities,
+                           colors,
+                           col.titles,
+                           distances,
+                           n_run,
+                           run_value,
+                           n_class_max,
+                           width=15,
+                           height=18)
+{
+  # compute the best matches between all references to 1st run references
+  matches = get_matches(distances, run_value)
+
+  # make a matrix for layout with good plot numbers
+  plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles)
+  plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab)
+  plots.lab[-1,1] = 1:n_class_max # for run with max number of classes
+  z = n_class_max + 1
+  # seq_len() : with a single run there is no match and 1:0 would iterate over c(1,0)
+  for(i in seq_len(nrow(matches)))
+  { coord = matches[i,]
+    plots.lab[coord[1]+1, coord[4]] = z
+    z = z + 1
+  }
+  # the remaining cells will be the empty plots, count them to draw exactly
+  # as many empty plots as needed
+  n_empty = 0
+  for(i in 1:nrow(plots.lab))
+  { for(j in 1:ncol(plots.lab))
+    { if(is.na(plots.lab[i,j]))
+      { plots.lab[i,j] = z
+        z = z + 1
+        n_empty = n_empty + 1
+      }
+    }
+  }
+
+  # plot
+  png(filename=file, width=width, height=height, units="in", res=720)
+  # a grid
+  # (the layout.show(m) debugging call that used to follow has been removed :
+  # it rendered a full page of cell outlines before the figure was drawn)
+  m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) )
+  x = 1:ncol(references)
+
+  # plot references of partition with highest number of classes
+  for(i in seq_len(n_class_max))
+  { plot(x=x, y=references[i,], lwd=2, type='l', ylim=c(0, 1.2*max(references[i,])),
+         col=colors[i], main="", xlab="pos [bp]", ylab="Nb reads")
+    # prob
+    x_ = 0.85*length(references[i,])
+    y_ = max(references[i,])
+    lab = round(probabilities[i],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # plot others
+  for(i in seq_len(nrow(matches)))
+  { ref_index = matches[i,2]
+    col_index = matches[i,3]
+    plot(x=x, y=references[ref_index,], lwd=2, type='l', ylim=c(0, 1.2*max(references[ref_index,])),
+         col=colors[col_index], main="", xlab="pos [bp]", ylab="Nb reads")
+    # prob
+    x_ = 0.85*length(references[ref_index,])
+    y_ = max(references[ref_index,])
+    lab = round(probabilities[ref_index],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # empty plots
+  # seq_len() : if the grid is full, (k+1):k would count backward and draw
+  # two plots in the cells reserved for the column titles
+  for(i in seq_len(n_empty))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") }
+
+  # col titles
+  p = par(mar=c(0,0,0,0))
+  for(i in seq_along(col.titles))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n")
+    text(1,1, labels=col.titles[i], cex=2)
+  }
+  par(p)
+  dev.off()
+}
+
+
+
+
+#'Creates the same kind of composite figure as plot.references() but with
+#'several sets of references overlaid in each panel.
+#'Each profile is divided by its own maximum before being drawn, all the
+#'panels have a y-axis going from 0 to 1.2.
+#'The matching of the references and the organisation of the panels are
+#'done as in plot.references().
+#'\param file the file name where the image will be saved. If NULL, the
+#'figure is drawn in a X11 window instead, which is left open.
+#'\param references a list of matrices with the different references to
+#'draw on each row. The i-th rows of all the matrices are drawn together,
+#'in the same panel.
+#'\param probabilities a vector containing the class probability (or
+#'weight) displayed in the panel of each corresponding reference (row).
+#'\param colors a vector of colors, the j-th color is used to draw the
+#'profiles of the j-th matrix of references.
+#'\param col.titles a vector of values that will be displayed atop of each
+#'column of plots.
+#'\param distances a distance matrix containing the distance between all
+#'references. The row and column labels have to be the row and column
+#'number (1, 2, 3, ...)!
+#'\param n_run the total number of different partitions to which all
+#'references belong.
+#'\param run_value a vector indicating to which partition each reference
+#'(row of the matrices in references) belong to.
+#'\param n_class_max the highest number of classes searched in all partitions.
+#'\param width the width of the figure, in inches.
+#'\param height the height of the figure, in inches.
+plot.references.2 = function(file,
+                             references,
+                             probabilities,
+                             colors,
+                             col.titles,
+                             distances,
+                             n_run,
+                             run_value,
+                             n_class_max,
+                             width=15,
+                             height=18)
+{
+  # compute the best matches between all references to 1st run references
+  matches = get_matches(distances, run_value)
+
+  # make a matrix for layout with good plot numbers
+  plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles)
+  plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab)
+  plots.lab[-1,1] = 1:n_class_max # for run with max number of classes
+  z = n_class_max + 1
+  # seq_len() : with a single run there is no match and 1:0 would iterate over c(1,0)
+  for(i in seq_len(nrow(matches)))
+  { coord = matches[i,]
+    plots.lab[coord[1]+1, coord[4]] = z
+    z = z + 1
+  }
+  # the remaining cells will be the empty plots, count them to draw exactly
+  # as many empty plots as needed
+  n_empty = 0
+  for(i in 1:nrow(plots.lab))
+  { for(j in 1:ncol(plots.lab))
+    { if(is.na(plots.lab[i,j]))
+      { plots.lab[i,j] = z
+        z = z + 1
+        n_empty = n_empty + 1
+      }
+    }
+  }
+
+  # plot
+  if(is.null(file))
+  { X11(width=width, height=height) } else
+  { png(filename=file, width=width, height=height, units="in", res=720) }
+  # a grid
+  m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) )
+  x = 1:ncol(references[[1]])
+  # all the profiles are scaled by their maximum
+  ylim = c(0, 1.2)
+
+  # plot references of partition with highest number of classes
+  for(i in seq_len(n_class_max))
+  { for(j in seq_along(references))
+    { y = references[[j]][i,]/max(references[[j]][i,])
+      # the 1st profile opens the panel, the others are added to it
+      if(j == 1)
+      { plot(x=x, y=y, lwd=2, type='l', ylim=ylim,
+             col=colors[j], main="", xlab="pos [bp]", ylab="Nb reads")
+      } else
+      { lines(x=x, y=y, lwd=2, type='l', col=colors[j]) }
+    }
+    # prob
+    x_ = 0.85*length(references[[1]][i,])
+    y_ = 0.85
+    lab = round(probabilities[i],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # plot others
+  for(i in seq_len(nrow(matches)))
+  { ref_index = matches[i,2]
+    for(j in seq_along(references))
+    { y = references[[j]][ref_index,]/max(references[[j]][ref_index,])
+      if(j == 1)
+      { plot(x=x, y=y, lwd=2, type='l', ylim=ylim,
+             col=colors[j], main="", xlab="pos [bp]", ylab="Nb reads")
+      } else
+      { lines(x=x, y=y, lwd=2, col=colors[j]) }
+    }
+    # prob
+    x_ = 0.85*length(references[[1]][ref_index,])
+    y_ = 0.85
+    lab = round(probabilities[ref_index],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # empty plots
+  # seq_len() : if the grid is full, (k+1):k would count backward and draw
+  # two plots in the cells reserved for the column titles
+  for(i in seq_len(n_empty))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") }
+
+  # col titles
+  p = par(mar=c(0,0,0,0))
+  for(i in seq_along(col.titles))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n")
+    text(1,1, labels=col.titles[i], cex=2)
+  }
+  par(p)
+  if(!is.null(file))
+  { dev.off() }
+}
+
+
+#'Creates the same composite figure as plot.references.2() but without
+#'margins, axes nor axis labels around the panels.
+#'Several sets of references are overlaid in each panel. Each profile is
+#'divided by its own maximum before being drawn, all the panels have a
+#'y-axis going from 0 to 1.2.
+#'\param file the file name where the image will be saved. If NULL, the
+#'figure is drawn in a X11 window instead, which is left open.
+#'\param references a list of matrices with the different references to
+#'draw on each row. The i-th rows of all the matrices are drawn together,
+#'in the same panel.
+#'\param probabilities a vector containing the class probability (or
+#'weight) displayed in the panel of each corresponding reference (row).
+#'\param colors a vector of colors, the j-th color is used to draw the
+#'profiles of the j-th matrix of references.
+#'\param col.titles a vector of values that will be displayed atop of each
+#'column of plots.
+#'\param distances a distance matrix containing the distance between all
+#'references. The row and column labels have to be the row and column
+#'number (1, 2, 3, ...)!
+#'\param n_run the total number of different partitions to which all
+#'references belong.
+#'\param run_value a vector indicating to which partition each reference
+#'(row of the matrices in references) belong to.
+#'\param n_class_max the highest number of classes searched in all partitions.
+#'\param width the width of the figure, in inches.
+#'\param height the height of the figure, in inches.
+plot.references.3 = function(file,
+                             references,
+                             probabilities,
+                             colors,
+                             col.titles,
+                             distances,
+                             n_run,
+                             run_value,
+                             n_class_max,
+                             width=15,
+                             height=18)
+{
+  # compute the best matches between all references to 1st run references
+  matches = get_matches(distances, run_value)
+
+  # make a matrix for layout with good plot numbers
+  plots.lab = matrix(nrow=n_class_max+1, ncol=n_run) # the 1st row will be filled last with only text (col.titles)
+  plots.lab[1,] = (length(plots.lab) - ncol(plots.lab) + 1) : length(plots.lab)
+  plots.lab[-1,1] = 1:n_class_max # for run with max number of classes
+  z = n_class_max + 1
+  # seq_len() : with a single run there is no match and 1:0 would iterate over c(1,0)
+  for(i in seq_len(nrow(matches)))
+  { coord = matches[i,]
+    plots.lab[coord[1]+1, coord[4]] = z
+    z = z + 1
+  }
+  # the remaining cells will be the empty plots, count them to draw exactly
+  # as many empty plots as needed
+  n_empty = 0
+  for(i in 1:nrow(plots.lab))
+  { for(j in 1:ncol(plots.lab))
+    { if(is.na(plots.lab[i,j]))
+      { plots.lab[i,j] = z
+        z = z + 1
+        n_empty = n_empty + 1
+      }
+    }
+  }
+
+  # plot
+  if(is.null(file))
+  { X11(width=width, height=height) } else
+  { png(filename=file, width=width, height=height, units="in", res=720) }
+
+  # no margin around the panels, the previous settings are restored at the end
+  p = par(mar=c(0,0,0,0))
+
+  # a grid
+  m = layout(mat = plots.lab, heights=c(0.3, rep(1, nrow(plots.lab)-1)) )
+  x = 1:ncol(references[[1]])
+  # all the profiles are scaled by their maximum
+  ylim = c(0, 1.2)
+
+  # plot references of partition with highest number of classes
+  for(i in seq_len(n_class_max))
+  { for(j in seq_along(references))
+    { y = references[[j]][i,]/max(references[[j]][i,])
+      # the 1st profile opens the panel, the others are added to it
+      if(j == 1)
+      { plot(x=x, y=y, lwd=2, type='l', ylim=ylim,
+             col=colors[j], main='', xlab='', ylab='',
+             xaxt='n', yaxt='n')
+      } else
+      { lines(x=x, y=y, lwd=2, type='l', col=colors[j]) }
+    }
+    # prob
+    x_ = 0.85*length(references[[1]][i,])
+    y_ = 0.85
+    lab = round(probabilities[i],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # plot others
+  for(i in seq_len(nrow(matches)))
+  { ref_index = matches[i,2]
+    for(j in seq_along(references))
+    { y = references[[j]][ref_index,]/max(references[[j]][ref_index,])
+      if(j == 1)
+      { plot(x=x, y=y, lwd=2, type='l', ylim=ylim,
+             col=colors[j], main='', xlab='', ylab='',
+             xaxt='n', yaxt='n')
+      } else
+      { lines(x=x, y=y, lwd=2, col=colors[j]) }
+    }
+    # prob
+    x_ = 0.85*length(references[[1]][ref_index,])
+    y_ = 0.85
+    lab = round(probabilities[ref_index],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # empty plots
+  # seq_len() : if the grid is full, (k+1):k would count backward and draw
+  # two plots in the cells reserved for the column titles
+  for(i in seq_len(n_empty))
+  { plot(1,1,xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n") }
+
+  # col titles
+  for(i in seq_along(col.titles))
+  { plot(1,1, xlab="", ylab="", main="", col=0, xaxt="n", yaxt="n", bty="n")
+    text(1,1, labels=col.titles[i], cex=2)
+  }
+  par(p)
+  if(!is.null(file))
+  { dev.off() }
+}
+
+
+#'Draws the references of a single partition, one class per row of panels,
+#'without margins nor axes.
+#'Several sets of references are overlaid in each panel. Each profile is
+#'divided by its own maximum before being drawn, all the panels have a
+#'y-axis going from 0 to 1.2.
+#'\param file the file name where the image will be saved. If NULL, the
+#'figure is drawn in a X11 window instead, which is left open.
+#'\param references a list of matrices with the references on each row.
+#'The i-th rows of all the matrices are drawn together, in the i-th panel.
+#'The number of classes and of positions are taken from the 1st matrix.
+#'\param probabilities a vector containing the class probability (or
+#'weight) displayed in the panel of each class.
+#'\param colors a vector of colors, the j-th color is used to draw the
+#'profiles of the j-th matrix of references.
+#'\param width the width of the figure, in inches.
+#'\param height the height of the figure, in inches.
+plot.references.4 = function(file,
+                             references,
+                             probabilities,
+                             colors,
+                             width=15,
+                             height=18)
+{
+  n_class = nrow(references[[1]])
+  n_col   = ncol(references[[1]])
+  # a single column of panels, one per class
+  mat     = matrix(nrow=n_class, ncol=1, data=1:n_class)
+
+  # plot
+  if(is.null(file))
+  { X11(width=width, height=height) } else
+  { png(filename=file, width=width, height=height, units="in", res=720) }
+
+  # no margin around the panels, the previous settings are restored at the end
+  p = par(mar=c(0,0,0,0))
+
+  # a grid
+  m = layout(mat = mat)
+  x = 1:n_col
+  # all the profiles are scaled by their maximum
+  ylim = c(0, 1.2)
+
+  for(i in seq_len(n_class))
+  { for(j in seq_along(references))
+    { y = references[[j]][i,]/max(references[[j]][i,])
+      # the 1st profile opens the panel, the others are added to it
+      if(j == 1)
+      { plot(x=x, y=y, lwd=2, type='l', ylim=ylim,
+             col=colors[j], main='', xlab='', ylab='',
+             xaxt='n', yaxt='n')
+      } else
+      { lines(x=x, y=y, lwd=2, type='l', col=colors[j]) }
+    }
+    # prob
+    x_ = 0.85*length(references[[1]][i,])
+    y_ = 0.85
+    lab = round(probabilities[i],3)
+    text(x=x_, y=y_, labels=lab, cex=1.2)
+  }
+
+  # the margins were saved but never restored, which left the X11 device
+  # without margins for whatever is drawn in it next
+  par(p)
+  if(!is.null(file))
+  { dev.off() }
+}
+
diff --git a/scripts/install_libraries/install_libSeqAn.sh b/scripts/install_libraries/install_libSeqAn.sh
new file mode 100644
index 0000000..1bdd88f
--- /dev/null
+++ b/scripts/install_libraries/install_libSeqAn.sh
@@ -0,0 +1,14 @@
+# install the header only SeqAn library
+
+library_dir='lib/seqan'
+
+# clone git
+git clone https://github.com/seqan/seqan.git
+cd seqan
+#install
+mkdir -p ../$library_dir
+## header files
+mv * ../$library_dir
+cd ..
+# clean
+rm -rf seqan
diff --git a/scripts/install_libraries/run_all.sh b/scripts/install_libraries/run_all.sh
new file mode 100644
index 0000000..6251f3d
--- /dev/null
+++ b/scripts/install_libraries/run_all.sh
@@ -0,0 +1,5 @@
+mkdir lib/
+mkdir lib/include
+
+scripts/install_libraries/install_libStatGen.sh
+
diff --git a/scripts/install_programs/install_deeptools.sh b/scripts/install_programs/install_deeptools.sh
new file mode 100644
index 0000000..9df2c69
--- /dev/null
+++ b/scripts/install_programs/install_deeptools.sh
@@ -0,0 +1,8 @@
+
+# make sure that pip is installed for python3.6
+# curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+# sudo python3.6 get-pip.py --force-reinstall
+
+# install deeptools for python3.6
+sudo pip3.6 install deeptools
+
diff --git a/scripts/install_programs/run_all.sh b/scripts/install_programs/run_all.sh
new file mode 100644
index 0000000..fd23b47
--- /dev/null
+++ b/scripts/install_programs/run_all.sh
@@ -0,0 +1 @@
+scripts/install_programs/install_deeptools.sh
diff --git a/scripts/plot_references.R b/scripts/plot_references.R
deleted file mode 100644
index 1c7c10a..0000000
--- a/scripts/plot_references.R
+++ /dev/null
@@ -1,27 +0,0 @@
-data_rand = as.matrix(read.table("mnase_random.txt", h=F))
-data_rand_new = as.matrix(read.table("mnase_random_new.txt", h=F))
-data_sampling = as.matrix(read.table("mnase_sampling.txt", h=F))
-data_rand_r = as.matrix(read.table("mnase_R.txt", h=F))
-
-par(mfrow=c(2,2))
-
-x = 1:ncol(data_rand_r)
-plot(x, data_rand_r[1,], type='l', lwd=3, ylim=c(min(data_rand_r), max(data_rand_r)), main="R Random seeding")
-for(i in 2:nrow(data_rand_r))
-{lines(x, data_rand_r[i,], lwd=3, col=i) }
-
-x = 1:ncol(data_rand)
-plot(x, data_rand[1,], type='l', lwd=3, ylim=c(min(data_rand), max(data_rand)), main="C++ Random seeding")
-for(i in 2:nrow(data_rand))
-{lines(x, data_rand[i,], lwd=3, col=i) }
-
-x = 1:ncol(data_rand_new)
-plot(x, data_rand_new[1,], type='l', lwd=3, ylim=c(min(data_rand_new), max(data_rand_new)), main="C++ New random seeding")
-for(i in 2:nrow(data_rand_new))
-{lines(x, data_rand_new[i,], lwd=3, col=i) }
-
-x = 1:ncol(data_sampling)
-plot(x, data_sampling[1,], type='l', lwd=3, ylim=c(min(data_sampling), max(data_sampling)), main="C++ Sampling seeding")
-for(i in 2:nrow(data_sampling))
-{lines(x, data_sampling[i,], lwd=3, col=i) }
-
diff --git a/scripts/run_all.sh b/scripts/run_all.sh
index 55b002e..a698e65 100755
--- a/scripts/run_all.sh
+++ b/scripts/run_all.sh
@@ -1,2 +1,11 @@
-scripts/simulate_chipseq_data/run_all.sh
+# install programs
+scripts/install_programs/run_all.sh
+
+# install libraries
+scripts/install_libraries/run_all.sh
+
+# simulate data for testing purposes
+scripts/generate_toy_data/run_all.sh
+
+
diff --git a/scripts/simulate_chipseq_data/run_all.sh b/scripts/simulate_chipseq_data/run_all.sh
deleted file mode 100755
index 716b134..0000000
--- a/scripts/simulate_chipseq_data/run_all.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-Rscript scripts/simulate_chipseq_data/simulate_data_chipseq.R
-
diff --git a/scripts/toy_data/generate_bam_file.py b/scripts/toy_data/generate_bam_file.py
new file mode 100644
index 0000000..13c7419
--- /dev/null
+++ b/scripts/toy_data/generate_bam_file.py
@@ -0,0 +1,238 @@
+
+# This toy dataset contains 2 chromosomes with exactly the same fragments
+# on each one of them. Each fragment belongs to 1 cell. For each chromosome, the situation is
+# the following.
+# In the following picture, the fragments are depicted as [from,to) intervals. Thus the fragments
+# contain the from position but do NOT contain the to position. Each fragment is present 2x,
+# once with the fw read as the 1st read of the pair, once with the rv read as the 1st read of the pair.
+#
+# Each fragment is composed of two reads of 35bp each: a fw read starting at the fragment
+# start and a rv read ending at the fragment end.
+#
+# <-------------------------->
+# AAAAAAA
+# ------> <------
+# TTTTTTT
+# read fw (35bp) read rv (35bp)
+#
+# The genome is only made of A on the fw strand and T on the rv strand.
+#
+# 550 650 750 850 950 1050 1150 1250 1350 1450
+# | | | | | | | | | |
+# --------------------------------------------------------------------------------------------------> chrom
+# 400 480 | | | | | | | | | |
+# cell 0 <-----> | | | | | | | | | |
+# 480 550 | | | | | | | | |
+# cell 1 <----->| | | | | | | | | |
+# | 560 | | 800 | | | | | | |
+# cell 2 | <------------------> | | | | | | |
+# | 560 640| | | | | | | | |
+# cell 3 | <----->| | | | | | | | |
+# | 610| 690 | | | | | | | |
+# cell 4 | <-----> | | | | | | | |
+# | |670 750 | | | | | | |
+# cell 5 | | <-----> | | | | | | |
+# | | 730 810 | | | | | | |
+# cell 6 | | <-----> | | | | | | |
+# | | | 770 850 | | | | | |
+# cell 7 | | | <-----> | | | | | |
+# | | | | 950 | 1150 | | |
+# cell 8 | | | | <-----------------> | | |
+# | | | | |960 1040 | | | |
+# cell 9 | | | | | <----->| | | | |
+# | | | | | 1010 1090 | | | |
+# cell 10 | | | | | <-----> | | | |
+# | | | | | |1060 1140 | | |
+# cell 11 | | | | | | <----->| | | |
+# | | | | | | 1070 1150 | | |
+# cell 12 | | | | | | <-----> | | |
+# | | | | | | | | 1350 1430|
+# cell 13 | | | | | | | | <-----> |
+# | | | | | | | | |1360 1440
+# cell 14 | | | | | | | | | <----->|
+# | | | | | | | | | 1410 | 1490
+# cell 15 | | | | | | | | | <----->
+# | | | | | | | | | | 1500 1600
+# cell 16 | | | | | | | | | | <------------>
+# | | | | | | | | | | 1600 1700
+# cell 17 | | | | | | | | | | <------------>
+
+import pysam
+import os
+
+def create_read_pair(frag_start, ref_id, frag_len):
+ # Builds the four reads describing one fragment and returns them as
+ # ((read_fw1, read_rv1), (read_fw2, read_rv2)), i.e. twice the same fragment.
+ # In the 1st pair the fw read is the 1st read and the rv read the 2nd, in the
+ # 2nd pair the rv read is the 1st read and the fw read the 2nd.
+ # frag_start is the fragment leftmost position, ref_id the chromosome index.
+ # ---------------------------------> reference
+ # read_fw1 read_rv1
+ # --------> <--------
+ # |start end|
+ # |-----------frag len-----------|
+ # |end start|
+ # read_fw2 read_rv2
+ # --------> <--------
+ # NOTE(review): n is read once but the counter is incremented twice below, so successive calls get n = 0, 2, 4, ... - confirm this is intended
+ n = create_read_pair.counter
+
+ # the reads
+ read_fw1 = pysam.AlignedSegment()
+ read_rv1 = pysam.AlignedSegment()
+ read_fw2 = pysam.AlignedSegment()
+ read_rv2 = pysam.AlignedSegment()
+
+ # the start of the reverse mate (rightmost pos)
+ # start_rv = frag_start + frag_len - 1
+
+ # the start of the reverse mate (leftmost pos), 35 being the read length
+ start_rv = frag_start + frag_len - 35
+ print("%d %d" % (frag_start, start_rv))
+
+ # flags
+ flag_fw1 = 99 # paired read, read mapped in proper pair
+ # mate rev strand, first in pair
+ flag_rv1 = 147 # paired read, read mapped in proper pair
+ # read in rev strand, second in pair
+
+ flag_fw2 = 163 # paired read, read mapped in proper pair
+ # mate rev strand, second in pair
+ flag_rv2 = 83 # paired read, read mapped in proper pair
+ # read in rev strand, first in pair
+
+ # optional field tags
+ extra_tags = (("NM", 1), # edit distance with ref
+ ("RG", "L1"), # read group
+ ("CB", "cell_%d" % n)) # cell barcode
+
+ # pair 1 : 1st read fw, 2nd read rev
+ ## fw read
+ read_fw1.query_name = "read_fw1_%d" % n
+ read_fw1.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ read_fw1.flag = flag_fw1
+ read_fw1.reference_id = ref_id
+ read_fw1.reference_start = frag_start
+ read_fw1.mapping_quality = 20
+ # read_fw1.cigar = ((0,10), (2,1), (0,25))
+ read_fw1.next_reference_id = ref_id
+ read_fw1.next_reference_start = start_rv
+ read_fw1.template_length = frag_len
+ read_fw1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ read_fw1.tags = extra_tags
+
+ ## rev read
+ read_rv1.query_name = "read_rv1_%d" % n
+ read_rv1.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ read_rv1.flag = flag_rv1
+ read_rv1.reference_id = ref_id
+ read_rv1.reference_start = start_rv
+ read_rv1.mapping_quality = 20
+ # read_rv1.cigar = ((0,10), (2,1), (0,25))
+ read_rv1.next_reference_id = ref_id
+ read_rv1.next_reference_start = frag_start
+ read_rv1.template_length = -frag_len
+ read_rv1.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ read_rv1.tags = extra_tags
+ create_read_pair.counter += 1
+
+ # pair 2 : 1st read rev, 2nd read fw
+ ## fw read
+ read_fw2.query_name = "read_fw2_%d" % n
+ read_fw2.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+ read_fw2.flag = flag_fw2
+ read_fw2.reference_id = ref_id
+ read_fw2.reference_start = frag_start
+ read_fw2.mapping_quality = 20
+ # read_fw2.cigar = ((0,10), (2,1), (0,25))
+ read_fw2.next_reference_id = ref_id
+ read_fw2.next_reference_start = start_rv
+ read_fw2.template_length = frag_len
+ read_fw2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ read_fw2.tags = extra_tags
+
+ ## rev read
+ read_rv2.query_name = "read_rv2_%d" % n
+ read_rv2.query_sequence="TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+ read_rv2.flag = flag_rv2
+ read_rv2.reference_id = ref_id
+ read_rv2.reference_start = start_rv
+ read_rv2.mapping_quality = 20
+ # read_rv2.cigar = ((0,10), (2,1), (0,25))
+ read_rv2.next_reference_id = ref_id
+ read_rv2.next_reference_start = frag_start
+ read_rv2.template_length = -frag_len
+ read_rv2.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+ read_rv2.tags = extra_tags
+ create_read_pair.counter += 1
+
+ return ((read_fw1, read_rv1), (read_fw2, read_rv2))
+
+create_read_pair.counter = 0
+
+if __name__ == "__main__":
+    # file header, the genome will have 2 chromosomes
+    header = { 'HD': {'VN': '1.0',
+                      'SO': 'unsorted'},
+               'SQ': [{'LN': 2000, 'SN': 'chr1'},    # chrom index 0
+                      {'LN': 2000, 'SN': 'chr2'}] }  # chrom index 1
+
+    file_out = os.path.join("data", "toy_data", "sc_reads.bam")
+
+    chromosomes = [0, 1]
+
+    # leftmost position of each fragment, one entry per fragment
+    read_fw_starts = (400, 470, 560, 560, 610, 670, 730, 770, 950, 960, \
+                      1010, 1060, 1070, 1350, 1360, 1410, 1500, 1600)
+
+    # length of each fragment, paired by index with read_fw_starts
+    frag_lengths = (80, 80, 240, 80, 80, 80, 80, 80, 200, 80, \
+                    80, 80, 80, 80, 80, 80, 100, 100)
+
+    # the context manager closes the BAM file even if a write fails
+    with pysam.AlignmentFile(file_out, header=header, mode="wb") as f_out:
+        for chrom in chromosomes:
+            for read_fw_start, frag_len in zip(read_fw_starts, frag_lengths):
+                # the same fragment twice : (fw, rv) with fw 1st, (fw, rv) with rv 1st
+                pair_1, pair_2 = create_read_pair(read_fw_start, chrom, frag_len)
+
+                f_out.write(pair_1[0])
+                f_out.write(pair_1[1])
+                f_out.write(pair_2[0])
+                f_out.write(pair_2[1])
+
+# read_fw, read_rev = create_read_pair(0, 0, 100)
+# f_out.write(read_fw)
+# f_out.write(read_rev)
+# f_out.close()
+
+
+# read_fw = pysam.AlignedSegment()
+# read_fw.query_name = "read_1"
+# read_fw.query_sequence="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+# read_fw.flag = 99
+# read_fw.reference_id = 0
+# read_fw.reference_start = 32
+# read_fw.mapping_quality = 20
+# # read_fw.cigar = ((0,10), (2,1), (0,25))
+# read_fw.next_reference_id = 0
+# read_fw.next_reference_start=199
+# read_fw.template_length=167
+# read_fw.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+# read_fw.tags = (("NM", 1),("RG", "L1"))
+# f_out.write(read_fw)
+
+# read_rv = pysam.AlignedSegment()
+# read_rv.query_name = "read_1"
+# read_rv.query_sequence= "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
+# read_rv.flag = 147
+# read_rv.reference_id = 0
+# read_rv.reference_start = 199
+# read_rv.mapping_quality = 20
+# read_fw.cigar = ((0,10), (2,1), (0,25))
+# read_rv.next_reference_id = 0
+# read_rv.next_reference_start=32
+# read_rv.template_length=167
+# read_rv.query_qualities = pysam.qualitystring_to_array("<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
+# read_rv.tags = (("NM", 1),("RG", "L1"))
+# f_out.write(read_rv)
diff --git a/scripts/toy_data/generate_bed_file.py b/scripts/toy_data/generate_bed_file.py
new file mode 100755
index 0000000..135e1f8
--- /dev/null
+++ b/scripts/toy_data/generate_bed_file.py
@@ -0,0 +1,11 @@
+# simply generate a bed file with two regions, one on each of two chromosomes
+
+import os
+
+file_bed = os.path.join("data", "toy_data", "peaks.bed")
+
+# write the peaks : the same 990-1010 region on chr1 and on chr2,
+# one tab separated "chromosome start end" record per line
+with open(file_bed, "wt") as f_out:
+ f_out.write("chr1\t990\t1010\n")
+ f_out.write("chr2\t990\t1010\n")
diff --git a/scripts/simulate_chipseq_data/simulate_data_chipseq.R b/scripts/toy_data/generate_matrix_data_chipseq.R
similarity index 98%
rename from scripts/simulate_chipseq_data/simulate_data_chipseq.R
rename to scripts/toy_data/generate_matrix_data_chipseq.R
index 082cf0a..c6447e1 100755
--- a/scripts/simulate_chipseq_data/simulate_data_chipseq.R
+++ b/scripts/toy_data/generate_matrix_data_chipseq.R
@@ -1,133 +1,133 @@
#' \brief This function
#'
generate_data_chipseq = function(n_col, shift_max, p_flip, p_noise, coverage, shape_list, nrows_list)
{ # number of datum to generate
n_row = sum(unlist(nrows_list))
# data structure to store results
data = matrix(0, nrow=n_row, ncol=n_col)
shifts = vector(length=n_row, mode="numeric")
flips = vector(length=n_row, mode="numeric")
classes = vector(length=n_row, mode="numeric")
shapes = matrix(0, nrow=length(shape_list), ncol=n_col)
# the proportion of reads which are signal
p_signal = 1 - p_noise
# noise : a uniform distribution
x = 1:n_col
shape_noise = dunif(x, min=min(x), max=max(x))
i_tot = 1
for(k in 1:length(shape_list))
{
l = length(shape_list[[k]])
from_s = floor((n_col-l)/2)
to_s = from_s + l - 1
for(i in 1:nrows_list[[k]])
{
# shift
shifts[i_tot] = ceiling(runif(1,1,shift_max))
# flip
flips[i_tot] = rbinom(1, 1, p_flip)
# class
classes[i_tot] = k
from = shifts[i_tot]
to = shifts[i_tot] + length(shape_list[[k]]) - 1
# construct shape given shift and flip
# ensure min value equal to min in shape
shape = rep(min(shape_list[[k]]), n_col) # flat, only lowest possible value
if(flips[i_tot])
{ shape[from:to] = rev(shape_list[[k]])
tmp = from
from = to
to = tmp
} else {
shape[from:to] = shape_list[[k]]
}
shape = shape*coverage*p_signal + shape_noise*coverage*p_noise # scale to right coverage and add noise
# sample reads from shape
data[i_tot,] = rpois(n_col, shape)
shapes[k,from_s:to_s] = shapes[k,from_s:to_s] + ( data[i_tot, from:to] / nrows_list[[k]])
i_tot = i_tot + 1
}
}
return(list("data"=data, "shifts"=shifts, "flips"=flips, "classes"=classes, "shapes"=shapes))
}
setwd(file.path("/", "local", "groux", "scATAC-seq"))
seed = 20170426
-dir_data = file.path("data", "simulated_chipseq_data")
+dir_data = file.path("data", "toy_data")
dir.create(dir_data, showWarnings=F)
if(!file.exists(dir_data))
{ dir.create(dir_data) }
# general parameter
n_samples = 1000
n_col = 2001 # the length of a signal vector
shift_max = 100 # the maximum possible shift
p_flip = 0.3 # the prob of having a flipped signal
# class 1 : a simple gaussian
class1_n = 600
class1_m = ceiling(n_col/2) - ceiling(shift_max/2) # class 1 mean, mean will be in average in the middle of the data vector
class1_s = 40 # class 1 sd
# the signal shape
shape1 = dnorm(1:(n_col-shift_max+1), class1_m, class1_s)
# class 2 : half a gaussian
class2_n = n_samples - class1_n
class2_m = floor(n_col/2) - floor(shift_max/2) # class 2 mean, mean will be in average in the middle of the data vector
class2_s = 40 # class 2 sd
# the signal shape
shape2 = dnorm(1:(n_col-shift_max+1), class2_m, class2_s)
shape2[class2_m:length(shape2)] = min(shape2)
# class 3 : a uniform
class3_n = 333
class3_from = floor(n_col/2) - floor(shift_max/2) -120 # class 3 from, mean will be in average in the middle of the data vector
class3_to = floor(n_col/2) - floor(shift_max/2) +120 # class 3 to, mean will be in average in the middle of the data vector
# the signal shape
shape3 = dunif(1:(n_col-shift_max+1), class3_from, class3_to)
# normalize
shape1 = shape1 / sum(shape1)
shape2 = shape2 / sum(shape2)
shape2 = shape2 / sum(shape2)
# sequencing coverage
# the mean number of read per sample
coverages = c(1, 10, 100)
# noise proportion
# in the end, the noise is added EVERYWHERE (also on the signal core)
# _
# /\ |
# | - | proportion of signal
# | - - |
# |_____- -__________ _
# | | proportion of noise
# ----------------------> _
noises = c(0)
for(p_noise in noises) # the proportion of reads which are noise
{ for(coverage in coverages)
{
# ----------------------------------------------------- data with 3 classes -----------------------------------------------------
set.seed(seed)
data = generate_data_chipseq(n_col, shift_max, p_flip, p_noise, coverage, list(shape1, shape2, shape3), list(class3_n, class3_n, class3_n+1))
# save
write.table(data$data, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F)
write.table(data$shifts, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_shifts_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F)
write.table(data$flips, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_flips_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F)
write.table(data$classes, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_classes_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F)
write.table(data$shapes, file=file.path(dir_data, sprintf("simulated_data_3_class_asym_shapes_cov%d_noise%.1f.txt", coverage, p_noise)), row.names=F, col.names=F, quote=F)
# clean
data = shifts = flips = classes = shapes = NULL
}
}
diff --git a/scripts/toy_data/run_all.sh b/scripts/toy_data/run_all.sh
new file mode 100755
index 0000000..c8adfba
--- /dev/null
+++ b/scripts/toy_data/run_all.sh
@@ -0,0 +1,10 @@
+mkdir -p data/toy_data
+# generate the ChIP-seq matrix data
+Rscript scripts/toy_data/generate_matrix_data_chipseq.R
+# generate peaks
+python3.6 scripts/toy_data/generate_bed_file.py
+# generate reads, then sort the BAM file in place and index it
+python3.6 scripts/toy_data/generate_bam_file.py
+samtools sort data/toy_data/sc_reads.bam > data/toy_data/sc_reads_sort.bam
+mv data/toy_data/sc_reads_sort.bam data/toy_data/sc_reads.bam
+samtools index data/toy_data/sc_reads.bam
diff --git a/src/Applications/CorrelationMatrixCreatorApplication.cpp b/src/Applications/CorrelationMatrixCreatorApplication.cpp
new file mode 100644
index 0000000..e9fab0a
--- /dev/null
+++ b/src/Applications/CorrelationMatrixCreatorApplication.cpp
@@ -0,0 +1,186 @@
+
+#include
+#include // MatrixCreator::methods
+#include
+
+#include
+#include
+#include
+#include // std::invalid_argument
+
+
+namespace po = boost::program_options ;
+
+// the valid values for --method option
+std::string method_read = "read" ;
+std::string method_read_atac = "read_atac" ;
+std::string method_fragment = "fragment" ;
+std::string method_fragment_center = "fragment_center" ;
+
+
+CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv)
+ : file_bed(""), file_bam(""), from(0), to(0), bin_size(0),
+ method(MatrixCreator::FRAGMENT), runnable(true)
+{
+ // parse the command line options and overwrite the default field values set above, throws std::invalid_argument on invalid options
+ this->parseOptions(argn, argv) ;
+}
+
+int CorrelationMatrixCreatorApplication::run()
+{ if(this->runnable)
+ { CorrelationMatrixCreator mc(this->file_bed,
+ this->file_bam,
+ this->file_bai,
+ this->from,
+ this->to,
+ this->bin_size,
+ this->method) ;
+ // compute the matrix and print it on stdout
+ std::cout << mc.create_matrix() << std::endl ;
+ return EXIT_SUCCESS ;
+ }
+ else // not runnable, parseOptions() clears the flag when the help is requested
+ { return EXIT_FAILURE ; }
+}
+
+void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv)
+{
+    // no option to parse
+    if(argv == nullptr)
+    {   std::string message = "no options to parse!" ;
+        throw std::invalid_argument(message) ;
+    }
+
+    // help messages displayed by --help
+    std::string desc_msg = "\n"
+                           "CorrelationMatrixCreator is an application that creates a "
+                           "count matrix from a BED file and a BAM file and returnes it "
+                           "through stdout.\n"
+                           "The matrix contains one row per reference region present in the "
+                           "BED file. The region center is computed and then a region covering the "
+                           "interval [from,to] is build around the middle and divided into "
+                           "equally sized bins. Finally, each bin is assigned the number of "
+                           "target present in the BAM file that are mapped at that position.\n\n" ;
+    std::string opt_help_msg = "Produces this help message." ;
+    std::string opt_bed_msg = "The path to the BED file containing the references";
+    std::string opt_bam_msg = "The path to the BAM file containing the targets";
+    std::string opt_bai_msg = "The path to the BAM index file of the BAM file containing the targets";
+    std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build "
+                               "around each reference center." ;
+    std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build "
+                             "around each reference center." ;
+    std::string opt_binsize_msg = "The size of the bins." ;
+    char tmp[4096] ;
+    sprintf(tmp,
+            "How the data in the BAM file should be handled when computing "
+            "the number of counts in each bin.\n"
+            "\t\"%s\" uses each position within the reads (by default)\n"
+            "\t\"%s\" uses only the insertion site for ATAC-seq data\n"
+            "\t\"%s\" uses each position within the fragments\n"
+            "\t\"%s\" uses only the fragment central positions\n",
+            method_read.c_str(),
+            method_read_atac.c_str(),
+            method_fragment.c_str(),
+            method_fragment_center.c_str()) ;
+
+    std::string opt_method_msg = tmp ;
+
+    // option parser
+    boost::program_options::variables_map vm ;
+    boost::program_options::options_description desc(desc_msg) ;
+
+    std::string method(method_read) ;
+
+    desc.add_options()
+        ("help,h", opt_help_msg.c_str())
+
+        ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str())
+        ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str())
+        ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str())
+
+        ("from,f", po::value(&(this->from)), opt_from_msg.c_str())
+        ("to,t", po::value(&(this->to)), opt_to_msg.c_str())
+        ("binSize,b", po::value(&(this->bin_size)), opt_binsize_msg.c_str())
+        ("method,m", po::value(&(method)), opt_method_msg.c_str()) ;
+
+    // parse, any failure is reported as a std::invalid_argument
+    try
+    {   po::store(po::parse_command_line(argn, argv, desc), vm) ;
+        po::notify(vm) ;
+    }
+    catch(std::invalid_argument& e)
+    {   std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
+        throw std::invalid_argument(msg) ;
+    }
+    catch(...)
+    {   throw std::invalid_argument("An unknown error occured while parsing the options") ; }
+
+    bool help = vm.count("help") ;
+
+    // checks improper option settings, the file/range/bin checks are skipped when the help is requested
+    if(this->file_bed == "" and (not help))
+    {   std::string msg("Error! No BED file was given (--bed)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->file_bam == "" and (not help))
+    {   std::string msg("Error! No BAM file was given (--bam)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->file_bai == "" and (not help))
+    {   std::string msg("Error! No BAM index file was given (--bai)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->from == 0 and this->to == 0 and (not help))
+    {   std::string msg("Error! No range given (--from and --to)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->from >= this->to and (not help))
+    {   std::string msg("Error! from shoud be smaller than to (--from and --to)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->bin_size <= 0 and (not help))
+    {   std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(method != method_read and
+            method != method_read_atac and
+            method != method_fragment and
+            method != method_fragment_center)
+    {   char msg[4096] ;
+        sprintf(msg, "Error! method should be %s, %s, %s or %s (--method)",
+                method_read.c_str(),
+                method_read_atac.c_str(),
+                method_fragment.c_str(),
+                method_fragment_center.c_str()) ;
+        throw std::invalid_argument(msg) ;
+    }
+
+    // set method, converts the validated option string into the enum value
+    if(method == method_read)
+    {   this->method = MatrixCreator::READ ; }
+    else if(method == method_read_atac)
+    {   this->method = MatrixCreator::READ_ATAC ; }
+    else if(method == method_fragment)
+    {   this->method = MatrixCreator::FRAGMENT ; }
+    else if(method == method_fragment_center)
+    {   this->method = MatrixCreator::FRAGMENT_CENTER ; }
+
+    // help invoked, run() cannot be invoked
+    if(help)
+    {   std::cout << desc << std::endl ;
+        this->runnable = false ;
+        return ;
+    }
+    // everything fine, run() can be called
+    else
+    {   this->runnable = true ;
+        return ;
+    }
+}
+
+
+int main(int argn, char** argv)
+{ CorrelationMatrixCreatorApplication app(argn, argv) ;
+ return app.run() ;
+}
+
diff --git a/src/Applications/CorrelationMatrixCreatorApplication.hpp b/src/Applications/CorrelationMatrixCreatorApplication.hpp
new file mode 100644
index 0000000..9a5cd1a
--- /dev/null
+++ b/src/Applications/CorrelationMatrixCreatorApplication.hpp
@@ -0,0 +1,100 @@
+#ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP
+#define CORRELATIONMATRIXCREATORAPPLICATION_HPP
+
+#include
+#include // MatrixCreator::methods
+
+#include
+
+/*!
+ * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a
+ * CorrelationMatrixCreator instance creating an autonomous application to
+ * compute a count matrix from a BAM file by directly passing all the options
+ * and parameters from the command line.
+ */
+class CorrelationMatrixCreatorApplication: public ApplicationInterface
+{
+ public:
+ CorrelationMatrixCreatorApplication() = delete ;
+ CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ;
+ /*!
+ * \brief Constructs an object from the command line
+ * options.
+ * \param argn the number of options passed to the
+ * main() function.
+ * \param argv the vector of options passed to the
+ * main() function.
+ */
+ CorrelationMatrixCreatorApplication(int argn, char** argv) ;
+
+ /*!
+ * \brief Runs the application. If the object is runnable,
+ * a CorrelationMatrixCreator is constructed from the parsed
+ * options and the matrix it creates is printed on the stdout.
+ * If the object is not runnable (the help was requested),
+ * nothing is computed and EXIT_FAILURE is returned.
+ * \return an exit code EXIT_SUCCESS or EXIT_FAILURE
+ * to return to the OS.
+ */
+ virtual int run() override ;
+
+ private:
+ /*!
+ * \brief Parses the program command line options and
+ * sets the object field accordingly.
+ * If the help option is detected, the "runnable"
+ * field is set to false and subsequent calls to
+ * run() will return EXIT_FAILURE without computing anything.
+ * \param argn the number of options passed to the
+ * main() function.
+ * \param argv the vector of options passed to the
+ * main() function.
+ * \throw std::invalid_argument if an error is found
+ * in the program options.
+ */
+ void parseOptions(int argn, char** argv) ;
+
+ /*!
+ * \brief the path to the bed file.
+ */
+ std::string file_bed ;
+ /*!
+ * \brief the path to the bam file.
+ */
+ std::string file_bam ;
+ /*!
+ * \brief the path to the bam index file.
+ */
+ std::string file_bai ;
+ /*!
+ * \brief a relative coordinate indicating the
+ * most upstream position to consider around
+ * each region in the bed file (--from).
+ */
+ int from ;
+ /*!
+ * \brief a relative coordinate indicating the
+ * most downstream position to consider around
+ * each region in the bed file (--to).
+ */
+ int to ;
+ /*!
+ * \brief the size of the bin that will be used
+ * to bin the signal in the regions [from,to] around
+ * each region in the bed file.
+ */
+ int bin_size ;
+ /*!
+ * \brief How to consider the sequenced fragments when computing
+ * the bin values.
+ */
+ MatrixCreator::methods method ;
+ /*!
+ * \brief a flag indicating whether the core of run() can be
+ * run or not.
+ */
+ bool runnable ;
+} ;
+
+
+#endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP
diff --git a/src/Applications/CorrelationMatrixCreatorApplicationParallel.cpp b/src/Applications/CorrelationMatrixCreatorApplicationParallel.cpp
new file mode 100644
index 0000000..804d785
--- /dev/null
+++ b/src/Applications/CorrelationMatrixCreatorApplicationParallel.cpp
@@ -0,0 +1,193 @@
+
+#include
+#include // MatrixCreator::methods
+#include
+
+#include
+#include
+#include
+#include // std::invalid_argument
+
+
+namespace po = boost::program_options ;
+
+// the valid values for --method option
+std::string method_read = "read" ;
+std::string method_read_atac = "read_atac" ;
+std::string method_fragment = "fragment" ;
+std::string method_fragment_center = "fragment_center" ;
+
+
+CorrelationMatrixCreatorApplication::CorrelationMatrixCreatorApplication(int argn, char** argv)
+ : file_bed(""), file_bam(""), from(0), to(0), bin_size(0),
+ method(MatrixCreator::FRAGMENT), runnable(true), n_threads(1)
+{
+ // parse the command line options and overwrite the default field values set above, throws std::invalid_argument on invalid options
+ this->parseOptions(argn, argv) ;
+}
+
+int CorrelationMatrixCreatorApplication::run()
+{ if(this->runnable)
+ { CorrelationMatrixCreator mc(this->file_bed,
+ this->file_bam,
+ this->file_bai,
+ this->from,
+ this->to,
+ this->bin_size,
+ this->method,
+ this->n_threads) ;
+ // compute the matrix and print it on stdout
+ std::cout << mc.create_matrix() << std::endl ;
+ return EXIT_SUCCESS ;
+ }
+ else // not runnable, parseOptions() clears the flag when the help is requested
+ { return EXIT_FAILURE ; }
+}
+
+void CorrelationMatrixCreatorApplication::parseOptions(int argn, char** argv)
+{
+    // no option to parse
+    if(argv == nullptr)
+    {   std::string message = "no options to parse!" ;
+        throw std::invalid_argument(message) ;
+    }
+
+    // help messages displayed by --help
+    std::string desc_msg = "\n"
+                           "CorrelationMatrixCreator is an application that creates a "
+                           "count matrix from a BED file and a BAM file and returnes it "
+                           "through stdout.\n"
+                           "The matrix contains one row per reference region present in the "
+                           "BED file. The region center is computed and then a region covering the "
+                           "interval [from,to] is build around the middle and divided into "
+                           "equally sized bins. Finally, each bin is assigned the number of "
+                           "target present in the BAM file that are mapped at that position.\n\n" ;
+    std::string opt_help_msg = "Produces this help message." ;
+    std::string opt_bed_msg = "The path to the BED file containing the references";
+    std::string opt_bam_msg = "The path to the BAM file containing the targets";
+    std::string opt_bai_msg = "The path to the BAM index file of the BAM file containing the targets";
+    std::string opt_from_msg = "The upstream limit - in relative coordinate - of the region to build "
+                               "around each reference center." ;
+    std::string opt_to_msg = "The downstream limit - in relative coordinate - of the region to build "
+                             "around each reference center." ;
+    std::string opt_thread_msg = "The number of threads to use." ;
+    std::string opt_binsize_msg = "The size of the bins." ;
+    char tmp[4096] ;
+    sprintf(tmp,
+            "How the data in the BAM file should be handled when computing "
+            "the number of counts in each bin.\n"
+            "\t\"%s\" uses each position within the reads (by default)\n"
+            "\t\"%s\" uses only the insertion site for ATAC-seq data\n"
+            "\t\"%s\" uses each position within the fragments\n"
+            "\t\"%s\" uses only the fragment central positions\n",
+            method_read.c_str(),
+            method_read_atac.c_str(),
+            method_fragment.c_str(),
+            method_fragment_center.c_str()) ;
+
+    std::string opt_method_msg = tmp ;
+
+    // option parser
+    boost::program_options::variables_map vm ;
+    boost::program_options::options_description desc(desc_msg) ;
+
+    std::string method(method_read) ;
+
+    desc.add_options()
+        ("help,h", opt_help_msg.c_str())
+
+        ("bed", po::value(&(this->file_bed)), opt_bed_msg.c_str())
+        ("bam", po::value(&(this->file_bam)), opt_bam_msg.c_str())
+        ("bai", po::value(&(this->file_bai)), opt_bai_msg.c_str())
+
+        ("from,f", po::value(&(this->from)), opt_from_msg.c_str())
+        ("to,t", po::value(&(this->to)), opt_to_msg.c_str())
+        ("binSize,b", po::value(&(this->bin_size)), opt_binsize_msg.c_str())
+        ("method,m", po::value(&(method)), opt_method_msg.c_str())
+        ("parallel,p", po::value(&(this->n_threads)), opt_thread_msg.c_str()) ;
+
+    // parse, any failure is reported as a std::invalid_argument
+    try
+    {   po::store(po::parse_command_line(argn, argv, desc), vm) ;
+        po::notify(vm) ;
+    }
+    catch(std::invalid_argument& e)
+    {   std::string msg = std::string("Error! Invalid option given!\n") + std::string(e.what()) ;
+        throw std::invalid_argument(msg) ;
+    }
+    catch(...)
+    {   throw std::invalid_argument("An unknown error occured while parsing the options") ; }
+
+    bool help = vm.count("help") ;
+
+    // checks improper option settings, the file/range/bin checks are skipped when the help is requested
+    if(this->file_bed == "" and (not help))
+    {   std::string msg("Error! No BED file was given (--bed)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->file_bam == "" and (not help))
+    {   std::string msg("Error! No BAM file was given (--bam)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->file_bai == "" and (not help))
+    {   std::string msg("Error! No BAM index file was given (--bai)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->from == 0 and this->to == 0 and (not help))
+    {   std::string msg("Error! No range given (--from and --to)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->from >= this->to and (not help))
+    {   std::string msg("Error! from shoud be smaller than to (--from and --to)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->bin_size <= 0 and (not help))
+    {   std::string msg("Error! bin size should be bigger than 0 (--binSize)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(method != method_read and
+            method != method_read_atac and
+            method != method_fragment and
+            method != method_fragment_center)
+    {   char msg[4096] ;
+        sprintf(msg, "Error! method should be %s, %s, %s or %s (--method)",
+                method_read.c_str(),
+                method_read_atac.c_str(),
+                method_fragment.c_str(),
+                method_fragment_center.c_str()) ;
+        throw std::invalid_argument(msg) ;
+    }
+    else if(this->n_threads == 0)
+    {   std::string msg("Error! at least one thread should be used (--parallel)!") ;
+        throw std::invalid_argument(msg) ;
+    }
+
+    // set method, converts the validated option string into the enum value
+    if(method == method_read)
+    {   this->method = MatrixCreator::READ ; }
+    else if(method == method_read_atac)
+    {   this->method = MatrixCreator::READ_ATAC ; }
+    else if(method == method_fragment)
+    {   this->method = MatrixCreator::FRAGMENT ; }
+    else if(method == method_fragment_center)
+    {   this->method = MatrixCreator::FRAGMENT_CENTER ; }
+
+    // help invoked, run() cannot be invoked
+    if(help)
+    {   std::cout << desc << std::endl ;
+        this->runnable = false ;
+        return ;
+    }
+    // everything fine, run() can be called
+    else
+    {   this->runnable = true ;
+        return ;
+    }
+}
+
+
+int main(int argn, char** argv)
+{ CorrelationMatrixCreatorApplication app(argn, argv) ;
+ return app.run() ;
+}
+
diff --git a/src/Applications/CorrelationMatrixCreatorApplicationParallel.hpp b/src/Applications/CorrelationMatrixCreatorApplicationParallel.hpp
new file mode 100644
index 0000000..a362db7
--- /dev/null
+++ b/src/Applications/CorrelationMatrixCreatorApplicationParallel.hpp
@@ -0,0 +1,104 @@
+#ifndef CORRELATIONMATRIXCREATORAPPLICATION_HPP
+#define CORRELATIONMATRIXCREATORAPPLICATION_HPP
+
+#include
+#include // MatrixCreator::methods
+
+#include
+
+/*!
+ * \brief The CorrelationMatrixCreatorApplication class is a wrapper around a
+ * RegionMatrixCreator instance creating an autonomous application to
+ * compute a count matrix from a BAM file by directly passing all the options
+ * and parameters from the command line.
+ * NOTE(review): the include guard of this header
+ * (CORRELATIONMATRIXCREATORAPPLICATION_HPP) does not carry the "Parallel"
+ * suffix of the file name. If another header uses the same guard, the two
+ * cannot be included in the same translation unit - confirm.
+ */
+class CorrelationMatrixCreatorApplication: public ApplicationInterface
+{
+ public:
+ CorrelationMatrixCreatorApplication() = delete ;
+ CorrelationMatrixCreatorApplication(const CorrelationMatrixCreatorApplication& app) = delete ;
+ /*!
+ * \brief Constructs an object from the command line
+ * options.
+ * \param argn the number of options passed to the
+ * main() function.
+ * \param argv the vector of options passed to the
+ * main() function.
+ */
+ CorrelationMatrixCreatorApplication(int argn, char** argv) ;
+
+ /*!
+ * \brief Runs the application.
+ * NOTE(review): the previous description (the data are
+ * classified and a 4D posterior probability matrix with
+ * dimensions regions, class, shift, flip is returned through
+ * the stdout) does not match the class brief, which describes
+ * the computation of a count matrix from a BAM file. It looks
+ * copied from another application; confirm the actual output
+ * against the implementation of run().
+ * \return an exit code EXIT_SUCCESS or EXIT_FAILURE
+ * to return to the OS.
+ */
+ virtual int run() override ;
+
+ private:
+ /*!
+ * \brief Parses the program command line options and
+ * sets the object field accordingly.
+ * If the help option is detected, the "runnable"
+ * field is set to false and subsequent calls to
+ * run() will produce nothing.
+ * \param argn the number of options passed to the
+ * main() function.
+ * \param argv the vector of options passed to the
+ * main() function.
+ * \throw std::invalid_argument if an error is found
+ * in the program options.
+ */
+ void parseOptions(int argn, char** argv) ;
+
+ /*!
+ * \brief the path to the bed file.
+ */
+ std::string file_bed ;
+ /*!
+ * \brief the path to the bam file.
+ */
+ std::string file_bam ;
+ /*!
+ * \brief the path to the bam index file.
+ */
+ std::string file_bai ;
+ /*!
+ * \brief a relative coordinate indicating the
+ * lower bound of the interval to consider around
+ * each region in the bed file, i.e. the most
+ * upstream position. The option checks reject
+ * from >= to.
+ */
+ int from ;
+ /*!
+ * \brief a relative coordinate indicating the
+ * upper bound of the interval to consider around
+ * each region in the bed file, i.e. the most
+ * downstream position. The option checks reject
+ * from >= to.
+ */
+ int to ;
+ /*!
+ * \brief the size of the bin that will be used
+ * to bin the signal in the regions [from,to] around
+ * each region in the bed file. The option checks
+ * reject values <= 0.
+ */
+ int bin_size ;
+ /*!
+ * \brief How to consider the sequenced fragments when computing
+ * the bin values. Set by the option parsing to one of
+ * MatrixCreator::READ, READ_ATAC, FRAGMENT or FRAGMENT_CENTER.
+ */
+ MatrixCreator::methods method ;
+ /*!
+ * \brief a flag indicating whether the core of run() can be
+ * run or not.
+ */
+ bool runnable ;
+ /*!
+ * \brief the number of threads to use. The option checks
+ * reject 0.
+ */
+ size_t n_threads ;
+} ;
+
+
+#endif // CORRELATIONMATRIXCREATORAPPLICATION_HPP
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index daa0e79..78a92e2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,53 +1,86 @@
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
+# compiler options
+add_compile_options(-std=c++14)
+add_compile_options(-O3)
+add_compile_options(-Wall)
+add_compile_options(-Wextra)
+add_compile_options(-Werror)
+add_compile_options(-Wfatal-errors)
+add_compile_options(-pedantic)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}")
+add_definitions (${SEQAN_DEFINITIONS})
# include file location
+include_directories (${SEQAN_INCLUDE_DIRS})
include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
include_directories("${scATACseq_SOURCE_DIR}/src/Clustering")
include_directories("${scATACseq_SOURCE_DIR}/src/Random")
include_directories("${scATACseq_SOURCE_DIR}/src/Parallel")
include_directories("${scATACseq_SOURCE_DIR}/src/Statistics")
include_directories("${scATACseq_SOURCE_DIR}/src/GUI")
include_directories("${scATACseq_SOURCE_DIR}/src/Applications")
include_directories("${scATACseq_SOURCE_DIR}/src/Matrix")
+include_directories("${scATACseq_SOURCE_DIR}/src/GenomicTools")
# compile modules into static libraries
-add_library(Clustering "Clustering/ClusteringEngine.cpp" "Clustering/EMEngine.cpp" "Clustering/ReferenceComputer.cpp")
-add_library(Random "Random/Random.cpp" "Random/RandomNumberGenerator.cpp")
-add_library(Parallel "Parallel/ThreadPool.cpp")
-add_library(Statistics "Statistics/Statistics.cpp")
-add_library(GUI "GUI/ConsoleProgressBar.cpp" "GUI/Diplayable.cpp" "GUI/Updatable.cpp")
+## set output directory
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/lib")
+## build instructions
+add_library(Clustering "Clustering/ClusteringEngine.cpp"
+ "Clustering/EMEngine.cpp"
+ "Clustering/ReferenceComputer.cpp")
+add_library(Random "Random/Random.cpp"
+ "Random/RandomNumberGenerator.cpp")
+add_library(Parallel "Parallel/ThreadPool.cpp")
+add_library(Statistics "Statistics/Statistics.cpp")
+add_library(GUI "GUI/ConsoleProgressBar.cpp"
+ "GUI/Diplayable.cpp"
+ "GUI/Updatable.cpp")
+add_library(GenomicTools "GenomicTools/MatrixCreator.cpp"
+ "GenomicTools/CorrelationMatrixCreator.cpp"
+ "GenomicTools/GenomeRegion.cpp")
-link_directories("${scATACseq_SOURCE_DIR}/src/Clustering")
-link_directories("${scATACseq_SOURCE_DIR}/src/Random")
-link_directories("${scATACseq_SOURCE_DIR}/src/Statistics")
-link_directories("${scATACseq_SOURCE_DIR}/src/GUI")
-link_directories("${scATACseq_SOURCE_DIR}/src/Parallel")
-
-
-# linking modules to resolve dependencies
+## resolve dependencies
target_link_libraries(Clustering Random Statistics GUI Parallel)
target_link_libraries(Parallel Threads::Threads)
+target_link_libraries(GenomicTools ${SEQAN_LIBRARIES})
# executables
-## a toy
-set(EXE_MAIN "main")
-add_executable(${EXE_MAIN} "main.cpp")
-target_link_libraries(${EXE_MAIN} Clustering)
-set_target_properties(${EXE_MAIN} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
+## a toy for SeqAn usage
+set(EXE_MAIN_SEQAN "main_seqan")
+add_executable(${EXE_MAIN_SEQAN} "main_seqan.cpp")
+target_link_libraries(${EXE_MAIN_SEQAN} ${SEQAN_LIBRARIES} GenomicTools Clustering)
+set_target_properties(${EXE_MAIN_SEQAN} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
+## a toy for correlation matrix
+set(EXE_MAIN_CORMAT "main_cormat")
+add_executable(${EXE_MAIN_CORMAT} "main_cormat.cpp")
+target_link_libraries(${EXE_MAIN_CORMAT} ${SEQAN_LIBRARIES} GenomicTools)
+set_target_properties(${EXE_MAIN_CORMAT} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
+## a toy for EM usage
+set(EXE_MAIN_EM "main_em")
+add_executable(${EXE_MAIN_EM} "main_em.cpp")
+target_link_libraries(${EXE_MAIN_EM} Clustering)
+set_target_properties(${EXE_MAIN_EM} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
+## an application to create a matrix from BED and a BAM file
+set(EXE_MAIN_BAMMATRIX "CorrelationMatrixCreator")
+add_executable(${EXE_MAIN_BAMMATRIX} "Applications/CorrelationMatrixCreatorApplication.cpp" "Applications/ApplicationInterface.cpp")
+target_link_libraries(${EXE_MAIN_BAMMATRIX} GenomicTools Boost::program_options)
+set_target_properties(${EXE_MAIN_BAMMATRIX} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## a ChIPPartitioning standalone
set(EXE_CHIPPART "ChIPPartitioning")
add_executable(${EXE_CHIPPART} "Applications/ChIPPartitioningApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_CHIPPART} Clustering Boost::program_options)
set_target_properties(${EXE_CHIPPART} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## an executable to compute classes references from the data and the post prob of ChIPPartitioning
set(EXE_PROB2REF "probToRef")
add_executable(${EXE_PROB2REF} "Applications/ProbToRefApplication.cpp" "Applications/ApplicationInterface.cpp")
target_link_libraries(${EXE_PROB2REF} Clustering Boost::program_options)
set_target_properties(${EXE_PROB2REF} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
## a test suite
set(EXE_TESTS "unittests")
-add_executable(${EXE_TESTS} "unittests.cpp" "Unittests/unittests_matrix.cpp")
-target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB})
+add_executable(${EXE_TESTS} "unittests.cpp"
+ "Unittests/unittests_matrix.cpp"
+ "Unittests/unittests_genomictools.cpp")
+target_link_libraries(${EXE_TESTS} ${UNITTEST_LIB} ${SEQAN_LIBRARIES} GenomicTools)
set_target_properties(${EXE_TESTS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${scATACseq_SOURCE_DIR}/bin")
diff --git a/src/Clustering/EMEngine.cpp b/src/Clustering/EMEngine.cpp
index c10ce53..bacf3e6 100644
--- a/src/Clustering/EMEngine.cpp
+++ b/src/Clustering/EMEngine.cpp
@@ -1,769 +1,808 @@
#include
#include
#include
#include
#include
#include // rand_int_uniform()
#include // getRandomNumberGenerator()
#include // poisson_pmf(), normal_pmf(), sd()
#include // ConsoleProgressBar
#include // ThreadPool
#include // log(), exp(), pow()
#include
#include // numeric_limits
#include // uniform_real, variate_generator
#include // future, promise
#include // move()
#include // bind(), ref()
// Constructs an engine from the data to classify and the clustering
// settings. The settings are stored, the random number generator is
// initialised from the seed and the data are copied element-wise into
// this->data.
// n_flip is flip+1 (1 state without flipping, 2 with flipping) and
// l_slice, the length of a reference, is n_col - n_shift + 1.
// NOTE(review): l_slice is computed with unsigned arithmetic; presumably
// callers guarantee n_shift <= n_col + 1, otherwise the value wraps
// around - confirm.
EMEngine::EMEngine(const Matrix2D& data,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
EMEngine::seeding_codes seeding,
const std::string& seed,
size_t n_threads)
: flip(flip), n_iter(n_iter), n_shift(n_shift), n_flip(flip+1), n_class(n_class),
n_row(data.get_nrow()), n_col(data.get_ncol()), l_slice(n_col - n_shift + 1),
seeding_method(seeding), n_threads(n_threads), threads(n_threads)
{
// initialise random number generator
getRandomGenerator(seed) ;
// copy the data
this->data = matrix2d_i(this->n_row, v_i(this->n_col)) ;
for(size_t i=0; in_row; i++)
{ for(size_t j=0; jn_col; j++)
{ this->data[i][j] = data(i,j) ; }
}
}
// Destructor: calls join() on the thread pool before the engine is
// destroyed.
EMEngine::~EMEngine()
{
    threads.join() ;
}
// Returns a copy of the class references as a matrix with n_class rows
// and l_slice columns (one row per class).
Matrix2D EMEngine::get_references() const
{
// zero-initialised result, filled element by element below
Matrix2D references(this->n_class, this->l_slice, 0.) ;
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jl_slice; j++)
{ references(i,j) = this->references[i][j] ; }
}
return references ;
}
// Returns a copy of the posterior probabilities as a 4D matrix with
// dimensions n_row x n_class x n_shift x n_flip.
Matrix4D EMEngine::get_posterior_prob() const
{ Matrix4D post_prob(this->n_row, this->n_class, this->n_shift, this->n_flip, 0.) ;
// element-wise copy : i row, k class, s shift state, f flip state
for(size_t i=0; in_row; i++)
{ for(size_t k=0; kn_class; k++)
{ for(size_t s=0; sn_shift; s++)
{ for(size_t f=0; fn_flip; f++)
{ post_prob(i,k,s,f) = this->post_prob[i][k][s][f] ; }
}
}
}
return post_prob ;
}
-
-double EMEngine::get_loglikelihood() const
+/*
+// this is the naive way, it is exact but results in Nan, -Nan, -Inf, +Inf
+// sometimes...
+double EMEngine::get_loglikelihood0() const
{
double ll = 0 ;
-
for(size_t i=0; in_row; i++)
{ double p_tmp = 0. ;
for(size_t j=0; jn_class; j++)
{ for(size_t s=0; sn_shift; s++)
{ // slice is [from_fw,to)
// from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw]
// fw |---------->>>----------|
// ----------------------------------> data
// rev |----------<<<----------| [from_dat_rev, to_dat_rev]
// to_dat_rev can be -1 -> int
// to_dat_rev from_dat_rev
// log likelihood
// --------------- forward ---------------
double lp_fw = 0. ;
int from_dat_fw = s ;
int to_dat_fw = from_dat_fw + this->l_slice - 1 ;
for(int j_dat_fw=from_dat_fw, j_ref_fw=0;
j_dat_fwdata[i][j_dat_fw],
+ double lp = std::max(log(poisson_pmf(this->data[i][j_dat_fw],
this->references[j][j_ref_fw]*
- this->window_mean[i][s]) ;
-
+ this->window_mean[i][s])),
+ EMEngine::p_min_log) ;
lp_fw += lp ;
- }
+
p_tmp += exp(lp_fw) * this->class_prob[j][s][flip_states::FORWARD] ;
// --------------- reverse ---------------
if(this->flip)
{ double lp_rev = 0. ;
int from_dat_rev = this->n_col - 1 - s ;
int to_dat_rev = from_dat_rev - (this->l_slice - 1) ;
int shift_rev = this->n_shift - s - 1 ;
for(int j_dat_rev=from_dat_rev, j_ref_fw=0;
j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++)
- { double lp = log(poisson_pmf(this->data[i][j_dat_rev],
+ { double lp = std::max(log(poisson_pmf(this->data[i][j_dat_rev],
this->references[j][j_ref_fw]*
- this->window_mean[i][shift_rev])) ;
+ this->window_mean[i][shift_rev])),
+ EMEngine::p_min_log) ;
lp_rev += lp ;
}
+
p_tmp += exp(lp_rev) * this->class_prob[j][s][flip_states::REVERSE] ;
}
}
}
ll += log(p_tmp) ;
}
return ll ;
}
+*/
-double EMEngine::get_aic() const
-{ double ll = this->get_loglikelihood() ;
- double n_param = ((double) this->n_class *
- (double)this->l_slice) +
- ((double)this->n_shift *
- (double)this->flip+1. *
- (double)this->n_class) - 1. ;
- return (2.*n_param) - (2.*ll) ;
-}
-
-/*
double EMEngine::get_loglikelihood() const
{
double ll = 0. ;
+ // compute all terms needed
for(size_t i=0; in_row; i++)
- { double l = 0. ;
+ { double prob_tmp = 0 ;
for(size_t j=0; jn_class; j++)
- { for(size_t s_fw=0, s_rev=this->n_shift-1;
- s_fwn_shift; s_fw++, s_rev--)
- { // slice is [from_fw,to)
+ { std::vector> v3 ;
+ for(size_t s=0; sn_shift; s++)
+ {
+ // slice is [from_fw,to)
// from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw]
// fw |---------->>>----------|
// ----------------------------------> data
// rev |----------<<<----------| [from_dat_rev, to_dat_rev]
// to_dat_rev can be -1 -> int
// to_dat_rev from_dat_rev
+ // log likelihood
// --------------- forward ---------------
- size_t from_dat_fw = s_fw ;
- size_t to_dat_fw = from_dat_fw + this->l_slice - 1 ;
- // --------------- reverse ---------------
- size_t from_dat_rev = this->n_col - 1 - s_fw ;
- // size_t to_dat_rev = from_dat_rev - (this->l_slice - 1) ;
-
- double ll_tmp = 0. ;
+ double lp_fw = 0. ;
+ int from_dat_fw = s ;
+ int to_dat_fw = from_dat_fw + this->l_slice - 1 ;
+ for(int j_dat_fw=from_dat_fw, j_ref_fw=0;
+ j_dat_fwdata[i][j_dat_fw],
+ this->references[j][j_ref_fw]*
+ this->window_mean[i][s]),
+ EMEngine::p_min)) ;
+ lp_fw += lp ;
+ }
+ double p_fw = this->class_prob[j][s][flip_states::FORWARD] ;
+ v3.push_back(std::make_pair(lp_fw, p_fw)) ;
- for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev;
- j_dat_fwdata[i][j_dat_fw],
- this->references[j][j_ref_fw]*
- this->window_mean[i][s_fw]);
- ll_tmp += log(std::max(p, EMEngine::p_min) *
- this->class_prob[j][s_fw][flip_states::FORWARD]) ;
- // --------------- reverse ---------------
- if(this->flip)
- { double p = poisson_pmf(this->data[i][j_dat_rev],
- this->references[j][j_ref_fw]*
- this->window_mean[i][s_rev]) ;
- ll_tmp += log(std::max(p, EMEngine::p_min) *
- this->class_prob[j][s_fw][flip_states::REVERSE]) ;
+ // --------------- reverse ---------------
+ if(this->flip)
+ { double lp_rev = 0. ;
+ int from_dat_rev = this->n_col - 1 - s ;
+ int to_dat_rev = from_dat_rev - (this->l_slice - 1) ;
+ int shift_rev = this->n_shift - s - 1 ;
+ for(int j_dat_rev=from_dat_rev, j_ref_fw=0;
+ j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++)
+ { double lp = log(std::max(poisson_pmf(this->data[i][j_dat_rev],
+ this->references[j][j_ref_fw]*
+ this->window_mean[i][shift_rev]),
+ EMEngine::p_min)) ;
+ lp_rev += lp ;
}
+ double p_rev = this->class_prob[j][s][flip_states::REVERSE] ;
+ v3.push_back(std::make_pair(lp_rev, p_rev)) ;
}
- l += ll_tmp ;
}
+ prob_tmp += sum_exp(v3) ;
}
- ll += l ;
+ ll += log(prob_tmp) ;
}
return ll ;
}
-*/
+
+/*!
+ * \brief Computes the Akaike information criterion of the current
+ * model, that is 2*n_param - 2*loglikelihood.
+ * The number of parameters is counted as n_class*l_slice reference
+ * values plus n_shift*n_flip*n_class class probabilities, minus 1.
+ * \return the AIC value.
+ */
+double EMEngine::get_aic() const
+{   double ll = this->get_loglikelihood() ;
+    // the number of flip states (n_flip = flip+1) must multiply the whole
+    // product. The former "(double)this->flip+1. * (double)this->n_class"
+    // was parsed as n_shift*flip + 1.*n_class because * binds tighter
+    // than +, which gave a wrong parameter count.
+    double n_param = ((double)this->n_class *
+                      (double)this->l_slice) +
+                     ((double)this->n_shift *
+                      (double)this->n_flip  *
+                      (double)this->n_class) - 1. ;
+    // std::cerr << "AIC = " << (2.*n_param) << " - " << ll << std::endl ;
+    return (2.*n_param) - (2.*ll) ;
+}
// Runs the clustering: allocates the working data structures, seeds the
// references and class probabilities with the configured seeding method,
// then performs n_iter rounds of E-step / M-step. Progress is reported on
// std::cerr through a console progress bar.
// Returns ClusteringEngine::exit_codes::SUCCESS.
ClusteringEngine::exit_codes EMEngine::cluster()
{ size_t bar_update_n = this->n_iter + 1 ;
ConsoleProgressBar bar(std::cerr, bar_update_n, 70, "clustering") ;
// construct all other required data structures
// mean number of reads per window
this->window_mean = matrix2d_d(this->n_row, v_d(this->n_shift, 0.)) ;
this->compute_window_means() ;
// the references
this->references = matrix2d_d(this->n_class,
v_d(this->l_slice, 0.)) ;
// log likelihood, one value per row, class, shift state and flip state
// NOTE(review): this revision changes the initial fill value from 0. to
// 9. ; compute_loglikelihood() is called in the loop below before
// compute_post_prob() reads the values, so presumably the fill value is
// never observed - confirm it is not a debugging leftover.
this->loglikelihood = matrix4d_d(this->n_row,
matrix3d_d(this->n_class,
matrix2d_d(this->n_shift,
- v_d(this->n_flip, 0.)))) ;
+ v_d(this->n_flip, 9.)))) ;
this->loglikelihood_max = v_d(this->n_row, 0.) ;
// posterior prob
this->post_prob = matrix4d_d(this->n_row,
matrix3d_d(this->n_class,
matrix2d_d(this->n_shift,
v_d(this->n_flip, 0.)))) ;
this->class_prob = matrix3d_d(this->n_class,
matrix2d_d(this->n_shift,
v_d(this->n_flip, 0.))) ;
this->class_prob_tot = v_d(this->n_class, 0.) ;
this->post_prob_row = v_d(this->n_row, 0.) ;
this->post_prob_class = v_d(this->n_class, 0.) ;
this->post_prob_tot = 0. ;
// seeding
this->seeding(this->seeding_method) ;
bar.update() ;
// optimize the partition
for(size_t n_iter=0; n_itern_iter; n_iter++)
{
// normalize the references such that the mean value, on each
// row, is 1
this->normalize_references() ;
// E-step
this->compute_loglikelihood() ;
this->compute_post_prob() ;
// M-step
this->compute_class_prob() ;
this->compute_references() ;
this->center_shifts() ;
+
bar.update() ;
}
bar.update() ; std::cerr << std::endl ;
return ClusteringEngine::exit_codes::SUCCESS ;
}
/*!
 * \brief Rescales each class reference such that its mean value is 1.
 * A reference whose mean is 0 is left untouched : dividing it by its
 * mean would fill it with NaN values which would then propagate to
 * every likelihood computed from it.
 */
void EMEngine::normalize_references()
{
    for(size_t i=0; i<this->n_class; i++)
    {   // mean value of the i-th reference
        double mean = 0. ;
        for(size_t j=0; j<this->l_slice; j++)
        {   mean += this->references[i][j] ; }
        mean /= this->l_slice ;
        // nothing to rescale, avoid a division by 0
        if(mean == 0.)
        {   continue ; }
        for(size_t j=0; j<this->l_slice; j++)
        {   this->references[i][j] /= mean ; }
    }
}
// Initialises the references and the class probabilities by dispatching
// to the routine implementing the requested seeding strategy. Any other
// value leaves the engine untouched.
void EMEngine::seeding(EMEngine::seeding_codes seeding)
{
    switch(seeding)
    {   case EMEngine::seeding_codes::RANDOM:
            this->seeding_random() ;
            break ;
        case EMEngine::seeding_codes::SAMPLING:
            this->seeding_sampling() ;
            break ;
        case EMEngine::seeding_codes::TOY:
            this->seeding_toy() ;
            break ;
        default:
            break ;
    }
}
// Random seeding: each row is given a random probability of belonging to
// each class, the references are initialised as the probability-weighted
// sum of the data (central window starting at column n_shift/2) and the
// class probabilities are set to a uniform distribution.
void EMEngine::seeding_random()
{
// get random values from a beta distribution cannot be done using boost so
// i) generate random number [0,1] x
// ii) compute f(x) where f is beta distribution
matrix2d_d prob(this->n_row, v_d(this->n_class, 0.)) ;
// NOTE(review): prob_class is accumulated and normalized below but is
// not read afterwards in this function.
v_d prob_class(this->n_class, 0.) ;
double tot_sum = 0. ;
// sample the prob
// beta distribution parameters
double alpha = pow(this->n_row, -0.5) ;
double beta = 1. ;
for(size_t i=0; in_row; i++)
{ double row_sum = 0. ;
for(size_t j=0; jn_class; j++)
{ double x = rand_real_uniform(0., 1.0) ;
// the value is bounded from below by p_min
double p = std::max(EMEngine::p_min, beta_pmf(x, alpha, beta)) ;
prob[i][j] = p ;
prob_class[j] += p ;
tot_sum += p ;
row_sum += p ;
}
// normalize
for(size_t j=0; jn_class; j++)
{ prob[i][j] /= row_sum ; }
}
// class prob
for(auto& p : prob_class)
{ p /= tot_sum ; }
// compute the references
for(size_t i=0; in_row; i++)
{ for(size_t j=0; jn_class; j++)
{ for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++)
{ this->references[j][j_ref] += (this->data[i][j_dat] * prob[i][j]) ; }
}
}
// normalize
// NOTE(review): the statement below only evaluates the element and
// discards it, so this loop changes nothing. Presumably the actual
// normalization is left to normalize_references(), which cluster() calls
// at the start of each iteration - confirm and remove this loop.
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jl_slice; j++)
{ this->references[i][j] ; }
}
// set the class probabilities to a uniform distribution
double sum = this->n_class * this->n_shift * this->n_flip ;
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jn_shift; j++)
{ for(size_t k=0; kn_flip; k++)
{ this->class_prob[i][j][k] = 1./sum ; }
}
}
}
// Sampling seeding: each class reference is initialised with the central
// window (starting at column n_shift/2) of a distinct, randomly drawn row
// of the data. The class probabilities are set to a uniform distribution.
// NOTE(review): the sampling loop only advances when a row that was not
// chosen yet is drawn. With more classes than rows (n_class > n_row) it
// can presumably never terminate - confirm that callers exclude this.
void EMEngine::seeding_sampling()
{
// sample data to initialise the references
std::vector choosen(this->n_row, false) ;
for(size_t i=0; in_class; )
{ size_t index = rand_int_uniform(size_t(0), size_t(this->n_row-1)) ;
// already chosen, draw again
if(choosen[index])
{ ; }
// not yet chosen as reference
else
{ for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++)
{ this->references[i][j_ref] = this->data[index][j_dat] ; }
choosen[index] = true ;
i++ ;
}
}
// set the class probabilities to a uniform distribution
double sum = this->n_class * this->n_shift * this->n_flip ;
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jn_shift; j++)
{ for(size_t k=0; kn_flip; k++)
{ this->class_prob[i][j][k] = 1. / sum ;
}
}
}
}
// Deterministic seeding: the i-th class reference is initialised with the
// central window (starting at column n_shift/2) of the i-th row of the
// data. The class probabilities are set to a uniform distribution.
// NOTE(review): rows 0 to n_class-1 are read, so this presumably requires
// n_class <= n_row - confirm that callers guarantee it.
void EMEngine::seeding_toy()
{
// use the first rows of the data to initialise the references
std::vector choosen(this->n_row, false) ;
for(size_t i=0; in_class; )
{ size_t index = i ;
// already chosen
// NOTE(review): index equals i and i only advances right after
// choosen[i] is set, so this branch looks unreachable.
if(choosen[index])
{ ; }
// not yet chosen as reference
else
{ for(size_t j_ref=0, j_dat=this->n_shift/2; j_refl_slice; j_ref++, j_dat++)
{ this->references[i][j_ref] = this->data[index][j_dat] ; }
choosen[index] = true ;
i++ ;
}
}
// set the class probabilities to a uniform distribution
double sum = this->n_class * this->n_shift * this->n_flip ;
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jn_shift; j++)
{ for(size_t k=0; kn_flip; k++)
{ this->class_prob[i][j][k] = 1./sum ; }
}
}
}
// Computes the window means in parallel: the rows [0,n_row) are split
// into n_threads slices, one compute_window_means_routine() job is added
// to the thread pool per slice and the call blocks until every job has
// fulfilled its promise.
void EMEngine::compute_window_means()
{ // compute the slices on which each thread will work
std::vector> slices =
ThreadPool::split_range(0, this->n_row, this->n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector> promises(this->n_threads) ;
std::vector> futures(this->n_threads) ;
for(size_t i=0; in_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; in_threads; i++)
{ auto slice = slices[i] ;
// the promise is passed by reference, it must outlive the job, which
// is the case because this function waits on all futures below
this->threads.addJob(std::move(
std::bind(&EMEngine::compute_window_means_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
void EMEngine::compute_window_means_routine(size_t from,
size_t to,
std::promise& done)
{
double l_slice = double(this->l_slice) ;
for(size_t i=from; in_shift; from++)
{ double sum = 0. ;
// slice is [from,to)
size_t to = from + this->l_slice ;
for(size_t j=from; jdata[i][j] ;}
this->window_mean[i][from] = sum / l_slice ;
}
}
done.set_value(true) ;
}
// Computes the log likelihoods in parallel: the rows [0,n_row) are split
// into n_threads slices, one compute_loglikelihood_routine() job is added
// to the thread pool per slice and the call blocks until every job has
// fulfilled its promise.
void EMEngine::compute_loglikelihood()
{
// compute the slices on which each thread will work
std::vector> slices =
ThreadPool::split_range(0, this->n_row, this->n_threads) ;
// get promises and futures
// the function run by the threads will simply fill the promise with
// "true" to indicate that they are done
std::vector> promises(this->n_threads) ;
std::vector> futures(this->n_threads) ;
for(size_t i=0; in_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; in_threads; i++)
{ auto slice = slices[i] ;
// the promise is passed by reference, it must outlive the job, which
// is the case because this function waits on all futures below
this->threads.addJob(std::move(
std::bind(&EMEngine::compute_loglikelihood_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
for(auto& future : futures)
{ future.get() ; }
// -------------------------- threads stop ---------------------------
}
void EMEngine::compute_loglikelihood_routine(size_t from, size_t to, std::promise& done)
{
// access in writing
// this->loglikelihood -> only access the i-th which belong [from,to)
// this->loglikelihood_max -> only access the i-th which belong [from,to)
for(size_t i=from; iloglikelihood_max[i] = std::numeric_limits::lowest() ;
for(size_t j=0; jn_class; j++)
{ for(size_t s_fw=0, s_rev=this->n_shift-1;
s_fwn_shift; s_fw++, s_rev--)
{ // slice is [from_fw,to)
// from_dat_fw to_dat_fw [from_dat_fw, to_dat_fw]
// fw |---------->>>----------|
// ----------------------------------> data
// rev |----------<<<----------| [from_dat_rev, to_dat_rev]
// to_dat_rev can be -1 -> int
// to_dat_rev from_dat_rev
// log likelihood
double ll_fw = 0. ;
double ll_rev = 0. ;
// --------------- forward ---------------
size_t from_dat_fw = s_fw ;
size_t to_dat_fw = from_dat_fw + this->l_slice - 1 ;
// --------------- reverse ---------------
size_t from_dat_rev = this->n_col - 1 - s_fw ;
// size_t to_dat_rev = from_dat_rev - (this->l_slice - 1) ;
for(size_t j_dat_fw=from_dat_fw,j_ref_fw=0, j_dat_rev=from_dat_rev;
j_dat_fwdata[i][j_dat_fw],
this->references[j][j_ref_fw]*
this->window_mean[i][s_fw])) ;
ll_fw += std::max(ll, EMEngine::p_min_log) ;
// --------------- reverse ---------------
if(this->flip)
{ ll = log(poisson_pmf(this->data[i][j_dat_rev],
this->references[j][j_ref_fw]*
this->window_mean[i][s_rev])) ;
ll_rev += std::max(ll, EMEngine::p_min_log) ;
}
}
this->loglikelihood[i][j][from_dat_fw][flip_states::FORWARD] = ll_fw ;
// keep track of the max per row
if(ll_fw > this->loglikelihood_max[i])
{ this->loglikelihood_max[i] = ll_fw ; }
if(this->flip)
{ this->loglikelihood[i][j][from_dat_fw][flip_states::REVERSE] = ll_rev ;
// keep track of the max per row
if(ll_rev > this->loglikelihood_max[i])
{ this->loglikelihood_max[i] = ll_rev ; }
}
}
}
}
// fill the promise to indicate that the function exited
done.set_value(true) ;
}
// Computes the posterior probabilities in parallel: the rows [0,n_row)
// are split into n_threads slices and one compute_post_prob_routine() job
// is added to the thread pool per slice. Each job returns, through its
// promise, the per class sums of posterior probabilities over its slice.
// These partial sums are then added up into post_prob_class and
// post_prob_tot.
void EMEngine::compute_post_prob()
{
// compute the slices on which each thread will work
std::vector> slices =
ThreadPool::split_range(0, this->n_row, this->n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the partial sum per class of post_prob for the given slice
// this should be used to compute the complete sum of post_prob
// and the complete sum per class of post_prob
std::vector> promises(this->n_threads) ;
std::vector> futures(this->n_threads) ;
for(size_t i=0; in_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; in_threads; i++)
{ auto slice = slices[i] ;
// the promise is passed by reference, it must outlive the job, which
// is the case because this function waits on all futures below
this->threads.addJob(std::move(
std::bind(&EMEngine::compute_post_prob_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// wait until all threads are done working
// compute the sum of post prob and the per class sum of post prob
// from the partial results computed on each slice
this->post_prob_tot = 0. ;
this->post_prob_class = v_d(this->n_class, 0.) ;
for(auto& future : futures)
{ auto probs = future.get() ;
for(size_t i=0; in_class; i++)
{ double prob = probs[i] ;
this->post_prob_class[i] += prob ;
this->post_prob_tot += prob ;
}
}
// -------------------------- threads stop ---------------------------
}
void EMEngine::compute_post_prob_routine(size_t from,
size_t to,
std::promise& done)
{
// this->post_prob_row -> only access the i-th which belong [from,to)
// this->post_prob -> only access the i-th which belong [from,to)
// some values that needs to be returned
// the total of the posterior prob for this slice of the data
// the total per class of posterior prob for this slice of the data
v_d post_prob_class(this->n_class, 0.) ;
for(size_t i=from; ipost_prob_row[i] = 0. ;
for(size_t n_class=0; n_classn_class; n_class++)
{ for(size_t n_shift=0; n_shiftn_shift; n_shift++)
{ for(size_t n_flip=0; n_flipn_flip; n_flip++)
- { double p = exp(this->loglikelihood[i][n_class][n_shift][n_flip] -
+ { /*
+ double p = exp(this->loglikelihood[i][n_class][n_shift][n_flip] -
this->loglikelihood_max[i]) *
this->class_prob[n_class][n_shift][n_flip] ;
+ */
+ double p = std::max(exp(this->loglikelihood[i][n_class][n_shift][n_flip] -
+ this->loglikelihood_max[i]) *
+ this->class_prob[n_class][n_shift][n_flip],
+ EMEngine::p_min) ;
this->post_prob[i][n_class][n_shift][n_flip] = p ;
this->post_prob_row[i] += p ;
}
}
}
// normalize
for(size_t n_class=0; n_classn_class; n_class++)
{ for(size_t n_shift=0; n_shiftn_shift; n_shift++)
{ for(size_t n_flip=0; n_flipn_flip; n_flip++)
{ this->post_prob[i][n_class][n_shift][n_flip] /=
this->post_prob_row[i] ;
double p = this->post_prob[i][n_class][n_shift][n_flip] ;
post_prob_class[n_class] += p ;
}
}
}
}
done.set_value(post_prob_class) ;
}
// Updates the class probabilities from the posterior probabilities: for
// each class, shift state and flip state, the posterior probabilities are
// summed over all the rows and divided by post_prob_tot. The total
// probability of each class is accumulated in class_prob_tot.
void EMEngine::compute_class_prob()
{
for(size_t n_class=0; n_classn_class; n_class++)
{ // reset total
this->class_prob_tot[n_class] = 0. ;
for(size_t n_shift=0; n_shiftn_shift; n_shift++)
{ for(size_t flip=0; flipn_flip; flip++)
{ // sum
this->class_prob[n_class][n_shift][flip] = 0. ;
for(size_t i=0; in_row; i++)
{ this->class_prob[n_class][n_shift][flip] +=
this->post_prob[i][n_class][n_shift][flip] ;
}
// normalize
this->class_prob[n_class][n_shift][flip] /= this->post_prob_tot ;
this->class_prob_tot[n_class] += this->class_prob[n_class][n_shift][flip] ;
}
}
}
}
// Updates the class references in parallel: the rows [0,n_row) are split
// into n_threads slices and one compute_references_routine() job is added
// to the thread pool per slice. Each job returns, through its promise, the
// partial references computed from its slice. The partial references are
// then summed into this->references.
void EMEngine::compute_references()
{
// compute the slices on which each thread will work
std::vector> slices =
ThreadPool::split_range(0, this->n_row, this->n_threads) ;
// get promises and futures
// the function run by the threads will compute
// the reference from the given slice
std::vector> promises(this->n_threads) ;
std::vector> futures(this->n_threads) ;
for(size_t i=0; in_threads; i++)
{ futures[i] = promises[i].get_future() ; }
// distribute work to threads
// -------------------------- threads start --------------------------
for(size_t i=0; in_threads; i++)
{ auto& slice = slices[i] ;
// the promise is passed by reference, it must outlive the job, which
// is the case because this function waits on all futures below
this->threads.addJob(std::move(
std::bind(&EMEngine::compute_references_routine,
this,
slice.first,
slice.second,
std::ref(promises[i])))) ;
}
// while threads are working, reset the references
// NOTE(review): this is only safe if the jobs never access
// this->references. The routine appears to accumulate into a local
// matrix only - confirm.
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jl_slice; j++)
{ this->references[i][j] = 0. ; }
}
// wait until all threads are done working
// sum the partial class references to get the complete ones
for(size_t n=0; nn_threads; n++)
{ matrix2d_d reference = futures[n].get() ;
for(size_t i=0; in_class; i++)
{ for(size_t j=0; jl_slice; j++)
{ this->references[i][j] += reference[i][j] ; }
}
}
// -------------------------- threads stop ---------------------------
}
void EMEngine::compute_references_routine(size_t from, size_t to, std::promise& references)
{ // the empty references
matrix2d_d ref(this->n_class, v_d(this->l_slice, 0.)) ;
for(size_t n_class=0; n_class < this->n_class; n_class++)
{
for(size_t i=from; in_shift; n_shift++)
{ // --------------- forward ---------------
int from_dat_fw = n_shift ;
int to_dat_fw = from_dat_fw + this->l_slice - 1 ;
for(int j_dat_fw=from_dat_fw, j_ref_fw=0;
j_dat_fw<=to_dat_fw; j_dat_fw++, j_ref_fw++)
{ ref[n_class][j_ref_fw] +=
(this->post_prob[i][n_class][n_shift][flip_states::FORWARD] * this->data[i][j_dat_fw]) /
this->post_prob_class[n_class] ;
}
// --------------- reverse ---------------
if(this->flip)
{ int from_dat_rev = this->n_col - 1 - n_shift ;
int to_dat_rev = from_dat_rev - (this->l_slice - 1) ;
for(int j_dat_rev=from_dat_rev, j_ref_fw=0;
j_dat_rev >= to_dat_rev; j_dat_rev--, j_ref_fw++)
{ ref[n_class][j_ref_fw] +=
(this->post_prob[i][n_class][n_shift][flip_states::REVERSE] * this->data[i][j_dat_rev]) /
this->post_prob_class[n_class] ;
}
}
}
}
}
references.set_value(ref) ;
}
void EMEngine::center_shifts()
{
if(this->n_shift == 1)
{ return ; }
// the possible shift states
std::vector shifts(this->n_shift) ;
std::iota(shifts.begin(), shifts.end(), 1.) ;
// the shift probabilities and the class probabilies (no need to norm., class_prob sums to 1)
double shifts_prob_measured_tot = 0. ;
std::vector shifts_prob_measured(this->n_shift) ;
for(size_t s=0; sn_shift; s++)
{ for(size_t k=0; kn_class; k++)
{ for(size_t f=0; fn_flip; f++)
{ shifts_prob_measured[s] += this->class_prob[k][s][f] ;
shifts_prob_measured_tot += this->class_prob[k][s][f] ;
}
}
}
// the shift mean and (biased) standard deviation
double shifts_sd = sd(shifts, shifts_prob_measured, false) ;
// the shift probabilities under the assumption that is distributed as a gaussian centered on
// the central shift state with sd and mean as in the data
// sd as the data
std::vector shifts_prob_centered(shifts.size(), 0.) ;
double shifts_prob_centered_tot = 0. ;
for(size_t i=0; in_shift/2)+1, shifts_sd) ;
shifts_prob_centered_tot += shifts_prob_centered[i] ;
}
for(size_t k=0; kn_class; k++)
{ for(size_t f=0; fn_flip; f++)
{ for(size_t s=0; sn_shift; s++)
{ this->class_prob[k][s][f] = this->class_prob_tot[k] * shifts_prob_centered[s] /
(this->n_flip * shifts_prob_centered_tot) ;
}
}
}
// shifts_prob_measured_tot = 0. ;
shifts_prob_measured.clear() ;
shifts_prob_measured.resize(this->n_shift) ;
for(size_t s=0; sn_shift; s++)
{ for(size_t k=0; kn_class; k++)
{ for(size_t f=0; fn_flip; f++)
{ shifts_prob_measured[s] += this->class_prob[k][s][f] ;
}
}
}
}
// lower bound applied to probabilities (through std::max) before they are
// used, e.g. before taking their logarithm
const double EMEngine::p_min = 1e-100 ;
// the same lower bound in log space, applied to log probabilities
const double EMEngine::p_min_log = log(EMEngine::p_min) ;
+
+#include <limits>      // std::numeric_limits
+
+/*!
+ * \brief Computes the weighted sum of exponentials
+ * sum_i exp(v[i].first) * v[i].second.
+ * The largest exponent is factored out of the sum before the
+ * exponentials are evaluated and multiplied back at the end.
+ * NOTE(review): the pair element types (double,double) were
+ * reconstructed from a damaged line - confirm against the header.
+ * \param v the pairs (exponent, weight).
+ * \return the weighted sum of the exponentials, 0 if v is empty.
+ */
+double sum_exp(const std::vector<std::pair<double,double>>& v)
+{
+    // the largest exponent
+    double max = std::numeric_limits<double>::lowest() ;
+    for(const auto& i : v)
+    {   if(i.first > max)
+        {   max = i.first ; }
+    }
+
+    // sum of the exponentials, relative to the largest exponent
+    double result = 0. ;
+    for(const auto& i : v)
+    {   result += (exp(i.first - max))*i.second ; }
+    // scale back
+    result *= exp(max) ;
+
+    return result ;
+}
diff --git a/src/Clustering/EMEngine.hpp b/src/Clustering/EMEngine.hpp
index fa586d1..d4087cf 100644
--- a/src/Clustering/EMEngine.hpp
+++ b/src/Clustering/EMEngine.hpp
@@ -1,362 +1,363 @@
#ifndef EMENGINE_HPP
#define EMENGINE_HPP
#include
#include
#include
#include
#include
#include
#include // promise, future
// some typedefs
#include
/*!
* \brief This class implements the iterative expectation
* maximization classification procedure described in Nair
* et al. 2014, Bioinformatics.
* The classification procedure performs a probabilistic
* partitioning of genomic regions, based on the distribution
* of the reads over the regions.
* To mitigate a misalignment of the signal in the different
* regions - that is, the same signal stretch is present in two
* regions but at different offsets - the classification
* procedure can search for prototypic signals shorter than a
* whole region, at each possible offset over the region (named
* shift).
* To mitigate an inversion of the signal in the different regions
* - that is, the same signal stretch is present in two regions but
* in reverse orientation - the classification procedure can search
* for prototypic signals in both orientations.
* NOTE(review): several template arguments (e.g. of Matrix2D,
* Matrix4D and std::promise) appear to be missing from the
* declarations below - confirm against the original header.
*/
class EMEngine : public ClusteringEngine
{
// p_min and its logarithm, defined out of class.
static const double p_min ;
static const double p_min_log ;
public:
/*!
* \brief The possible seeding strategies.
*/
enum seeding_codes {RANDOM=0, SAMPLING, TOY} ;
/*!
* \brief The possible flip states.
*/
enum flip_states{FORWARD=0, REVERSE} ;
public:
/*!
* \brief Constructs an object.
* \param data the data to classify.
* \param n_class the number of signal classes to search.
* \param n_iter the number of iterations.
* \param n_shift the shifting freedom. 1 means no shift.
* \param flip whether flipping is allowed.
* \param seeding the seeding strategy used to initialise
* the references.
* \param seed a seed string, empty by default.
* NOTE(review): presumably used to initialise the random
* number generator - confirm against the implementation.
* \param n_threads the number of threads dedicated to the
* computations.
*/
EMEngine(const Matrix2D& data,
size_t n_class,
size_t n_iter,
size_t n_shift,
bool flip,
seeding_codes seeding,
const std::string& seed=std::string(""),
size_t n_threads=1) ;
/*!
* \brief Destructor.
*/
virtual ~EMEngine() override ;
/*!
* \brief Returns a matrix with the class references
* (prototypic signals), one per row.
* \return a matrix containing the class references, on
* each row.
*/
virtual Matrix2D get_references() const ;
/*!
* \brief Returns a matrix with the posterior probabilities
* with the dimensions representing the data, classes, shifts
* and flips respectively.
* \return a matrix containing the posterior probabilities.
*/
virtual Matrix4D get_posterior_prob() const ;
/*!
* \brief Returns the log likelihood of the partition.
* \return the log likelihood of the partition.
*/
virtual double get_loglikelihood() const ;
/*!
* \brief Returns the Akaike Information Criterion (AIC)
* for the given partition.
* The AIC is 2n - 2LL where n is the number of
* free parameters in the model and LL the log
* likelihood of the partition.
* \return the partition AIC.
*/
virtual double get_aic() const ;
/*!
* \brief Runs the data clustering.
* \return an exit code, see ClusteringEngine::exit_codes.
*/
virtual ClusteringEngine::exit_codes cluster() override ;
-
protected:
/*!
* \brief Default constructor.
*/
EMEngine() = default ;
/*!
* \brief Sets each class prototypic signal to 1 count,
* on average.
*/
virtual void normalize_references() ;
/*!
* \brief Initialises the references using the corresponding
* method.
* \param seeding the method to use.
*/
virtual void seeding(seeding_codes seeding) ;
/*!
* \brief Initialises the references randomly.
* Generates the initial references by randomly assigning
* the data to the classes using a beta distribution and
* all classes are set equally likely.
*/
virtual void seeding_random() ;
/*!
* \brief Initialises the K references by randomly
* sampling K rows in the data. The classes are set
* equally probable.
*/
virtual void seeding_sampling() ;
/*!
* \brief Initialises the K references using the first K
* rows in data. The classes are set equally probable.
*/
virtual void seeding_toy() ;
/*!
* \brief Computes the mean number of reads present in
* each slice (of length ncol - shift + 1), in each row
* of the data and stores them in this->window_mean.
*/
virtual void compute_window_means() ;
/*!
* \brief The routine that effectively computes the mean
* number of reads present in each slice, for the range
* [from,to) of rows in the data.
* This function is thread safe only as long as different
* [from,to) slices are given to the different threads.
* \param from the index of the first row to treat.
* \param to the index of the past last row to treat.
* \param done a promise filled when the function is done
* working. This allows to synchronize threads.
*/
virtual void compute_window_means_routine(size_t from,
size_t to,
std::promise& done) ;
/*!
* \brief Computes the data log likelihood given the
* current class prototypic signals.
*/
virtual void compute_loglikelihood() ;
/*!
* \brief The routine that effectively computes the
* log likelihoods for the range [from,to) of rows
* in the data. This function is used to distribute
* the log likelihood computations over several threads.
* This function is thread safe only as long as
* different [from,to) slices are given to the different
* threads.
* \param from the index of the first row to treat.
* \param to the index of the past last row to treat.
* \param done a promise filled when the function is
* done working. This allows to synchronize threads.
*/
virtual void compute_loglikelihood_routine(size_t from,
size_t to,
std::promise& done) ;
/*!
* \brief Computes the data posterior probabilities.
*/
virtual void compute_post_prob() ;
/*!
* \brief The routine that effectively computes the
* posterior probabilities for the range [from,to) of
* rows in the data. This function is used to distribute
* the posterior probability computations over several
* threads. This function is thread safe only as long
* as different [from,to) slices are given to the
* different threads.
* \param from the index of the first row to treat.
* \param to the index of the past last row to treat.
* \param probs a promise containing a vector with the
* sum of the posterior probability, for each class,
* computed for the given slice.
*/
virtual void compute_post_prob_routine(size_t from,
size_t to,
std::promise& probs) ;
/*!
* \brief Computes the class probabilities from the
* posterior probabilities.
*/
virtual void compute_class_prob() ;
/*!
* \brief Computes the class aggregations given the
* posterior probabilities.
*/
virtual void compute_references() ;
/*!
* \brief A routine that computes the partial class
* references for the range [from,to) of rows in the
* data. To obtain the full class references, it is
* required to 1) run this routine on the whole data
* at once or 2) run it on different slices and
* sum up the partial references obtained. This function
* is used to distribute the reference
* computations over several threads. This function is
* thread safe only as long as different [from,to) slices
* are given to the different threads.
* \param from the index of the first row to treat.
* \param to the index of the past last row to treat.
* \param class_ref a promise containing a matrix with the
* partial class references on each row.
*/
virtual void compute_references_routine(size_t from,
size_t to,
std::promise& class_ref) ;
/*!
* \brief Modifies the class probabilities in such a
* way that the shift probabilities are then normally
* distributed, centered on the middle shift state.
* However, the overall class probabilities remain
* unchanged.
*/
virtual void center_shifts() ;
protected:
/*!
* \brief whether flip is enabled.
*/
bool flip ;
/*!
* \brief the number of iterations.
*/
size_t n_iter ;
/*!
* \brief the number of shift states.
*/
size_t n_shift ;
/*!
* \brief the number of flip states.
*/
size_t n_flip ;
/*!
* \brief the number of classes.
*/
size_t n_class ;
/*!
* \brief the data.
*/
matrix2d_i data ;
/*!
* \brief the mean number of reads per window in the
* data.
*/
matrix2d_d window_mean ;
/*!
* \brief the class aggregation signal.
*/
matrix2d_d references ;
/*!
* \brief the log likelihoods.
*/
matrix4d_d loglikelihood ;
/*!
* \brief the max log likelihood value for each row.
*/
v_d loglikelihood_max ;
/*!
* \brief the posterior probabilities.
*/
matrix4d_d post_prob ;
/*!
* \brief the class probabilities.
*/
matrix3d_d class_prob ;
/*!
* \brief the total prob per class.
*/
v_d class_prob_tot ;
/*!
* \brief the sum per row of post_prob.
*/
v_d post_prob_row ;
/*!
* \brief the sum per class of post_prob.
*/
v_d post_prob_class ;
/*!
* \brief the total of post_prob.
*/
double post_prob_tot ;
/*!
* \brief the number of rows in data.
*/
size_t n_row ;
/*!
* \brief the number of columns in data.
*/
size_t n_col ;
/*!
* \brief the size of the pattern searched and of
* the scanning window in the data.
*/
size_t l_slice ;
/*!
* \brief the seeding method to use.
*/
EMEngine::seeding_codes seeding_method ;
/*!
* \brief the number of threads.
*/
size_t n_threads ;
/*!
* \brief the threads.
*/
ThreadPool threads ;
} ;
+/*!
+ * \brief Computes the weighted sum of exponentials
+ * sum_i exp(v[i].first) * v[i].second.
+ * NOTE(review): the pair element types (double,double) were
+ * reconstructed from a damaged line - confirm against the definition.
+ * \param v the pairs (exponent, weight).
+ * \return the weighted sum of the exponentials.
+ */
+double sum_exp(const std::vector<std::pair<double,double>>& v) ;
+
#endif // EMENGINE_HPP
diff --git a/src/Clustering/ReferenceComputer.cpp b/src/Clustering/ReferenceComputer.cpp
index bde0ad0..352da39 100644
--- a/src/Clustering/ReferenceComputer.cpp
+++ b/src/Clustering/ReferenceComputer.cpp
@@ -1,84 +1,79 @@
#include
#include
#include
// some typedefs
#include
-template
-std::ostream& operator << (std::ostream& stream, const std::vector& v)
-{ for(const auto& x : v)
- { stream << x << " " ; }
- stream << std::endl ;
- return stream ;
-}
/*!
 * \brief Constructs an object that computes the class references
 * of the given data from the given posterior probabilities.
 * The number of classes, the number of shift states and the flip
 * capability are read from the 2nd, 3rd and 4th dimensions of the
 * posterior probability matrix respectively. The references are
 * computed at construction time.
 * \param data the data.
 * \param posterior_prob the posterior probabilities, with the
 * dimensions representing the data rows, the classes, the shifts
 * and the flips respectively.
 * \param n_threads the number of threads dedicated to the
 * computations.
 */
ReferenceComputer::ReferenceComputer(const Matrix2D<int>& data,
                                     const Matrix4D<double>& posterior_prob,
                                     size_t n_threads)
    : EMEngine(data,
               posterior_prob.get_dim()[1],
               1,
               posterior_prob.get_dim()[2],
               posterior_prob.get_dim()[3] == 2,
               EMEngine::seeding_codes::RANDOM,
               "",
               n_threads)
{
-
    // copy the data
    this->data = matrix2d_i(this->n_row, v_i(this->n_col)) ;
    for(size_t i=0; i<this->n_row; i++)
    {   for(size_t j=0; j<this->n_col; j++)
        {   this->data[i][j] = data(i,j) ; }
    }
+    // compute window means
+    this->window_mean = matrix2d_d(this->n_row, v_d(this->n_shift, 0.)) ;
+    this->compute_window_means() ;
+
    // initialise, copy and compute probs
    this->post_prob = matrix4d_d(this->n_row,
                                 matrix3d_d(this->n_class,
                                            matrix2d_d(this->n_shift,
                                                       v_d(this->n_flip, 0.)))) ;
    this->class_prob = matrix3d_d(this->n_class,
                                  matrix2d_d(this->n_shift,
                                             v_d(this->n_flip, 0.))) ;
    this->class_prob_tot = v_d(this->n_class, 0.) ;
    this->post_prob_class = v_d(this->n_class, 0.) ;
+    // the total is accumulated below and must start from 0, whatever
+    // the base class constructor left in it
+    this->post_prob_tot = 0. ;
    for(size_t i=0; i<this->n_row; i++)
    {   for(size_t j=0; j<this->n_class; j++)
        {   for(size_t s=0; s<this->n_shift; s++)
            {   for(size_t f=0; f<this->n_flip; f++)
                {   double p = posterior_prob(i,j,s,f) ;
                    this->post_prob[i][j][s][f] = p ;
                    this->post_prob_class[j] += p ;
                    this->post_prob_tot += p ;
                }
            }
        }
    }
    this->compute_class_prob() ;
    // compute the references
    this->references = matrix2d_d(this->n_class,
                                  v_d(this->l_slice, 0.)) ;
    this->compute_references() ;
-
}
/*!
 * \brief Destructor. Nothing to release explicitly.
 */
ReferenceComputer::~ReferenceComputer()
{}
/*!
 * \brief Returns the class references, one per row. The first
 * column of each row contains the class total probability, the
 * remaining l_slice columns contain the class signal.
 * \return a matrix of n_class rows and l_slice+1 columns.
 */
Matrix2D<double> ReferenceComputer::get_references() const
{
    // add a 1st column with the class probabilities
    Matrix2D<double> references(this->n_class, this->l_slice+1, 0.) ;
    for(size_t i=0; i<this->n_class; i++)
    {   // class prob
        references(i,0) = this->class_prob_tot[i] ;
        // signal
        for(size_t j=0; j<this->l_slice; j++)
        {   references(i,j+1) = this->references[i][j] ; }
    }
    return references ;
}
diff --git a/src/Clustering/typedef.hpp b/src/Clustering/typedef.hpp
index 231fd50..4d3e91a 100644
--- a/src/Clustering/typedef.hpp
+++ b/src/Clustering/typedef.hpp
@@ -1,11 +1,16 @@
#ifndef TYPEDEFCLUSTERING_HPP
#define TYPEDEFCLUSTERING_HPP
+#include // std::vector
+#include // std::pair
+
typedef std::vector v_i ;
typedef std::vector v_d ;
typedef std::vector matrix2d_i ;
typedef std::vector matrix2d_d ;
typedef std::vector matrix3d_d ;
typedef std::vector matrix4d_d ;
+typedef std::vector> v_pair ;
+
#endif // TYPEDEFCLUSTERING_HPP
diff --git a/src/GenomicTools/CellMatrixCreator.cpp b/src/GenomicTools/CellMatrixCreator.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/src/GenomicTools/CellMatrixCreator.hpp b/src/GenomicTools/CellMatrixCreator.hpp
new file mode 100644
index 0000000..e69de29
diff --git a/src/GenomicTools/CorrelationMatrixCreator.cpp b/src/GenomicTools/CorrelationMatrixCreator.cpp
new file mode 100644
index 0000000..fbf6dfa
--- /dev/null
+++ b/src/GenomicTools/CorrelationMatrixCreator.cpp
@@ -0,0 +1,373 @@
+#include
+#include
+#include // std::runtime_error
+
+#include // BamFileIn
+#include // BedFileIn
+
+#include
+#include
+
+
+/* Writes the elements of a list to a stream, each followed by a space.
+ */
+template<class T>
+std::ostream& operator << (std::ostream& stream, const std::list<T>& l)
+{
+    for(const auto& p : l)
+    {   stream << p << " " ; }
+    return stream ;
+}
+
+/* Writes the elements of a vector to a stream, each followed by a space.
+ */
+template<class T>
+std::ostream& operator << (std::ostream& stream, const std::vector<T>& v)
+{
+    for(const auto& p : v)
+    {   stream << p << " " ; }
+    return stream ;
+}
+
+/* Writes a pair to a stream as "[first second] ".
+ */
+template<class T1, class T2>
+std::ostream& operator << (std::ostream& stream, const std::pair<T1,T2>& p)
+{
+    stream << "[" << p.first << " " << p.second << "] " ;
+    return stream ;
+}
+
+/* Writes the (key, value) pairs of an unordered map to a stream, one
+ * per line, using the pair formatting defined above.
+ */
+template<class T1, class T2>
+std::ostream& operator << (std::ostream& stream, const std::unordered_map<T1,T2>& m)
+{
+    for(const auto& p : m)
+    {   stream << p << " " << std::endl; }
+    return stream ;
+}
+
+
+/* A comparator for sorting GenomeRegion objects. It returns the result
+ * of GenomeRegion's operator < applied to its two arguments.
+ */
+auto sortByStartPos = [](const GenomeRegion& lhs, const GenomeRegion& rhs) -> bool
+{   const bool lhs_first = lhs < rhs ;
+    return lhs_first ;
+} ;
+
+/* Constructs an object. The BED file is read twice: a first time to
+ * count the regions (the rows of the matrix) and to map each
+ * chromosome name to its ID in the BAM file, a second time to compute
+ * the genomic coordinates of every bin of every region. The count
+ * matrix is allocated and filled with 0's, it is only filled with
+ * counts by create_matrix().
+ * All the parameters are forwarded as they are to MatrixCreator.
+ * NOTE(review): a chromosome absent from the BAM file is stored with
+ * the index -1 - confirm that the downstream code expects this.
+ */
+CorrelationMatrixCreator::CorrelationMatrixCreator(const std::string& bed_file_path,
+                                                   const std::string& bam_file_path,
+                                                   const std::string& bai_file_path,
+                                                   int from,
+                                                   int to,
+                                                   int bin_size,
+                                                   MatrixCreator::methods method)
+    : MatrixCreator(bed_file_path,
+                    bam_file_path,
+                    bai_file_path,
+                    from,
+                    to,
+                    bin_size,
+                    method),
+      target_list_fw(),
+      target_list_rv()
+{
+    // NOTE(review): the record specialization (Bed3) was reconstructed
+    // from a damaged line - only ref, beginPos and endPos are used.
+    seqan::BedRecord<seqan::Bed3> bed_line ;
+
+    // compute coordinates relative to each region
+    this->compute_relative_bin_coord() ;
+    size_t n_col = this->relative_bin_coord.size() ;
+
+    // compute number of regions and get valid chromosomes names
+    this->open_bed_file() ;
+    this->open_bam_file() ;
+    seqan::BamHeader header ;
+    seqan::readHeader(header, bam_file) ;
+    size_t n_row = 0 ;
+    while(not seqan::atEnd(this->bed_file))
+    {   seqan::readRecord(bed_line, this->bed_file) ;
+        std::string chrom_name = seqan::toCString(bed_line.ref) ;
+        // new chromosome
+        if(this->chrom_map_names.find(chrom_name) ==
+           this->chrom_map_names.end())
+        {   int chrom_idx = -1 ;
+            seqan::getIdByName(chrom_idx,
+                               seqan::contigNamesCache(seqan::context(this->bam_file)),
+                               chrom_name) ;
+            this->chrom_map_names[chrom_name] = chrom_idx ;
+        }
+        n_row++ ;
+    }
+    this->close_bed_file() ;
+    this->close_bam_file() ;
+
+    // create the count matrix
+    this->matrix_counts = Matrix2D<int>(n_row, n_col, 0) ;
+    // create the region matrix
+    this->matrix_bins =
+            std::vector<std::vector<GenomeRegion>>
+            (n_row,std::vector<GenomeRegion>(n_col)) ;
+    this->open_bed_file() ;
+    this->open_bam_file() ;
+    size_t i = 0 ;
+    while(not seqan::atEnd(this->bed_file))
+    {   seqan::readRecord(bed_line, this->bed_file) ;
+        // find the region limits
+        std::string region_chr = seqan::toCString(bed_line.ref) ;
+        int region_len = bed_line.endPos - bed_line.beginPos ;
+        int region_mid = bed_line.beginPos + (region_len / 2) ;
+
+        // compute the absolute bins coordinates for this region
+        // and create the bins in this region
+        for(size_t j=0; j<n_col; j++)
+        {   const auto& relative_coord = this->relative_bin_coord[j] ;
+            this->matrix_bins[i][j] =
+                    GenomeRegion(region_chr,
+                                 this->chrom_map_names[region_chr],
+                                 region_mid + relative_coord.first,
+                                 region_mid + relative_coord.second) ;
+        }
+        i++ ;
+    }
+    this->close_bed_file() ;
+    this->close_bam_file() ;
+}
+
+/* Destructor. Closes the BAM file, then the BED file.
+ */
+CorrelationMatrixCreator::~CorrelationMatrixCreator()
+{
+    this->close_bam_file() ;
+    this->close_bed_file() ;
+}
+
+/* Fills the count matrix and returns it. For each row, the BAM file
+ * cursor is moved upstream of the region covered by the row bins, the
+ * targets overlapping the region are read and the counts of the row
+ * are updated with them. A row for which the jump in the BAM file
+ * fails is skipped and keeps its current counts.
+ */
+Matrix2D<int> CorrelationMatrixCreator::create_matrix()
+{
+    // how far upstream of a region start the reading of the BAM file
+    // starts, in bp
+    const int upstream_margin = 600 ;
+
+    this->open_bam_file() ;
+    this->open_bai_file() ;
+
+    // read BAM header
+    seqan::BamHeader bam_header ;
+    seqan::readHeader(bam_header, this->bam_file) ;
+
+    for(size_t i=0; i<this->matrix_counts.get_nrow(); i++)
+    {
+        // the region spanning from the first to the last bin of the row
+        const auto& row = this->matrix_bins[i] ;
+        GenomeRegion region(row.front().chromosome,
+                            row.front().chromosome_idx,
+                            row.front().start,
+                            row.back().end) ;
+
+        bool jump = this->jump_upstream(region, upstream_margin) ;
+        if(not jump)
+        {   continue ; }
+        // read all relevant targets
+        this->to_downstream_target(region) ;
+        // update count matrix row
+        this->update_count_matrix(i) ;
+        // clean buffers
+        this->clear_target_lists() ;
+    }
+    this->close_bam_file() ;
+    return this->matrix_counts ;
+}
+
+/* Moves the BAM file cursor upstream of the given region, that is to
+ * the 1bp window starting at max(0, region.start - margin) on the
+ * region chromosome.
+ * region : the region of interest.
+ * margin : the distance upstream of the region start to jump to.
+ * Returns the value returned by seqan::jumpToRegion(), or false, after
+ * having written a message on stderr, if the region chromosome has no
+ * known ID in the BAM file.
+ */
+bool CorrelationMatrixCreator::jump_upstream(const GenomeRegion& region,
+                                             int margin)
+{
+    // the ID of the chromosome in the BAM file
+    auto chrom_iter = this->chrom_map_names.find(region.chromosome) ;
+    if(chrom_iter == this->chrom_map_names.end())
+    {   // streamed directly, a fixed size buffer could be overflowed by
+        // a long chromosome name
+        std::cerr << "Error! chromosome " << region.chromosome
+                  << " is not linked with a valid ID in BAM file"
+                  << std::endl ;
+        return false ;
+    }
+    int rID = chrom_iter->second ;
+
+    // set by jumpToRegion(), not used here
+    bool has_alignment = false ;
+    int start = std::max(0, region.start - margin) ;
+    int end   = start + 1 ;
+    bool jump = seqan::jumpToRegion(this->bam_file,
+                                    has_alignment,
+                                    rID,
+                                    start,
+                                    end,
+                                    this->bai_file) ;
+    return jump ;
+}
+
+/* Reads the targets overlapping the given region, using the read based
+ * routine for the READ and READ_ATAC methods and the fragment based
+ * routine for any other method.
+ */
+void CorrelationMatrixCreator::to_downstream_target(const GenomeRegion& region)
+{
+    const bool read_based =
+            (this->method == CorrelationMatrixCreator::methods::READ) or
+            (this->method == CorrelationMatrixCreator::methods::READ_ATAC) ;
+
+    if(not read_based)
+    {   this->to_downstream_fragment(region) ;
+        return ;
+    }
+    this->to_downstream_read(region) ;
+}
+
+/* Reads the BAM file from the current position until a read located
+ * downstream of the given region, or the end of the file, is met.
+ * The reads overlapping the region are stored in target_list_fw or in
+ * target_list_rv (reads with the reverse complement flag). The records
+ * that are not good reads, that map on a non-valid chromosome or that
+ * cannot be converted into a read are ignored, the latter being
+ * reported on stderr in SAM format.
+ */
+void CorrelationMatrixCreator::to_downstream_read(const GenomeRegion& region)
+{   bool done = false ;
+
+    seqan::BamAlignmentRecord record ;
+
+    while(not seqan::atEnd(this->bam_file) and
+          not done)
+    {   // QC check and transform record
+        seqan::readRecord(record, this->bam_file) ;
+        if(not CorrelationMatrixCreator::is_good_read(record) or
+           not this->is_valid_chromosome(record))
+        {   continue ; }
+
+        GenomeRegion target ;
+        try
+        {   if(this->method == CorrelationMatrixCreator::methods::READ)
+            {   target = GenomeRegion::constructRead(record, this->bam_file) ; }
+            else
+            {   target = GenomeRegion::constructReadATAC(record, this->bam_file) ; }
+        }
+        catch(const std::invalid_argument& e)
+        {   // connect to cerr to write in SAM
+            seqan::BamFileOut samFileOut(seqan::context(this->bam_file),
+                                         std::cerr,
+                                         seqan::Sam()) ;
+            std::cerr << "std::invalid_argument caught! could not use "
+                         "this record as read: " << std::endl ;
+            writeRecord(samFileOut, record) ;
+            std::cerr << "message was : " << e.what() << std::endl << std::endl ;
+            continue ;
+        }
+
+        // upstream -> continue
+        if(target < region)
+        {   continue ; }
+        // overlap -> store
+        else if(target | region)
+        {   if(not seqan::hasFlagRC(record))
+            {   this->target_list_fw.push_back(target) ; }
+            else
+            {   this->target_list_rv.push_back(target) ; }
+        }
+        // downstream -> stop
+        else
+        {   done = true ; }
+    }
+}
+
+/* Reads the BAM file from the current position until a fragment located
+ * downstream of the given region, or the end of the file, is met.
+ * The fragments overlapping the region are stored in target_list_fw.
+ * With the FRAGMENT_CENTER method, the fragment is replaced by its
+ * center, which is stored only if it overlaps the region as well.
+ * The records that are not good pairs, that map on a non-valid
+ * chromosome or that cannot be converted into a fragment are ignored,
+ * the latter being reported on stderr in SAM format.
+ */
+void CorrelationMatrixCreator::to_downstream_fragment(const GenomeRegion& region)
+{
+    bool done = false ;
+
+    seqan::BamAlignmentRecord record ;
+
+    while(not seqan::atEnd(this->bam_file) and
+          not done)
+    {   // QC check and transform record
+        seqan::readRecord(record, this->bam_file) ;
+        if(not CorrelationMatrixCreator::is_good_pair(record) or
+           not this->is_valid_chromosome(record))
+        {   continue ; }
+
+        GenomeRegion target ;
+        try
+        {   target = GenomeRegion::constructFragment(record, this->bam_file) ; }
+        catch(const std::invalid_argument& e)
+        {   // connect to cerr to write in SAM
+            seqan::BamFileOut samFileOut(seqan::context(this->bam_file),
+                                         std::cerr,
+                                         seqan::Sam()) ;
+            std::cerr << "std::invalid_argument caught! could not use "
+                         "this record as fragment: " << std::endl ;
+            writeRecord(samFileOut, record) ;
+            std::cerr << "message was : " << e.what() << std::endl << std::endl ;
+            continue ;
+        }
+
+        // upstream -> continue
+        if(target < region)
+        {   continue ; }
+        // overlap -> store
+        else if(target | region)
+        {   if(this->method == CorrelationMatrixCreator::methods::FRAGMENT_CENTER)
+            {   target = GenomeRegion::constructFragmentCenter(record,
+                                                               this->bam_file) ;
+                if(target | region)
+                {   this->target_list_fw.push_back(target) ; }
+            }
+            else
+            {   this->target_list_fw.push_back(target) ; }
+        }
+        // downstream -> stop
+        else if(target > region)
+        {   // std::cerr << std::endl ;
+            done = true ;
+        }
+    }
+    // std::cerr << "to_downstream_fragment END" << std::endl ;
+}
+
+/* Empties both target buffers (forward and reverse).
+ */
+void CorrelationMatrixCreator::clear_target_lists()
+{
+    this->target_list_rv.clear() ;
+    this->target_list_fw.clear() ;
+}
+
+/*
+void CorrelationMatrixCreator::remove_upstream_targets(const GenomeRegion& region)
+{ // forward targets
+ auto iter_fw = this->target_list_fw.cbegin() ;
+ while(iter_fw != this->target_list_fw.end())
+ { // remove upstream reads
+ if(*iter_fw < region)
+ { iter_fw = this->target_list_fw.erase(iter_fw) ; }
+ // keep overlapping reads, don't stop here
+ else if(*iter_fw | region)
+ { iter_fw++ ; }
+ // stop at first read downstream
+ else
+ { break ; }
+ }
+ // reverse targets
+ auto iter_rv = this->target_list_rv.cbegin() ;
+ while(iter_rv != this->target_list_rv.end())
+ { // remove upstream reads
+ if(*iter_rv < region)
+ { iter_rv = this->target_list_rv.erase(iter_rv) ; }
+ // keep overlapping reads
+ else if(*iter_rv | region)
+ { iter_rv++ ; }
+ // stop at first read downstream
+ else
+ { break ; }
+ }
+}
+*/
+
+/* Adds to the given row of the count matrix the contribution of every
+ * buffered target (forward, then reverse). For each target, only the
+ * bins within the index range [first,second) returned by
+ * get_bin_indices() are visited and each of these bins is incremented
+ * by the length of its overlap with the target.
+ * row_index : the index of the row to update.
+ */
+void CorrelationMatrixCreator::update_count_matrix(size_t row_index)
+{
+    // forward targets
+    for(const auto& iter : this->target_list_fw)
+    {   auto bin_start_end = CorrelationMatrixCreator::
+                get_bin_indices(iter, this->matrix_bins[row_index]) ;
+        for(int j=bin_start_end.first; j<bin_start_end.second; j++)
+        {   this->matrix_counts(row_index, j) +=
+                    iter.overlap_len(this->matrix_bins[row_index][j]) ;
+        }
+    }
+    // reverse targets
+    for(const auto& iter : this->target_list_rv)
+    {   auto bin_start_end = CorrelationMatrixCreator::
+                get_bin_indices(iter, this->matrix_bins[row_index]) ;
+        for(int j=bin_start_end.first; j<bin_start_end.second; j++)
+        {   this->matrix_counts(row_index, j) +=
+                    iter.overlap_len(this->matrix_bins[row_index][j]) ;
+        }
+    }
+}
+
+/*
+void CorrelationMatrixCreator::update_count_matrix_naive(size_t row_index)
+{ // forward targets
+ for(const auto& iter : target_list_fw)
+    { for(size_t j=0; j<this->matrix_counts.get_ncol(); j++)
+ { this->matrix_counts(row_index, j) +=
+ iter.overlap_len(this->matrix_bins[row_index][j]) ;
+ }
+ }
+ // reverse targets
+ for(const auto& iter : target_list_rv)
+    { for(size_t j=0; j<this->matrix_counts.get_ncol(); j++)
+ { this->matrix_counts(row_index, j) +=
+ iter.overlap_len(this->matrix_bins[row_index][j]) ;
+ }
+ }
+}
+*/
diff --git a/src/GenomicTools/CorrelationMatrixCreator.hpp b/src/GenomicTools/CorrelationMatrixCreator.hpp
new file mode 100644
index 0000000..e6043bc
--- /dev/null
+++ b/src/GenomicTools/CorrelationMatrixCreator.hpp
@@ -0,0 +1,187 @@
+#ifndef CORRELATIONMATRIXCREATOR_HPP
+#define CORRELATIONMATRIXCREATOR_HPP
+
+#include