Page MenuHomec4science

appendix.tex
No OneTemporary

File Metadata

Created
Mon, May 20, 20:59

appendix.tex

\appendix
% \beginsupplement
\chapter{Supplementary material}
% ===================================== encode peaks supplemental =====================================
\section{ENCODE peaks analysis supplementary material}
% \begin{figure}[H]
% \begin{center}
% \includegraphics[scale=0.25]{images/ch_encode_peaks/em_examples.png}
% \captionof{figure}{\textbf{Nucleosome architectures around TF binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. Partitions for \textbf{A} CTCF, \textbf{B} NRF1, \textbf{C} FOS and \textbf{D} max are displayed. The first row always contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
% \label{suppl_encode_peaks_em_examples}
% % \end{center}
% \end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsUwGm12878CtcfUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
\captionof{figure}{\textbf{Chromatine architectures around CTCF binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
\label{suppl_encode_peaks_em_ctcf}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878Nrf1IggmusUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
\captionof{figure}{\textbf{Chromatine architectures around NRF1 binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
\label{suppl_encode_peaks_em_nrf1}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878CfosUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
\captionof{figure}{\textbf{Chromatine architectures around cFOS binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
\label{suppl_encode_peaks_em_cfos}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878MaxIggmusUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
\captionof{figure}{\textbf{Chromatine architectures around max binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
\label{suppl_encode_peaks_em_max}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878Brca1a300IggmusUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
\captionof{figure}{\textbf{Chromatine architectures around BRCA1 binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
\label{suppl_encode_peaks_em_brca1}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.8]{images/ch_encode_peaks/ctcf_ndr.png}
\captionof{figure}{\textbf{Nucleosome occupancy around CTCF peaks } measured by MNase-seq, in bins of 10bp. The nucleosome depleted region is displayed in blue.}
\label{suppl_encode_peaks_ctcf_ndr}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.4]{images/ch_encode_peaks/jund_motif_association.png}
\captionof{figure}{\textbf{JunD motif association} measured around the binding sites of different TFs. For a each TF, its binding sites, +/- 500bp, were searched for the presence of i) the TF motif and ii) CTCF motif. For each TF, a 2x2 contingency table was created with the number of peaks having i) both motifs, ii) the TF motif only, iii) CTCF motif only and iv) no motif. \textbf{A} Odd ratio (OR) of the exact Fisher test performed on each TF contingency table. The ORs are displayed with their 95\% confidence interval (CI). ORs > 1 - that is, with 1 not part of the 95\%CI - are labeled in green and indicate an association of both motifs more frequent than expected by chance. ORs < 1 are labeled in red and indicate a repulsion of both motifs more frequence than expected by chance. The JunD and cFos dataset ORs are too high to be represented in this plot. \textbf{B} Density of JunD motif occurrence at the absolute distance of different TF binding sites (peak centers) which also have their own motif present (at distance 0). The rows were standardized and aggregated using the Euclidean distance. \textbf{C} Same as in (B) but for TF binding sites that does not have their own motif.}
\label{suppl_encode_peaks_jund_association}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\includegraphics[scale=0.4]{images/ch_encode_peaks/ebf1_haib_3.png}
\captionof{figure}{\textbf{EBF1 binding sites} around the dyad of nucleosomes having an occupied EBF1 motif within 100bp (in red) and of all nucleosomes (in blue). The abrupt decrease of EBF1 motif frequency at +/- 100bp reflects the nucleosome selection process.}
\label{suppl_encode_peaks_ebf1_nucl}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.5]{images/ch_encode_peaks/MA0154_3.png}
\captionof{figure}{\textbf{EBF1 logo} from JASPAR binding model MA0154.3 \citep{khan_jaspar_2018}.}
\label{suppl_encode_peaks_ebf1_logo}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
\includegraphics[scale=0.4]{images/ch_encode_peaks/ebf1_haib_2.png}
\captionof{figure}{\textbf{EBF1 binding sites} chromatin features. \textbf{A} Chromatin accessibility around nucleosomes that have an EBF1 binding site within 100bp (red) and all nucleosomes (blue). \textbf{B} H3K4me2 deposition around nucleosomes that have an EBF1 binding site within 100bp (red) and all nucleosomes (blue). \textbf{C} Sequence conservation around nucleosomes that have an EBF1 binding site within 100bp (red) and all nucleosomes (blue).}
\label{suppl_encode_peaks_ebf1_chrom}
\end{center}
\end{figure}
% ======================================= SPar-K supplemental ========================================
\section{SPar-K supplementary material}
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void SPar-K}%
\begin{algorithm}[H]
\label{algo_spark}
\Fn{\Function{R, K, S, I}}
{
\KwData{$R$ a 2D matrix of numericals of dimension $N \times L$; $K$ the number clusters; $S$ the shifting allowed; $I$ the maximum number of iterations.}
\KwResult{5 vectors of length $N$ with the cluster labels, the data shift values, the reference shift values, the data flip values and the distances to the cluster references.}
$CLUSTER =$ vector of 0 of size $N$ \;
$SHIFT\_DAT =$ vector of 0 of size $N$ \;
$SHIFT\_REF =$ vector of 0 of size $N$ \;
$FLIP\_DAT =$ vector of 0 of size $N$ \;
$DISTANCE =$ vector of 0 of size $N$ \;
\If{smoothing}
{ smoothOutliers($R$) \; }
$REF = $ a matrix of $K$ rows and $L$ columns\;
select $K$ rows of $R$ and copy them in $REF$ using seedingRandom($K$, $R$) or seedingKmeans++($K$, $R$) \;
$n\_iter = 1$\;
$converged = false$\;
\While{$n\_iter != I$ or not $converged$}
{
\For{$i=0$ to $N-1$}
{ $D =$ a vector 0 of size $K$ \;
$S\_REF =$ a vector 0 of size $K$ \;
$S\_DAT =$ a vector 0 of size $K$ \;
$F\_DAT =$ a vector 0 of size $K$ \;
\For{$k=0$ to $K-1$}
{ $D_{k},S\_REF_{k},S\_DAT_{k}, F\_DAT_{k} =$ distanceFast($REF_{k,}$, $R{i,}$, $S$) \; }
$j = $ index of minimum value in $REF$\;
$CLUSTER_{i} = D_{j}$ \;
$SHIFT\_REF_{i} = S\_REF_{j}$ \;
$SHIFT\_DAT_{i} = S\_DAT_{j}$ \;
$FLIP\_DAT_{i} = F\_DAT_{j}$ \;
$DISTANCE\_{i} = D_{j}$ \;
}
$REF =$ computeClusterReferences($R$, $CLUSTER$, $SHIFT\_REF$, $SHIFT\_DAT$, $FLIP\_DAT$) \;
\If{$n\_iter > 1$ and at least one value of $CLUSTER$ changed since last iteration}
{ $converged = false$ \; }
\Else
{ $converged = true$ \; }
$n\_iter = n\_iter + 1$ \;
}
\Return{$CLUSTER, SHIFT\_REF, SHIFT\_DAT, FLIP, DISTANCE$}
}
\caption{SPar-K algorithm.}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{MATRIX smoothOutliers}%
\begin{algorithm}[H]
\label{algo_smooth_outliers}
\Fn{\Function{MATRIX}}
{
\KwData{$MATRIX$ a 2D matrix of numericals of dimensions $N \times L$.}
\KwResult{a 2D matrix of numericals of dimensions $N \times L$. with outlier smoothed.}
\ForEach{$ROW$ in MATRIX}
{ $m$ = mean of $ROW$ \;
$s$ = standard deviation of $ROW$ \;
$lower$ = $m$ - 3*$s$ \;
$upper$ = $m$ + 3*$s$ \;
\ForEach{$value$ in $ROW$}
{ \If{$value < lower$ or $value > upper$}
{ \If{$value$ has a left neighbor $l$ and a right neighbor $r$}
{ $value$ = ($l$ + $r$) / 2 \; }
\ElseIf{$value$ has two left neighbors $l1$ and $l2$}
{ $value$ = ($l1$ + $l2$) / 2 \; }
\ElseIf{$value$ has two right neighbors $r1$ and $r2$}
{ $value$ = ($r1$ + $r2$) / 2 \; }
}
}
}
\Return{$MATRIX$}
}
\caption{Smooth the data matrix by removing outliers.}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{d,s\_xf,s\_y,f\_y distanceFast}%
\begin{algorithm}[H]
\label{algo_distance_fast}
\Fn{\Function{X,Y,S}}
{ \KwData{$X$ and $Y$ two vectors of size $L$ and $S$ the allowed shifting}
\KwResult{the distance between $X$ and $Y$, the shift of $X$ and $Y$ and whether $Y$ was flipped in the alignment.}
\tcc{initialize all variables and data structures}
initialize() \;
\For{$i=0$ to $n-1$}
{ $SUM\_X_0 \pluseq X_i $ \;
$SUM\_X2_0 \pluseq X_i^2$ \;
$SUM\_Y_0 \pluseq Y_i $ \;
$SUM\_Y2_0 \pluseq Y_i^2$ \;
}
$i = n$ \;
\For{$s=1$ to $s=S-1$ }
{ $SUM\_X_{s} = SUM\_X_{s-1} + X_{i-n} + X_{i} $ \;
$SUM\_X2_{s} = SUM\_X2_{s-1} + X_{i-n}^{2} + X_{i}^{2}$ \;
$SUM\_Y_{s} = SUM\_Y_{s-1} + Y_{i-n} + Y_{i} $ \;
$SUM\_Y2_{s} = SUM\_Y2_{s-1} + Y_{i-n}^{2} + Y_{i}^{2}$ \;
$i \pluseq 1$ \;
}
\tcc{compute all distances with $X$ having a shift of 0, $Y$ having a shift of 0 and all the remains ones. These functions can access and modify all variables and data structures in this function.}
fillFirstRow() \;
fillFirstColl() \;
fillRemaining() \;
\tcc{if more than one, find the minimal distance which also minimize the alignment score}
$best_d = best\_s\_x = best\_s\_y = best\_f\_y = 0$ \;
\For{$tripplet$ in $MIN\_DIST$}
{ $s\_x = tripplet_{0}$ \;
$s\_y = tripplet_{1}$ \;
$f\_y = tripplet_{2}$ \;
$min\_score = \infty$ \;
\If{$SCORES_{s\_x,s\_y} < min\_score$}
{ $min\_score = SCORES_{s\_x,s\_y}$ \;
$best\_s\_x = s\_x$ \;
$best\_s\_y = s\_y$ \;
$best\_f\_y = f\_y$ \;
$best\_d = DIST_{s\_x,s\_y,f\_y}$ \;
}
}
\Return{$best\_d,best\_s\_x,best\_s\_y,best\_f\_y$}
}
\caption{Fast algorithm to compute the correlation distance with shift and flip}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{initialize}%
\begin{algorithm}[H]
\label{initialize_algo}
\Fn{\Function{}}
{
$l\_half = L / 2$ \;
$n = L - S + 1$ \;
$n\_half = n / 2$ \;
%\tcc{1-cor for each shift of $X$ (on rows) and each shift of $Y$ (on columns)}
$DIST =$ 3D matrix of 3 of dimensions $S*S*2$ \;
%\tcc{the sums of the pairwise products of the $X$ elements for each shift and $Y$ elements for each shift}
$SUM\_XY =$ 3D matrix of 0 of dimensions $S*S*2$ \;
%\tcc{the sums of the $X$ elements for each shift}
$SUM\_X =$ a vector of 0 of size $S$ \;
%\tcc{the sums of squares of the $X$ elements for each shift}
$SUM\_X2 =$ a vector of 0 of size $S$ \;
%\tcc{same for $Y$}
$SUM\_Y =$ a vector of 0 of size $S$ \;
$SUM\_Y2 =$ a vector of 0 of size $S$ \;
%\tcc{a centrality score for each alignment, a forward and a reverse alignments have the same score}
$SCORES =$ a 2D matrix of 0 of dimensions $S*S$ \;
%\tcc{to track minimum distances in $DIST$}
$MIN\_DIST =$ an empty list of triplets \;
$min\_dist = 3$ \;
}
\caption{A routine of distanceFast() that initializes all the necessary variables. This function can access and modify variables in distanceFast().}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void fillFirstRow}%
\begin{algorithm}[H]
\Fn{\Function{}}
{
% \tcc{fill first row of $DIST$, 1-cor with $X$ having a shift of 0}
$from\_x = 0$ \;
$to\_x = from\_x + n$ \;
\For{$i = 0$ to $S-1$}
{ \tcc{forward orientation}
$from\_y = i$ \;
$to\_y = from\_y + n$ \;
$SUM_XY{from\_x,from\_y,0} = 0$ \;
\For{$j = 0$ to $n-1$}
{ $SUM\_XY_{from\_x,from\_y,0} \pluseq X_{from\_x+j} * Y_{from\_y+j}$ \; }
% \tcc{correlation}
$c = \frac{n* SUM\_XY_{from\_x,from\_y,0} - SUM\_X_{from\_x} * SUM\_Y_{from\_y}}
{\sqrt{n*SUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} * \sqrt{n*SUM\_Y2_{from\_y} - SUM\_Y_{from\_y}^{2}}}$ \;
\If{division by 0 occurred}
{ $c = 0$ \; }
% \tcc{distance}
$d = 1 - c$ \;
$DIST_{from\_x,from\_y,0} = d$ \;
% \tcc{alignment centrality score, this value is the same for forward and reverse alignments}
$SCORES_{from\_x,from\_y} = \abs{l\_half - (n\_half + from\_x)} +
\abs{l\_half - (n\_half + from\_y)}$ \;
\If{$d < min\_dist$}
{ $min\_distance = d$ \;
append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \;
}
\ElseIf{$d = min\_dist$}
{ append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \; }
\tcc{reverse orientation}
$SUM_XY{from\_x,from\_y,1} = 0$ \;
\For{$j = 0$ to $n-1$}
{ $SUM\_XY_{from\_x,from\_y,1} \pluseq X_{from\_x+j} * Y_{to\_y-j-1}$ \; }
% \tcc{correlation}
$c = \frac{n* SUM\_XY_{from\_x,from\_y,1} - SUM\_X_{from\_x} * SUM\_Y_{from\_y}}
{\sqrt{n*SUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} * \sqrt{n*SUM\_Y2_{from\_y} - SUM\_Y_{from\_y}^{2}}}$ \;
\If{division by 0 occurred}
{ $c = 0$ \; }
% \tcc{distance}
$d = 1 - c$ \;
$DIST_{from\_x,from\_y,1} = d$ \;
\If{$d < min\_dist$}
{ $min\_distance = d$ \;
append a triplet $(from\_x,from\_y, 1)$ to $MIN\_DIST$ \;
}
\ElseIf{$d = min\_dist$}
{ append a triplet $(from\_x,from\_y, 1)$ to $MIN\_DIST$ \; }
}
}
\caption{A routine of distanceFast() computing all distances with $X$ having a shift of 0. This function can access and modify all variables declared in distanceFast().}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void fillFirstCol}%
\begin{algorithm}[H]
\Fn{\Function{}}
{ % \tcc{fill first row of $DIST$, 1-cor with $X$ having a shift of 0}
$from\_y = 0$ ; $to\_y = from\_y + n$ \;
$from\_y\_rev = s - from\_y - 1$ \;
$to\_y\_rev = from\_y\_rev + n$ \;
\For{$i=0$ to $S-1$}
{ $from\_x = i$ \;
$to\_x = from\_x + n$ \;
\tcc{forward orientation}
$SUM\_XY_{from\_x,from\_y,0} = 0 $ \;
\For{$j=0$ to $n-1$}
{ $SUM\_XY{from\_x,from\_y,0} \pluseq X_{from\_x+j}*Y_{from\_y+j} $ \; }
$c = \frac{n*SUM\_XY_{from\_x,from\_y,0} - SUM\_X_{from\_x}*SUM\_Y_{from\_y}}
{\sqrt{n*SUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} * \sqrt{n*SUM\_Y2_{from\_y} - SUM\_Y_{from\_y}^{2}}}$ \;
\If{division by 0 occurred}
{ $c = 0$ \; }
% \tcc{distance}
$d = 1 - c$ \;
$DIST_{from\_x,from\_y,0} = d$ \;
% \tcc{alignment centrality score, this value is the same for forward and reverse alignments}
$SCORES_{from\_x,from\_y} = \abs{l\_half - (n\_half + from\_x)} +
\abs{l\_half - (n\_half + from\_y)}$ \;
\If{$d < min\_dist$}
{ $min\_distance = d$ \;
append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \;
}
\ElseIf{$d = min\_dist$}
{ append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \; }
\tcc{reverse orientation}
$SUM\_XY_{from\_x,S-1,1} = 0$ \;
\For{$j=0$ to $n-1$}
{ $SUM\_XY_{from\_x,S-1,1} \pluseq X_{from\_x+j} * Y_{to\_y\_rev-1-j}$ \; }
$c = \frac{n*SUM\_XY_{from\_x,S-1,1} - SUM\_X_{from\_x}*SUM\_Y_{S-1}}
{\sqrt{n*SUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} * \sqrt{n*SUM\_Y2_{S-1} - SUM\_Y_{S-1}^{2}}}$ \;
\If{division by 0 occurred}
{ $c = 0$ \; }
% \tcc{distance}
$d = 1 - c$ \;
$DIST_{from\_x,from\_y,1} = d$ \;
\If{$d < min\_dist$}
{ $min\_distance = d$ \;
append a triplet $(from\_x,S-1, 1)$ to $MIN\_DIST$ \;
}
\ElseIf{$d = min\_dist$}
{ append a triplet $(from\_x,S-1, 1)$ to $MIN\_DIST$ \; }
}
}
\caption{A routine of distanceFast() computing all distances with $Y$ having a shift of 0. This function is can access and modify all variables declared in distanceFast().}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void fillRemaining}%
\begin{algorithm}[H]
\Fn{\Function{}}
{ \For{$i=1$ to $S-1$}
{ $from\_x = i$ ; $to\_x = from\_x + n $ \;
\For{$j=1$ to $S-1$}
{ $from\_y = j$ \;
$to\_y = from\_y + n$ \;
\tcc{forward orientation}
$SUM\_XY_{from\_x,from\_y,0} = SUM\_XY_{from\_x-1,from\_y-1,1} - X_{from\_x-1} * Y_{from\_y-1} + X_{to\_x-1}*Y{to\_y-1}$ \;
$c = \frac{n*SUM\_XY{from\_x,from\_y,0} - SUM\_X{from\_x}*SUM\_Y{from\_y}}
{\sqrt{n*SUM\_X2_{from\_x} - SUM\_X_{from\_x}^2} * \sqrt{n*SUM\_Y2_{from\_Y} - SUM\_Y_{from\_Y}^2}}$ \;
\If{division by 0 occurred}
{ $c = 0$ \; }
$d = 1 - c$ \;
$DIST_{from\_x,from\_y,0} = d$ \;
%\tcc{alignment centrality score, this value is the same for forward and reverse alignments}
$SCORES_{from\_x,from\_y} = \abs{l\_half - (n\_half + from\_x)} +
\abs{l\_half - (n\_half + from\_y)}$ \;
\If{$d < min\_dist$}
{ $min\_distance = d$ \;
append a triplet $(from\_x,from\_y,0)$ to $MIN\_DIST$ \;
}
\ElseIf{$d = min\_dist$}
{ append a triplet $(from\_x,from\_y,0)$ to $MIN\_DIST$ \; }
\tcc{reverse orientation}
$j\_rev = S - j - 1$ \;
$from\_y\_rev = j\_rev$ \;
$to\_y\_rev = from\_y\_rev + n$ \;
$SUM\_XY_{from\_x,from\_y\_rev,1} = SUM\_XY_{from\_x-1,from\_y\_rev+1,1} - X_{from\_x-1}*Y_{to\_y\_rev} + X_{to\_x-1}*Y{from\_y\_rev}$ \;
$c = \frac{n*SUM\_XY_{from\_x,from\_y\_rev,1} - SUM\_X_{from\_x}*SUM\_Y_{from\_y\_rev}}
{\sqrt{n*SUM\_X2_{from\_x} - SUM\_X_{from\_x}^2} * \sqrt{n*SUM\_Y2_{from\_y\_rev} - SUM\_Y_{from\_y_rev}^2}}$ \;
\If{division by 0 occurred}
{ $c = 0$ \; }
$d = 1 - c$ \;
$DIST_{from\_x,from\_y,1} = d$ \;
\If{$d < min\_dist$}
{ $min\_distance = d$ \;
append a triplet $(from\_x,from\_y\_rev,1)$ to $MIN\_DIST$ \;
}
\ElseIf{$d = min\_dist$}
{ append a triplet $(from\_x,from\_y\_rev,1)$ to $MIN\_DIST$ \; }
}
}
}
\caption{A routine of distanceFast() computing all remaining distances between $X$ and $Y$. This function can access and modify all variables declared in distanceFast().}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{REF seedingRandom}%
\begin{algorithm}[H]
\label{algo_seed_random}
\Fn{\Function{K, R}}
{ \KwData{$K$ the number references to initialize from $R$, a 2D matrix of numericals of dimensions $N \times L$.}
\KwResult{$REF$ a 2D matrix of numericals of dimensions $K \times L$.}
$REF = $ a 2D matrix of dimensions $K*L$ \;
\For{$i=0$ to $K-1$}
{ $j = $ sample a number from $0$ to $N-1$ with uniform probabilities \;
\While{$j$ has already been sampled before}
{ $j = $ sample a number from $0$ to $N-1$ with uniform probabilities \; }
$REF_{i,} = R_{j,}$ \;
}
\Return{$REF$}
}
\caption{Random seeding algorithm}
\end{algorithm}
\vspace{\baselineskip}% Insert a blank line
\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{REF seedingKmeans++}%
\begin{algorithm}[H]
\label{algo_seed_kmeans++}
\Fn{\Function{K, R, S}}
{ \KwData{$K$ the number references to initialize from $R$, a 2D matrix of numericals of dimensions $N \times L$, allowing a shifting of $S$.}
\KwResult{$REF$ a 2D matrix of numericals of dimensions $K \times L$.}
$REF = $ a 2D matrix of dimensions $K*L$ \;
$DIST\_TO\_CLOSEST = $ a vector of size $N$\;
\tcc{choose first reference, a standard deviation equals to 0 should be avoided otherwise computing a distance against this reference will always be equal to 1 (see how distance are computed).}
$j = $ sample a number from $0$ to $N-1$ with uniform probabilities but for which $DATA_{j,}$ has a standard deviation different from $0$ \;
$REF_{0,} = DATA_{j,}$ \;
\tcc{choose next references}
\For{$k=1$ to $K-1$}
{ \For{$i=0$ to $N-1$}
{ \tcc{distance to all current references for this region}
$DIST\_TO\_CENTERS =$ a vector of size $k$ ;
\For{$j=0$ to $k-1$}
{ $DIST\_TO\_CENTERS_{j,} =$ distanceFast($REF_{j,}$, $R_{i,}$, $S$) \; }
\tcc{distance to closest current reference, for this region}
$DIST\_TO\_CLOSEST{i,} =$ minimum value of $DIST\_TO\_CENTERS$\;
}
$j =$ sample a number from 0 to $N-1$ using $DIST\_TO\_CLOSEST$ as the probability/weight distribution for each point to be sample. \;
\While{$j$ has already been sampled before}
{ $j =$ sample a number from 0 to $N-1$ using $DIST\_TO\_CLOSEST$ as the probability/weight distribution for each point to be sample. \; }
$REF_{k,} = R{j,}$ \;
}
\Return{REF}
}
\caption{Kmeans++ seeding algorithm.}
\end{algorithm}
% ======================================= SMiLE-seq supplemental =======================================
\section{SMiLE-seq supplementary material}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.3]{images/ch_smile-seq/figure_s4_reproduced.png}
\captionof{figure}{\textbf{Predictive power of SMiLE-seq :} \textbf{A} binding models were derived de novo from HT-SELEX 1st cycle data using the HMM discovery method (labelled HT-SELEX cycle 1 HMM) and their performances were assessed using the AUC-ROC. AUC-ROC values for the corresponding TF models derived from SMiLe-seq data (labelled SMiLE-seq) and reported by Jolma and colleagues (labelled HT-SELEX reported matrices, \cite{jolma_dna-binding_2013}) are also displayed. \textbf{B} the predictive performances of CEBPb, CTCF and TCF7 binding models were assessed using subsets of binding sites of decreasing affinities. Inside each peak list, the peaks were ranked by score and subsets of 500 peaks were selected. Peaks 1-500 have the highest affinity, then peaks 501-1000, and so on. The boxplots indicate the distribution of AUC-ROC obtained over all available peak-lists.}
\label{suppl_smileseq_auc_2}
\end{center}
\end{figure}
% ======================================= ATAC-seq supplemental ========================================
\section{Chromatin accessibility of monocytes supplementary material}
\subsection{Fragment size analysis}
\label{suppl_atac_seq_fragment_size}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.3]{images/ch_atac-seq/fragment_lengths.png}
\captionof{figure}{\textbf{Fragment size analysis} \textbf{A} sequenced fragment size density. The three peaks, from left to right, indicate i) the open chromatin fragments, ii) the mono-nucleosome fragments and iii) the di-nucleosome fragments. A mixture model composed of three Gaussian distributions was fitted to the data in order to model the fragment sizes. The class fit is shown as dashed lines : open chromatin (red), mono-nucleosomes (blue) and di-nucleosomes (green). The violet dashed line show the sum of the three classes. \textbf{B :} probability that a fragment belongs to any of the three fragment classes, given its size i) open chromatin (red), ii) mono-nucleosomes (blue) and iii) di-nucleosomes (green). The vertical dashed lines indicates, for each class, the size limit at which the class probability drops below 0.9. With these limites, the class spans are i) 30-84bp for open chromatin (red), ii) 133-266bp for mono-nucleosomes (blue) and iii) 341-500bp for di-nucleosomes (green). The upper limit of the di-nucleosome class was arbitrarily set to 500bp. \textbf{C :} final fragment classes. Each fragments which size overlapped the size range spanned by a class, was assigned to that class. This ensured a high confidence assignment for more than 134 million fragments, leaving 46 millions of ambiguous and long fragments (>500bp) unassigned.}
\label{atac_seq_fragment_size}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.4]{images/ch_atac-seq/ctcf_motifs_10e-6_aggregations.png}
\captionof{figure}{\textbf{Signal around CTCF motifs : } the human genome was scanned with a CTCF PWM and different aggregated signal densities were measured for open chromatin (red lines), mono nucleosome (blue lines), di-nucleosomes (green lines) and for a pool of mono-nucleosome fragments with di-nucleosomes fragments cut in two at their center position (violet line). \textbf{Top row :} each position of the fragments, from the start of the first read to the end of the second, were used. \textbf{Middle row :} each position of the reads were used. \textbf{Bottom row :} only one position at the read edges for open chromatin fragment and the central position of nucleosome fragment were used. The open chromatin read edges were modified by +4bp and -5bp for +strand and -strand reads respectively.\\
The aggregated densities were measured using bin sizes of 1 (left column), 2 (middle column) and 10bp (right column).}
\label{atac_seq_ctcf_all_data}
\end{center}
\end{figure}
Nucleosomes fragments are expected to be large, as a nucleosome is wrapped by \~150bp of DNA whereas nucleosome free region fragments can be expected to be shorter. Long nucleosome free region fragments are unlikely. The longer an accessible region is, the most likely an insertion will happen resulting in the creation of two shorter fragments. A fragment size analysis allowed to identify different categories of fragments (Figure \ref{atac_seq_fragment_size}). In this figure, open regions, mono- and di-nucleosome fragments are clearly visible. Morever, a 10bp periodicity oscillations reflecting the DNA pitch is also visible. This pattern is expected and indicates a good data quality \citep{buenrostro_transposition_2013}.
Rather than assigning arbitrary fragment size threshold to separate the categories, I preferred to use the approach developed by \citep{buenrostro_transposition_2013}. The fragment sizes were fitted by a mixture of three Gaussian distributions. Then, the limits for each fragment class was defined as the size at which the probability of assignment to that fragment class dropped under 0.9 (Figure \ref{atac_seq_fragment_size}B).
This method ensured the classification of 134 millions of fragments, leaving ~46 millions reads unassigned (Figure \ref{atac_seq_fragment_size}C). However, this reduces drastically the risks of fragment mis-classification and protects the downstream analyses from a strong bias.
\subsection{Measuring open chromatin and nucleosome occupancy}
\label{suppl_atac_seq_measuring_signal}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.3]{images/ch_atac-seq/ctcf_sp1_myc_ebf1_footprint.png}
\captionof{figure}{\textbf{Signal around CTCF, SP1, myc and EBF1 motifs :} the human genome was scanned with one PWM per TF to predict their binding sites (see section \ref{atac_seq_method_pwmscan}). For each TF, the open chromatin accessibility was measured (red) as well as and the nucleosome occupancy (blue) around their predicted binding sites. For the chromatin accessibility, the corrected read edges were considered and for nucleosomes, the center of the fragments. The motif location is indicated by the dashed lines.}
\label{atac_seq_ctcf_sp1_myc_ebf1_footprint}
\end{center}
\end{figure}
Once the different fragment populations have been identified, the next question to solve is how should each category of fragment be represented?
First, for open chromatin fragment, it is clear that we want to know where the DNA is accessible. This information is provided by the fragment edges – the transpositions sites. However, to account for the fact that the Tn5 transposase acts as a homo-dimer and inserts two barcodes side by side \citep{adey_rapid_2010}, the fragment edges positions were modified by +4bp for reads mapping the + strand and -5bp for reads mapping the - strand, as done in other studies \citep{buenrostro_transposition_2013,li_identification_2019}.
Second, for mono and di-nucleosome fragments, we are interested in knowing where the nucleosomes are sitting. For this, the fragment edges may not be the most informative. A better way to represent those fragments would be to use the center positions, which should correspond to the dyad for mono-nucleosomes or even to consider the entire reads or fragments.
To test these hypotheses I investigated the different signal aggregations around predicted CTCF binding sites using. The signal, +/- 1kb around the motif, was aggregated inside bins of 1, 2 or 10bp size. CTCF predicted binding sites were good candidates because CTCF is know to bind mostly through its motif (\cite{neph_expansive_2012} and Figure \ref{encode_peaks_gm12878_motif_prop}). Additionally CTCF binding produces a really typical chromatin architecture with strongly positioned nucleosomes arrays \citep{fu_insulator_2008} and leaves a footprint \citep{neph_expansive_2012}.
As seen in Figure \ref{atac_seq_ctcf_all_data}, entire open chromatin reads and fragments do not allow to visualize a footprint signature (upper and middle rows, red lines). Both of them, nonetheless highlight open chromatin regions. The footprint becomes visible when considering the edges of the open chromatin fragments (bottom row, red line). Increasing the bin size blurs it and eventually makes it disappear (10bp, lower right).
Regarding nucleosomes, considering the entire fragments blurs the signal (upper row, blue and green lines) and the entire reads reveal the region upstream and downstream of the nucleosomes (middle row, blue and green lines). The only way to obtain a precise nucleosomes occupancy information was to use the middle position of the mono-nucleosome fragments (bottom row, blue line). Interesting enough, the middle position of di-nucleosome fragments indicates the DNA linker between two adjacent nucleosomes but does not accumulate in open chromatin regions (bottom row, green line). This suggested that di-nucleosome fragments could be separated in two mono-nucleosome fragments. I tested this hypothesis by simply dividing a di-nucleosome fragment in two smaller ones, at its center position. I then pooled these new fragments with the mono-nucleosome fragments to create a nucleosome fragment dataset. When looking at the middle of these fragments, they could perfectly reveal the nucleosomes directly adjacent to the CTCF motif. Additionally this nucleosome dataset was also able to reveal a second nucleosome in the arrays (bottom row, violet lines).
To further support these results, I also measured the chromatin organization (+5/-4 corrected read edges for open chromatin and center of the nucleosome fragments from the nucleosome fragment dataset) around SP1, myc and EBF1 binding motifs as well. As shown in Figure \ref{atac_seq_ctcf_sp1_myc_ebf1_footprint}, the aggregation of the signal arount the CTCF and SP1 motifs show an enhanced accessibility on the motif as well as a clear footprint. Moreover, the footprint is in a nucleosome free region. The situation was different for myc and EBF1. Neither of the two aggregations showed a nucleosome free region, nor an increased accessibility around the motif. Regarding myc, even though its aggregation presented a signal compatible with a local protection of its motif, this was shallow in comparison of CTCF and SP1. Finally, EBF1 presented a somewhat decreased accessibility around its motif and a striking increase accessibility directly at the level of the motif.
CTCF and SP1 motifs are supporting the fact that footprints and nucleosome occupancy can be revealed using this method. Together with myc and EBF1, they clearly show an heterogeneity of chromatin organizations, at least at the aggregation level.
There are many possible explanations for these results. One of them is that the aggregation hides the variability of the individual regions and that SP1 and CTCF present a more conserved organization around their motif than myc and EBF1. Another would that the most visible and obvious footprint reflect an stronger TF activity. However, one should remain cautious on the interpretation of aggregation patterns as the individual sites signal may interfere with each other, creating an artificial aggregation that does not exist at any individual site \citep{kundaje_ubiquitous_2012}.
In the light of these results, I decided to use the +5/-4 corrected edges of the open chromatin reads to investigate footprints and the fragment centers of the newly created nucleosome dataset to investigate nucleosome occupancy. If not explicitly stated otherwise, the reader should consider that any signal was measured using this procedure.
\subsection{Evaluation of EMSequence and ChIPPartitioning}
\label{suppl_eval_emseq_chippartitioning}
It was important to assess the performances of the partitioning methods to discover sequence motifs and footprint classes.
% To evaluate the behavior of each algorithm, a simple situation was considered. As in the previous section, a list of predicted CTCF, SP1, EBF1 and myc binding sites were compiled using a PWM genome scan. For each TF, the open chromatin read density, the nucleosome occupancy and the DNA sequences were extracted.
% This case can be considered simple for two reasons. First, for a given TF, all the regions have a motif instance. Second, this motif instance is located exactly in the center of the region, in the same orientation. Thus no shifting nor flipping is required in order to reveal footprints. The only necessary thing may be to discover different types of footprints.
\subsubsection{EMSequence}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.4]{images/ch_atac-seq/simulated_sequences_2class_flip_best_motifs.png}
\captionof{figure}{\textbf{Simulated data motifs :} motifs used for the data generation (labeled "True motif") and the best scoring - based on the AUC - partition motifs (labeled "Found motif"). The partition with EMSequence was run such that it was searching for motifs of 11bp, slightly longer than those used for the data generation. "RC" stands for reverse complement. The motifs tree and alignment was build using the motifStack R package \citep{ou_motifstack_2018}.}
\label{suppl_atac_seq_emseq_best_motifs}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.4]{images/ch_atac-seq/simulated_sequences_2class_flip_auc_roc.png}
\captionof{figure}{\textbf{Classification performances on simulated data :} \textbf{Left} 50 different data partitions were run using EMSequence. The discovered models were then used to assign a class label to each sequence. These assigned labels were then compared to the true labels using the AUC under the ROC curve. The red line indicates the AUC value achieved by the true motifs. \textbf{Right} the 50 ROC curves corresponding to each partition. The red lines indicates the true motifs ROC curve. The curves under the diagonal are the cases where the 1st discovered class corresponded to the 2nd true class and vice-versa. For these cases, the AUC is actually the area over the curve.}
\label{suppl_atac_seq_emseq_auc_roc}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.35]{images/ch_atac-seq/sp1_motifs_7class.png}
\captionof{figure}{\textbf{SP1 motifs :} partition of 15'883 801bp sequences centered on a SP1 binding site using EMSequence. The different classes are ordered by decreasing overall probability. Arrows atop of the motifs indicates tandem arrangements of SP1 motifs.}
\label{suppl_atac_seq_emseq_sp1_7class}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.35]{images/ch_atac-seq/sp1_motifs_10class.png}
\captionof{figure}{\textbf{SP1 motifs :} partition of 15'883 801bp sequences centered on a SP1 binding site using EMSequence. These sequences were classified by EMSequence to search for 10 different 30bp long motifs ($801 - 30 = 771$ of shifting freedom). The optimization was run for 20 iterations. The different classes are ordered by decreasing overall probability. Arrows atop of the motifs indicates head-to-tail arrangements of SP1 motifs.}
\label{suppl_atac_seq_emseq_sp1_10class}
\end{center}
\end{figure}
In order to measure the ability of EMSequence to retrieve over-represented motifs from a set of sequences, I simulated 2'000 synthetic DNA sequences of 100bp long. The sequences were separated in two classes. Each class was defined by a 8bp sequence motif (Figure \ref{suppl_atac_seq_emseq_best_motifs}). Each sequence had exactly one motif occurrence, anywhere in the sequence (with a uniform probability), on either strand (equiprobable).
These sequences were partitioned with flipping into 2 classes by EMSequence in order to find 2 motifs of 11bp ($100bp - 11bp + 1 = 90$bp of shifting). The optimization was run for 200 iterations. To assert the quality of the motifs discovered, I set up a classification framework inspired by PWMEval-ChIP-peak (see section \ref{section_smileseq_pwmeval}). Using equation \ref{smile_seq_pwmeval_score}, each sequence was scored with both model of each partition and the area under the curve (AUC) of the receiver operator characteristic (ROC) value was computed for each partition. The same was done using the true motif models. Because EMSequence is sensitive to its initial state, 50 partitions were performed. As shown in Figure \ref{suppl_atac_seq_emseq_auc_roc}, the de-novo discovered models are as good as the actual sequence motifs to segregate both sequence classes. Additionally, a visual inspection of the discovered motif logo confirmed that most of the discovered motifs actually match the true sequence motifs (Figure \ref{suppl_atac_seq_emseq_best_motifs}).
In order to further demonstrate the ability of EMSequence on a more significant biological case, I investigated SP1 sequence specificity. As for ChIPPartitioning, a list of 15'883 predicted SP1 binding sites were compiled using a PWM genome scan. The sequences +/- 400bp around the motif instance centers were extracted. Thus, all regions contained at least one SP1 site. Thus, retrieving the SP1 binding site is expected. Additionally, as SP1 tends to bind to promoters, we cannot exclude to see other motif being being discovered. These sequences were then given to EMSequence to search for several different 31bp long motifs ($801 - 31 + 1 = 771$ of shifting freedom). The optimization was run for 20 iterations.
The motifs that were retrieved matched the expectations (Figure \ref{suppl_atac_seq_emseq_sp1_7class}). All classes retrieved an SP1 motif. Four classes (1,5,6,7) retrieved a single SP1 motif. Even though they are highly similar, they vary in term of flanking regions (class 6 versus 7 for instance). Class 3, which contained a surprisingly long motif, representing 24\% of the data, actually captured a LINE element. Indeed, the "GCAGCGAGGCTGGGGGAGGGGC" is characteristic of it (determined using BLAT \citep{kent_blatblast-like_2002} on the UCSC Genome Browser). Finally, and more interesting, classes 2 and 4 could capture two rare (about 1\% of the cases each) tandem repeats of SP1 motifs with two different spacers (1 and 9bp). Additionally, head-to-head SP1 motif repeats could be detected (\ref{suppl_atac_seq_emseq_sp1_10class}, classes 1 and 4). This suggested that SP1 binds as i) an homo-dimer or ii) as an hetero-dimer with another member of its family, binding a resembling motif. Moreover, the tandem and heat-to-head motif repeats suggested that different structural arrangement exist. According to BioGrid \citep{chatr-aryamontri_biogrid_2017}, SP1 has been reported to physically interact with SP1 (homo-dimer), SP3 and SP4 (hetero-dimer). According to JASPAR 2018 matrix clustering \citep{castro-mondragon_rsat_2017}, the KLF and EGR families recognizes similar motifs. Members of either families are also listed as SP1 interactors in Biogrid (KLF4, KLF6, KLF9, KLF10 and EGR1).
The lack of non-SP1 motif discovered could be explained by at least one reason. The list of SP1 binding sites compiled was performed using a quite stringent threshold. The consequence is that the motif instances are highly similar to each other. This makes SP1 motifs strongly dominant within the dataset. Since EMSequence optimizes a set of models, it is highly sensitive to its starting state. In this experiment, EMSequence was initialised randomly. Because of the dominance of SP1 motifs within the data, it is likely that the different classes were attracted by them rather than allowed to diverge to detect other motifs.
Together, these evidences support the fact that EMSequence is suited to perform a meaningful partition of DNA sequences and to retrieve biologically important DNA motifs.
\subsubsection{ChIPPartitioning}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.30]{images/ch_atac-seq/ctcf_motifs_6class_noshift_flip.png}
\captionof{figure}{\textbf{Open chromatin classes around CTCF motifs} found by ChIPPartitioning without shifing but with flipping to identify different classes of footprints around 26'650 CTCF motifs. The aggregation signal around the 6 different classes found are shown by decreasing class probability. The open chromatin patterns are displayed in red, the nucleosomes are displayed in blue. The aggregated DNA sequence is displayed as a logo. The y-axis ranges from the minimum to the maximum signal observed. For the DNA logo, this corresponds to 0 and 2 bits respectively.}
\label{suppl_atac_seq_emread_ctcf_noshift_flip}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.3]{images/ch_atac-seq/sp1_motifs_6class_noshift_flip.png}
\captionof{figure}{\textbf{Open chromatin classes around SP1 motifs :} EMRead was run without shifing (+/- 10bp) but with flipping to identify different classes of footprints around 15'883 SP1 motifs. The aggregation signal around the 6 different classes found are shown by decreasing class probability. The open chromatin patterns are displayed in red, the nucleosomes are displayed in blue. The aggregated DNA sequence is displayed as a logo. The y-axis ranges from the minimum to the maximum signal observed. For the DNA logo, this corresponds to 0 and 2 bits respectively.}
\label{suppl_atac_seq_emread_sp1_noshift_flip}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.35]{images/ch_atac-seq/ctcf_motifs_6class_shift_flip.png}
\captionof{figure}{\textbf{Open chromatin classes around CTCF motifs} found by ChIPPartitioning with shifing but with flipping to identify different classes of footprints around 26'650 CTCF motifs. The aggregation signal around the 6 different classes found are shown by decreasing class probability. The open chromatin patterns are displayed in red, the nucleosomes are displayed in blue. The aggregated DNA sequence is displayed as a logo. The y-axis ranges from the minimum to the maximum signal observed. For the DNA logo, this corresponds to 0 and 2 bits respectively.}
\label{suppl_atac_seq_emread_ctcf_shift_flip}
\end{center}
\end{figure}
\begin{figure}[h]
\begin{center}
\includegraphics[scale=0.3]{images/ch_atac-seq/sp1_motifs_6class_shift_flip.png}
\captionof{figure}{\textbf{Open chromatin classes around SP1 motifs :} EMRead was run with shifing (+/- 10bp) flipping to identify different classes of footprints around 15'883 SP1 motifs. The aggregation signal around the 6 different classes found are shown by decreasing class probability. The open chromatin patterns are displayed in red, the nucleosomes are displayed in blue. The aggregated DNA sequence is displayed as a logo. The y-axis ranges from the minimum to the maximum signal observed. For the DNA logo, this corresponds to 0 and 2 bits respectively.}
\label{suppl_atac_seq_emread_sp1_shift_flip}
\end{center}
\end{figure}
A complete benchmark of the ChIPPartitioning has been performed in \citep{nair_probabilistic_2014}. In this paper, the authors have generated simulated ChIP-seq data with patterns to retrieve, at different coverages and compared the performances with other similar software. It turned out that ChIPPartitioning was the best performing method. For this reason, I did not repeat this benchmark. However, ChIPPartitioning ability to retrieve footprint classes from from ATAC-seq data has not been performed yet.
To evaluate this, a simple situation was considered. As in the previous section, a list of predicted CTCF and SP1 binding sites were compiled using a genome scan with suited binding models. For each TF, the open chromatin read density around these sites was measured +/-400bp aroud the motif instances, at the single base pair resolution, and classified. As the motif instances were already aligned in the center of the regions, no shifting was used. However, the region orientations were not corrected based on the strand on which the motif instance appeared.
To evaluate the capability of ChIPPartitioning to retrieve classes of footprints, these data were classified
i) without shifting and with flip (Figure \ref{suppl_atac_seq_emread_ctcf_noshift_flip} and Figure \ref{suppl_atac_seq_emread_sp1_noshift_flip}) and ii) with shifting and flipping (Figure \ref{suppl_atac_seq_emread_ctcf_shift_flip} and \ref{suppl_atac_seq_emread_sp1_shift_flip}).
First, in both conditions - with and without shifting - different open chromatin signal classes have been discovered. Second, in most cases, the chromatin accessibility is anti-correlated with the nucleosome occupancy, which is something expected. However this is not always the case, such as in Figure \ref{suppl_atac_seq_emread_ctcf_noshift_flip} classes 3 and 6. Such pattern may reflect a complex chromatin architecture, with variably positioned nucleosomes, that the partition cannot realign. But it is also likely to be an artifactual signal caused by the partition itself. Third, allowing the regions to be flipped based on the chromatin accessibility signature (Figure \ref{suppl_atac_seq_emread_ctcf_noshift_flip} and Figure \ref{suppl_atac_seq_emread_sp1_noshift_flip}) does not allow to resolve properly the orientation of the underlying CTCF and SP1 motif instances. Indeed, the sequence logos, in the center, are symetric indicating a superposition of +strand and -strand motif. Finally, allowing a moderated shifting freedom (+/- 10bp, Figure \ref{suppl_atac_seq_emread_ctcf_shift_flip} and \ref{suppl_atac_seq_emread_sp1_shift_flip}) results in blurred out sequence logo. This indicates that the chromatin accessibility signal realignment unphased the underlying CTCF and SP1 motif instances. Thus the signal that is observed does not represent classes of footprints.
In this case, each the region contained a motif instance at its center. Nonetheless, even a limited shifting according to the open chromatin signal resulted in the dephasing the underlying motif instances. Trying to resolve the motif orientation by allowing flipping according to the open chromatin was not more successful. Thus, discovering footprint classes from a highly unaligned set of regions does not seem to be possible. Workaround strategies have to be found.
\subsection{Other supplementary figures}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.35]{images/ch_atac-seq/peaks_rmsk_sampled_sequences_23class.png}
\captionof{figure}{\textbf{Extended sequence and chromatin models} found in monocytes regulatory regions. The displayed logos correspond to each class sequence aggregation. The corresponding chromatin accessibility (red) and nucleosome occupancy (blue) are displayed atop of the logos. The classes are displayed by overall decreasing probability. A zoom over the central part of each class aggregation is shown in the top right inlet.}
\label{suppl_atac_seq_23class}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.30]{images/ch_atac-seq/data_classPU1_2class.png}
\captionof{figure}{\textbf{PU.1 sub-classes} obtained by extracting PU.1 class data and subjecting them to a ChIPPartitioning classification into 2 classes. The displayed logos correspond to each class sequence aggregation. The corresponding chromatin accessibility (red) and nucleosome occupancy (blue) are displayed atop of the logos. The classes are displayed by overall decreasing probability. A zoom over the central part of each class aggregation is shown in the top right inlet.}
\label{suppl_atac_seq_pu1_subclass}
\end{center}
\end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[scale=0.30]{images/ch_atac-seq/data_classjun_3class.png}
\captionof{figure}{\textbf{AP1 sub-classes} obtained by extracting AP1 class data and subjecting them to a ChIPPartitioning classification into 3 classes. The displayed logos correspond to each class sequence aggregation. The corresponding chromatin accessibility (red) and nucleosome occupancy (blue) are displayed atop of the logos. The classes are displayed by overall decreasing probability. A zoom over the central part of each class aggregation is shown in the top right inlet.}
\label{suppl_atac_seq_ap1_subclass}
\end{center}
\end{figure}

Event Timeline