appendix.tex
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, May 22, 12:26

appendix.tex
View Options

	\appendix

	% \beginsupplement

	\chapter{An appendix}

	\section{Supplementary figures}

	% ======================================= SMiLE-seq figures =======================================

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.3]{images/ch_smile-seq/figure_s4_reproduced.png}
	\captionof{figure}{\textbf{Predictive power of SMiLE-seq :} \textbf{A} binding models were derived de novo from HT-SELEX 1st cycle data using the HMM discovery method (labelled HT-SELEX cycle 1 HMM) and their performances were assessed using the AUC-ROC. AUC-ROC values for the corresponding TF models derived from SMiLe-seq data (labelled SMiLE-seq) and reported by Jolma and colleagues (labelled HT-SELEX reported matrices, \cite{jolma_dna-binding_2013}) are also displayed. \textbf{B} the predictive performances of CEBPb, CTCF and TCF7 binding models were assessed using subsets of binding sites of decreasing affinities. Inside each peak list, the peaks were ranked by score and subsets of 500 peaks were selected. Peaks 1-500 have the highest affinity, then peaks 501-1000, and so on. The boxplots indicate the distribution of AUC-ROC obtained over all available peak-lists.}
	\label{suppl_smileseq_auc_2}
	\end{center}
	\end{figure}


	% ===================================== encode peaks figures =====================================
	% \begin{figure}[H]
	% \begin{center}
	% \includegraphics[scale=0.25]{images/ch_encode_peaks/em_examples.png}
	% \captionof{figure}{\textbf{Nucleosome architectures around TF binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. Partitions for \textbf{A} CTCF, \textbf{B} NRF1, \textbf{C} FOS and \textbf{D} max are displayed. The first row always contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
	% \label{suppl_encode_peaks_em_examples}
	% % \end{center}
	% \end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsUwGm12878CtcfUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
	\captionof{figure}{\textbf{Chromatine architectures around CTCF binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
	\label{suppl_encode_peaks_em_ctcf}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878Nrf1IggmusUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
	\captionof{figure}{\textbf{Chromatine architectures around NRF1 binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
	\label{suppl_encode_peaks_em_nrf1}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878CfosUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
	\captionof{figure}{\textbf{Chromatine architectures around cFOS binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
	\label{suppl_encode_peaks_em_cfos}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878MaxIggmusUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
	\captionof{figure}{\textbf{Chromatine architectures around max binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
	\label{suppl_encode_peaks_em_max}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.5]{images/ch_encode_peaks/wgEncodeAwgTfbsSydhGm12878Brca1a300IggmusUniPk_MNase_GM12878_allpeaks_EM_4class_15shift_flip.png}
	\captionof{figure}{\textbf{Chromatine architectures around BRCA1 binding sites} discovered using ChIPPartitioning. The partition was done with respect to the MNase reads (red), +/- 1kb around the peaks, in bins of 10bp, that were allowed to be shifted and flipped. DNaseI (blue), TSS density (violet) and sequence conservation (green) were realigned according to MNase classification and overlaid. The y-axis scale represent the proportion of the highest signal for each chromatin pattern. The first row contains the aggregated signal over all sites. The number of binding sites (peaks) is indicated in parenthesis. The following rows contains the 4 classes discovered. Their overall probability is indicated atop of the class signal, on the right. The y-axis indicates the min/max signal for all densities.}
	\label{suppl_encode_peaks_em_brca1}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.8]{images/ch_encode_peaks/ctcf_ndr.png}
	\captionof{figure}{\textbf{Nucleosome occupancy around CTCF peaks } measured by MNase-seq, in bins of 10bp. The nucleosome depleted region is displayed in blue.}
	\label{suppl_encode_peaks_ctcf_ndr}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.4]{images/ch_encode_peaks/jund_motif_association.png}
	\captionof{figure}{\textbf{JunD motif association} measured around the binding sites of different TFs. For a each TF, its binding sites, +/- 500bp, were searched for the presence of i) the TF motif and ii) CTCF motif. For each TF, a 2x2 contingency table was created with the number of peaks having i) both motifs, ii) the TF motif only, iii) CTCF motif only and iv) no motif. \textbf{A} Odd ratio (OR) of the exact Fisher test performed on each TF contingency table. The ORs are displayed with their 95\% confidence interval (CI). ORs > 1 - that is, with 1 not part of the 95\%CI - are labeled in green and indicate an association of both motifs more frequent than expected by chance. ORs < 1 are labeled in red and indicate a repulsion of both motifs more frequence than expected by chance. The JunD and cFos dataset ORs are too high to be represented in this plot. \textbf{B} Density of JunD motif occurrence at the absolute distance of different TF binding sites (peak centers) which also have their own motif present (at distance 0). The rows were standardized and aggregated using the Euclidean distance. \textbf{C} Same as in (B) but for TF binding sites that does not have their own motif.}
	\label{suppl_encode_peaks_jund_association}
	\end{center}
	\end{figure}

	\begin{figure}
	\begin{center}
	\includegraphics[scale=0.4]{images/ch_encode_peaks/ebf1_haib_3.png}
	\captionof{figure}{\textbf{EBF1 binding sites} around the dyad of nucleosomes having an occupied EBF1 motif within 100bp (in red) and of all nucleosomes (in blue). The abrupt decrease of EBF1 motif frequency at +/- 100bp reflects the nucleosome selection process.}
	\label{suppl_encode_peaks_ebf1_nucl}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.5]{images/ch_encode_peaks/MA0154_3.png}
	\captionof{figure}{\textbf{EBF1 logo} from JASPAR binding model MA0154.3 \citep{khan_jaspar_2018}.}
	\label{suppl_encode_peaks_ebf1_logo}
	\end{center}
	\end{figure}

	\begin{figure}
	\begin{center}
	\includegraphics[scale=0.4]{images/ch_encode_peaks/ebf1_haib_2.png}
	\captionof{figure}{\textbf{EBF1 binding sites} chromatin features. \textbf{A} Chromatin accessibility around nucleosomes that have an EBF1 binding site within 100bp (red) and all nucleosomes (blue). \textbf{B} H3K4me2 deposition around nucleosomes that have an EBF1 binding site within 100bp (red) and all nucleosomes (blue). \textbf{C} Sequence conservation around nucleosomes that have an EBF1 binding site within 100bp (red) and all nucleosomes (blue).}
	\label{suppl_encode_peaks_ebf1_chrom}
	\end{center}
	\end{figure}

	% ======================================= SPar-K figures ========================================


	% ======================================= SPar-K algos ========================================


	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void SPar-K}%
	\begin{algorithm}[H]
	\label{algo_spark}
	\Fn{\Function{R, K, S, I}}
	{
	\KwData{$R$ a 2D matrix of numericals of dimension $N \times L$; $K$ the number clusters; $S$ the shifting allowed; $I$ the maximum number of iterations.}
	\KwResult{5 vectors of length $N$ with the cluster labels, the data shift values, the reference shift values, the data flip values and the distances to the cluster references.}

	$CLUSTER =$ vector of 0 of size $N$ \;
	$SHIFT\_DAT =$ vector of 0 of size $N$ \;
	$SHIFT\_REF =$ vector of 0 of size $N$ \;
	$FLIP\_DAT =$ vector of 0 of size $N$ \;
	$DISTANCE =$ vector of 0 of size $N$ \;

	\If{smoothing}
	{ smoothOutliers($R$) \; }

	$REF = $ a matrix of $K$ rows and $L$ columns\;
	select $K$ rows of $R$ and copy them in $REF$ using seedingRandom($K$, $R$) or seedingKmeans++($K$, $R$) \;

	$n\_iter = 1$\;
	$converged = false$\;

	\While{$n\_iter != I$ or not $converged$}
	{
	\For{$i=0$ to $N-1$}
	{ $D =$ a vector 0 of size $K$ \;
	$S\_REF =$ a vector 0 of size $K$ \;
	$S\_DAT =$ a vector 0 of size $K$ \;
	$F\_DAT =$ a vector 0 of size $K$ \;
	\For{$k=0$ to $K-1$}
	{ $D_{k},S\_REF_{k},S\_DAT_{k}, F\_DAT_{k} =$ distanceFast($REF_{k,}$, $R{i,}$, $S$) \; }
	$j = $ index of minimum value in $REF$\;
	$CLUSTER_{i} = D_{j}$ \;
	$SHIFT\_REF_{i} = S\_REF_{j}$ \;
	$SHIFT\_DAT_{i} = S\_DAT_{j}$ \;
	$FLIP\_DAT_{i} = F\_DAT_{j}$ \;
	$DISTANCE\_{i} = D_{j}$ \;
	}

	$REF =$ computeClusterReferences($R$, $CLUSTER$, $SHIFT\_REF$, $SHIFT\_DAT$, $FLIP\_DAT$) \;

	\If{$n\_iter > 1$ and at least one value of $CLUSTER$ changed since last iteration}
	{ $converged = false$ \; }
	\Else
	{ $converged = true$ \; }
	$n\_iter = n\_iter + 1$ \;
	}
	\Return{$CLUSTER, SHIFT\_REF, SHIFT\_DAT, FLIP, DISTANCE$}
	}
	\caption{SPar-K algorithm.}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{MATRIX smoothOutliers}%
	\begin{algorithm}[H]
	\label{algo_smooth_outliers}
	\Fn{\Function{MATRIX}}
	{
	\KwData{$MATRIX$ a 2D matrix of numericals of dimensions $N \times L$.}
	\KwResult{a 2D matrix of numericals of dimensions $N \times L$. with outlier smoothed.}

	\ForEach{$ROW$ in MATRIX}
	{ $m$ = mean of $ROW$ \;
	$s$ = standard deviation of $ROW$ \;
	$lower$ = $m$ - 3*$s$ \;
	$upper$ = $m$ + 3*$s$ \;
	\ForEach{$value$ in $ROW$}
	{ \If{$value < lower$ or $value > upper$}
	{ \If{$value$ has a left neighbor $l$ and a right neighbor $r$}
	{ $value$ = ($l$ + $r$) / 2 \; }
	\ElseIf{$value$ has two left neighbors $l1$ and $l2$}
	{ $value$ = ($l1$ + $l2$) / 2 \; }
	\ElseIf{$value$ has two right neighbors $r1$ and $r2$}
	{ $value$ = ($r1$ + $r2$) / 2 \; }
	}
	}
	}
	\Return{$MATRIX$}
	}
	\caption{Smooth the data matrix by removing outliers.}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{d,s\_xf,s\_y,f\_y distanceFast}%
	\begin{algorithm}[H]
	\label{algo_distance_fast}
	\Fn{\Function{X,Y,S}}
	{ \KwData{$X$ and $Y$ two vectors of size $L$ and $S$ the allowed shifting}
	\KwResult{the distance between $X$ and $Y$, the shift of $X$ and $Y$ and whether $Y$ was flipped in the alignment.}
	\tcc{initialize all variables and data structures}
	initialize() \;

	\For{$i=0$ to $n-1$}
	{ $SUM\_X_0 \pluseq X_i $ \;
	$SUM\_X2_0 \pluseq X_i^2$ \;
	$SUM\_Y_0 \pluseq Y_i $ \;
	$SUM\_Y2_0 \pluseq Y_i^2$ \;
	}
	$i = n$ \;
	\For{$s=1$ to $s=S-1$ }
	{ $SUM\_X_{s} = SUM\_X_{s-1} + X_{i-n} + X_{i} $ \;
	$SUM\_X2_{s} = SUM\_X2_{s-1} + X_{i-n}^{2} + X_{i}^{2}$ \;
	$SUM\_Y_{s} = SUM\_Y_{s-1} + Y_{i-n} + Y_{i} $ \;
	$SUM\_Y2_{s} = SUM\_Y2_{s-1} + Y_{i-n}^{2} + Y_{i}^{2}$ \;
	$i \pluseq 1$ \;
	}

	\tcc{compute all distances with $X$ having a shift of 0, $Y$ having a shift of 0 and all the remains ones. These functions can access and modify all variables and data structures in this function.}
	fillFirstRow() \;
	fillFirstColl() \;
	fillRemaining() \;

	\tcc{if more than one, find the minimal distance which also minimize the alignment score}
	$best_d = best\_s\_x = best\_s\_y = best\_f\_y = 0$ \;
	\For{$tripplet$ in $MIN\_DIST$}
	{ $s\_x = tripplet_{0}$ \;
	$s\_y = tripplet_{1}$ \;
	$f\_y = tripplet_{2}$ \;
	$min\_score = \infty$ \;
	\If{$SCORES_{s\_x,s\_y} < min\_score$}
	{ $min\_score = SCORES_{s\_x,s\_y}$ \;
	$best\_s\_x = s\_x$ \;
	$best\_s\_y = s\_y$ \;
	$best\_f\_y = f\_y$ \;
	$best\_d = DIST_{s\_x,s\_y,f\_y}$ \;
	}
	}
	\Return{$best\_d,best\_s\_x,best\_s\_y,best\_f\_y$}

	}
	\caption{Fast algorithm to compute the correlation distance with shift and flip}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{initialize}%
	\begin{algorithm}[H]
	\label{initialize_algo}
	\Fn{\Function{}}
	{

	$l\_half = L / 2$ \;
	$n = L - S + 1$ \;
	$n\_half = n / 2$ \;
	%\tcc{1-cor for each shift of $X$ (on rows) and each shift of $Y$ (on columns)}
	$DIST =$ 3D matrix of 3 of dimensions $SS2$ \;
	%\tcc{the sums of the pairwise products of the $X$ elements for each shift and $Y$ elements for each shift}
	$SUM\_XY =$ 3D matrix of 0 of dimensions $SS2$ \;
	%\tcc{the sums of the $X$ elements for each shift}
	$SUM\_X =$ a vector of 0 of size $S$ \;
	%\tcc{the sums of squares of the $X$ elements for each shift}
	$SUM\_X2 =$ a vector of 0 of size $S$ \;
	%\tcc{same for $Y$}
	$SUM\_Y =$ a vector of 0 of size $S$ \;
	$SUM\_Y2 =$ a vector of 0 of size $S$ \;
	%\tcc{a centrality score for each alignment, a forward and a reverse alignments have the same score}
	$SCORES =$ a 2D matrix of 0 of dimensions $S*S$ \;
	%\tcc{to track minimum distances in $DIST$}
	$MIN\_DIST =$ an empty list of triplets \;
	$min\_dist = 3$ \;
	}
	\caption{A routine of distanceFast() that initializes all the necessary variables. This function can access and modify variables in distanceFast().}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line


	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void fillFirstRow}%
	\begin{algorithm}[H]
	\Fn{\Function{}}
	{
	% \tcc{fill first row of $DIST$, 1-cor with $X$ having a shift of 0}
	$from\_x = 0$ \;
	$to\_x = from\_x + n$ \;

	\For{$i = 0$ to $S-1$}
	{ \tcc{forward orientation}
	$from\_y = i$ \;
	$to\_y = from\_y + n$ \;

	$SUM_XY{from\_x,from\_y,0} = 0$ \;
	\For{$j = 0$ to $n-1$}
	{ $SUM\_XY_{from\_x,from\_y,0} \pluseq X_{from\_x+j} * Y_{from\_y+j}$ \; }

	% \tcc{correlation}
	$c = \frac{n* SUM\_XY_{from\_x,from\_y,0} - SUM\_X_{from\_x} * SUM\_Y_{from\_y}}
	{\sqrt{nSUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} \sqrt{n*SUM\_Y2_{from\_y} - SUM\_Y_{from\_y}^{2}}}$ \;
	\If{division by 0 occurred}
	{ $c = 0$ \; }

	% \tcc{distance}
	$d = 1 - c$ \;
	$DIST_{from\_x,from\_y,0} = d$ \;

	% \tcc{alignment centrality score, this value is the same for forward and reverse alignments}
	$SCORES_{from\_x,from\_y} = \abs{l\_half - (n\_half + from\_x)} +
	\abs{l\_half - (n\_half + from\_y)}$ \;
	\If{$d < min\_dist$}
	{ $min\_distance = d$ \;
	append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \;
	}
	\ElseIf{$d = min\_dist$}
	{ append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \; }

	\tcc{reverse orientation}
	$SUM_XY{from\_x,from\_y,1} = 0$ \;
	\For{$j = 0$ to $n-1$}
	{ $SUM\_XY_{from\_x,from\_y,1} \pluseq X_{from\_x+j} * Y_{to\_y-j-1}$ \; }

	% \tcc{correlation}
	$c = \frac{n* SUM\_XY_{from\_x,from\_y,1} - SUM\_X_{from\_x} * SUM\_Y_{from\_y}}
	{\sqrt{nSUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} \sqrt{n*SUM\_Y2_{from\_y} - SUM\_Y_{from\_y}^{2}}}$ \;
	\If{division by 0 occurred}
	{ $c = 0$ \; }

	% \tcc{distance}
	$d = 1 - c$ \;
	$DIST_{from\_x,from\_y,1} = d$ \;

	\If{$d < min\_dist$}
	{ $min\_distance = d$ \;
	append a triplet $(from\_x,from\_y, 1)$ to $MIN\_DIST$ \;
	}
	\ElseIf{$d = min\_dist$}
	{ append a triplet $(from\_x,from\_y, 1)$ to $MIN\_DIST$ \; }
	}
	}
	\caption{A routine of distanceFast() computing all distances with $X$ having a shift of 0. This function can access and modify all variables declared in distanceFast().}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void fillFirstCol}%
	\begin{algorithm}[H]
	\Fn{\Function{}}
	{ % \tcc{fill first row of $DIST$, 1-cor with $X$ having a shift of 0}
	$from\_y = 0$ ; $to\_y = from\_y + n$ \;
	$from\_y\_rev = s - from\_y - 1$ \;
	$to\_y\_rev = from\_y\_rev + n$ \;

	\For{$i=0$ to $S-1$}
	{ $from\_x = i$ \;
	$to\_x = from\_x + n$ \;

	\tcc{forward orientation}
	$SUM\_XY_{from\_x,from\_y,0} = 0 $ \;
	\For{$j=0$ to $n-1$}
	{ $SUM\_XY{from\_x,from\_y,0} \pluseq X_{from\_x+j}*Y_{from\_y+j} $ \; }

	$c = \frac{nSUM\_XY_{from\_x,from\_y,0} - SUM\_X_{from\_x}SUM\_Y_{from\_y}}
	{\sqrt{nSUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} \sqrt{n*SUM\_Y2_{from\_y} - SUM\_Y_{from\_y}^{2}}}$ \;
	\If{division by 0 occurred}
	{ $c = 0$ \; }

	% \tcc{distance}
	$d = 1 - c$ \;
	$DIST_{from\_x,from\_y,0} = d$ \;

	% \tcc{alignment centrality score, this value is the same for forward and reverse alignments}
	$SCORES_{from\_x,from\_y} = \abs{l\_half - (n\_half + from\_x)} +
	\abs{l\_half - (n\_half + from\_y)}$ \;
	\If{$d < min\_dist$}
	{ $min\_distance = d$ \;
	append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \;
	}
	\ElseIf{$d = min\_dist$}
	{ append a triplet $(from\_x,from\_y, 0)$ to $MIN\_DIST$ \; }

	\tcc{reverse orientation}
	$SUM\_XY_{from\_x,S-1,1} = 0$ \;
	\For{$j=0$ to $n-1$}
	{ $SUM\_XY_{from\_x,S-1,1} \pluseq X_{from\_x+j} * Y_{to\_y\_rev-1-j}$ \; }

	$c = \frac{nSUM\_XY_{from\_x,S-1,1} - SUM\_X_{from\_x}SUM\_Y_{S-1}}
	{\sqrt{nSUM\_X2_{from\_x} - SUM\_X_{from\_x}^{2}} \sqrt{n*SUM\_Y2_{S-1} - SUM\_Y_{S-1}^{2}}}$ \;
	\If{division by 0 occurred}
	{ $c = 0$ \; }

	% \tcc{distance}
	$d = 1 - c$ \;
	$DIST_{from\_x,from\_y,1} = d$ \;

	\If{$d < min\_dist$}
	{ $min\_distance = d$ \;
	append a triplet $(from\_x,S-1, 1)$ to $MIN\_DIST$ \;
	}
	\ElseIf{$d = min\_dist$}
	{ append a triplet $(from\_x,S-1, 1)$ to $MIN\_DIST$ \; }
	}
	}
	\caption{A routine of distanceFast() computing all distances with $Y$ having a shift of 0. This function is can access and modify all variables declared in distanceFast().}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{void fillRemaining}%
	\begin{algorithm}[H]
	\Fn{\Function{}}
	{ \For{$i=1$ to $S-1$}
	{ $from\_x = i$ ; $to\_x = from\_x + n $ \;
	\For{$j=1$ to $S-1$}
	{ $from\_y = j$ \;
	$to\_y = from\_y + n$ \;

	\tcc{forward orientation}
	$SUM\_XY_{from\_x,from\_y,0} = SUM\_XY_{from\_x-1,from\_y-1,1} - X_{from\_x-1} * Y_{from\_y-1} + X_{to\_x-1}*Y{to\_y-1}$ \;
	$c = \frac{nSUM\_XY{from\_x,from\_y,0} - SUM\_X{from\_x}SUM\_Y{from\_y}}
	{\sqrt{nSUM\_X2_{from\_x} - SUM\_X_{from\_x}^2} \sqrt{n*SUM\_Y2_{from\_Y} - SUM\_Y_{from\_Y}^2}}$ \;

	\If{division by 0 occurred}
	{ $c = 0$ \; }

	$d = 1 - c$ \;
	$DIST_{from\_x,from\_y,0} = d$ \;

	%\tcc{alignment centrality score, this value is the same for forward and reverse alignments}
	$SCORES_{from\_x,from\_y} = \abs{l\_half - (n\_half + from\_x)} +
	\abs{l\_half - (n\_half + from\_y)}$ \;

	\If{$d < min\_dist$}
	{ $min\_distance = d$ \;
	append a triplet $(from\_x,from\_y,0)$ to $MIN\_DIST$ \;
	}
	\ElseIf{$d = min\_dist$}
	{ append a triplet $(from\_x,from\_y,0)$ to $MIN\_DIST$ \; }

	\tcc{reverse orientation}
	$j\_rev = S - j - 1$ \;
	$from\_y\_rev = j\_rev$ \;
	$to\_y\_rev = from\_y\_rev + n$ \;
	$SUM\_XY_{from\_x,from\_y\_rev,1} = SUM\_XY_{from\_x-1,from\_y\_rev+1,1} - X_{from\_x-1}Y_{to\_y\_rev} + X_{to\_x-1}Y{from\_y\_rev}$ \;

	$c = \frac{nSUM\_XY_{from\_x,from\_y\_rev,1} - SUM\_X_{from\_x}SUM\_Y_{from\_y\_rev}}
	{\sqrt{nSUM\_X2_{from\_x} - SUM\_X_{from\_x}^2} \sqrt{n*SUM\_Y2_{from\_y\_rev} - SUM\_Y_{from\_y_rev}^2}}$ \;

	\If{division by 0 occurred}
	{ $c = 0$ \; }

	$d = 1 - c$ \;
	$DIST_{from\_x,from\_y,1} = d$ \;

	\If{$d < min\_dist$}
	{ $min\_distance = d$ \;
	append a triplet $(from\_x,from\_y\_rev,1)$ to $MIN\_DIST$ \;
	}
	\ElseIf{$d = min\_dist$}
	{ append a triplet $(from\_x,from\_y\_rev,1)$ to $MIN\_DIST$ \; }
	}
	}
	}
	\caption{A routine of distanceFast() computing all remaining distances between $X$ and $Y$. This function can access and modify all variables declared in distanceFast().}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{REF seedingRandom}%
	\begin{algorithm}[H]
	\label{algo_seed_random}
	\Fn{\Function{K, R}}
	{ \KwData{$K$ the number references to initialize from $R$, a 2D matrix of numericals of dimensions $N \times L$.}
	\KwResult{$REF$ a 2D matrix of numericals of dimensions $K \times L$.}

	$REF = $ a 2D matrix of dimensions $K*L$ \;

	\For{$i=0$ to $K-1$}
	{ $j = $ sample a number from $0$ to $N-1$ with uniform probabilities \;
	\While{$j$ has already been sampled before}
	{ $j = $ sample a number from $0$ to $N-1$ with uniform probabilities \; }
	$REF_{i,} = R_{j,}$ \;
	}
	\Return{$REF$}
	}
	\caption{Random seeding algorithm}
	\end{algorithm}

	\vspace{\baselineskip}% Insert a blank line

	\SetKwProg{Fn}{}{\{}{}\SetKwFunction{Function}{REF seedingKmeans++}%
	\begin{algorithm}[H]
	\label{algo_seed_kmeans++}
	\Fn{\Function{K, R, S}}
	{ \KwData{$K$ the number references to initialize from $R$, a 2D matrix of numericals of dimensions $N \times L$, allowing a shifting of $S$.}
	\KwResult{$REF$ a 2D matrix of numericals of dimensions $K \times L$.}

	$REF = $ a 2D matrix of dimensions $K*L$ \;
	$DIST\_TO\_CLOSEST = $ a vector of size $N$\;

	\tcc{choose first reference, a standard deviation equals to 0 should be avoided otherwise computing a distance against this reference will always be equal to 1 (see how distance are computed).}
	$j = $ sample a number from $0$ to $N-1$ with uniform probabilities but for which $DATA_{j,}$ has a standard deviation different from $0$ \;
	$REF_{0,} = DATA_{j,}$ \;

	\tcc{choose next references}
	\For{$k=1$ to $K-1$}
	{ \For{$i=0$ to $N-1$}
	{ \tcc{distance to all current references for this region}
	$DIST\_TO\_CENTERS =$ a vector of size $k$ ;
	\For{$j=0$ to $k-1$}
	{ $DIST\_TO\_CENTERS_{j,} =$ distanceFast($REF_{j,}$, $R_{i,}$, $S$) \; }
	\tcc{distance to closest current reference, for this region}
	$DIST\_TO\_CLOSEST{i,} =$ minimum value of $DIST\_TO\_CENTERS$\;
	}

	$j =$ sample a number from 0 to $N-1$ using $DIST\_TO\_CLOSEST$ as the probability/weight distribution for each point to be sample. \;
	\While{$j$ has already been sampled before}
	{ $j =$ sample a number from 0 to $N-1$ using $DIST\_TO\_CLOSEST$ as the probability/weight distribution for each point to be sample. \; }
	$REF_{k,} = R{j,}$ \;
	}
	\Return{REF}
	}
	\caption{Kmeans++ seeding algorithm.}
	\end{algorithm}



	% ======================================= ATAC-seq figures ========================================

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.3]{images/ch_atac-seq/sp1_motifs_6class_noshift_flip.png}
	\captionof{figure}{\textbf{Open chromatin classes around SP1 motifs :} EMRead was run without shifing (+/- 10bp) but with flipping to identify different classes of footprints around 15'883 SP1 motifs. The aggregation signal around the 6 different classes found are shown by decreasing class probability. The open chromatin patterns are displayed in red, the nucleosomes are displayed in blue. The aggregated DNA sequence is displayed as a logo. The y-axis ranges from the minimum to the maximum signal observed. For the DNA logo, this corresponds to 0 and 2 bits respectively.}
	\label{suppl_emread_sp1_noshift_flip}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.3]{images/ch_atac-seq/sp1_motifs_6class_shift_flip.png}
	\captionof{figure}{\textbf{Open chromatin classes around SP1 motifs :} EMRead was run with shifing (+/- 10bp) flipping to identify different classes of footprints around 15'883 SP1 motifs. The aggregation signal around the 6 different classes found are shown by decreasing class probability. The open chromatin patterns are displayed in red, the nucleosomes are displayed in blue. The aggregated DNA sequence is displayed as a logo. The y-axis ranges from the minimum to the maximum signal observed. For the DNA logo, this corresponds to 0 and 2 bits respectively.}
	\label{suppl_emread_sp1_shift_flip}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.4]{images/ch_atac-seq/simulated_sequences_2class_flip_best_motifs.png}
	\captionof{figure}{\textbf{Simulated data motifs :} motifs used for the data generation (labeled "True motif") and the best scoring - based on the AUC - partition motifs (labeled "Found motif"). The partition with EMSequence was run such that it was searching for motifs of 11bp, slightly longer than those used for the data generation. "RC" stands for reverse complement. The motifs tree and alignment was build using the motifStack R package \citep{ou_motifstack_2018}.}
	\label{suppl_atac_seq_emseq_best_motifs}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.35]{images/ch_atac-seq/sp1_motifs_10class.png}
	\captionof{figure}{\textbf{SP1 motifs :} partition of 15'883 801bp sequences centered on a SP1 binding site using EMSequence. These sequences were classified by EMSequence to search for 10 different 30bp long motifs ($801 - 30 = 771$ of shifting freedom). The optimization was run for 20 iterations. The different classes are ordered by decreasing overall probability. Arrows atop of the motifs indicates head-to-tail arrangements of SP1 motifs.}
	\label{suppl_emseq_sp1_10class}
	\end{center}
	\end{figure}

	\begin{figure}[H]
	\begin{center}
	\includegraphics[scale=0.35]{images/ch_atac-seq/sp1_motifs_10class.png}
	\captionof{figure}{\textbf{SP1 motifs :} partition of 15'883 801bp sequences centered on a SP1 binding site using EMSequence. These sequences were classified by EMSequence to search for 10 different 30bp long motifs ($801 - 30 = 771$ of shifting freedom). The optimization was run for 20 iterations. The different classes are ordered by decreasing overall probability. Arrows atop of the motifs indicates head-to-tail arrangements of SP1 motifs.}
	\label{suppl_emseq_sp1_10class}
	\end{center}
	\end{figure}

	\begin{figure}
	\begin{center}
	\includegraphics[scale=0.30]{images/ch_atac-seq/peaks_rmsk_sampled_sequences_23class.png}
	\captionof{figure}{\textbf{Extended sequence and chromatin models} found in 10'000 monocytes regulatory regions. The displayed logos correspond to each class sequence aggregation. The corresponding chromatin accessibility (red) and nucleosome occupancy (blue) are displayed atop of the logos. The classes are displayed by overall decreasing probability. A zoom over the central part of each class aggregation is shown in the top right inlet.}
	\label{suppl_atac_seq_23class}
	\end{center}
	\end{figure}

	\begin{figure}
	\begin{center}
	\includegraphics[scale=0.30]{images/ch_atac-seq/data_classPU1_2class.png}
	\captionof{figure}{\textbf{PU.1 sub-classes} obtained by extracting PU.1 class data and subjecting them to a ChIPPartitioning classification into 2 classes. The displayed logos correspond to each class sequence aggregation. The corresponding chromatin accessibility (red) and nucleosome occupancy (blue) are displayed atop of the logos. The classes are displayed by overall decreasing probability. A zoom over the central part of each class aggregation is shown in the top right inlet.}
	\label{suppl_atac_seq_pu1_subclass}
	\end{center}
	\end{figure}

	\begin{figure}
	\begin{center}
	\includegraphics[scale=0.30]{images/ch_atac-seq/data_classjun_3class.png}
	\captionof{figure}{\textbf{AP1 sub-classes} obtained by extracting AP1 class data and subjecting them to a ChIPPartitioning classification into 3 classes. The displayed logos correspond to each class sequence aggregation. The corresponding chromatin accessibility (red) and nucleosome occupancy (blue) are displayed atop of the logos. The classes are displayed by overall decreasing probability. A zoom over the central part of each class aggregation is shown in the top right inlet.}
	\label{suppl_atac_seq_ap1_subclass}
	\end{center}
	\end{figure}

appendix.texNo OneTemporaryActions

File Metadata

appendix.texView Options

Event Timeline

appendix.tex
No OneTemporary
Actions

appendix.tex
View Options