Page MenuHomec4science

ch_spark.aux
No OneTemporary

File Metadata

Created
Fri, May 10, 10:49

ch_spark.aux

\relax
\providecommand\hyper@newdestlabel[2]{}
\citation{groux_spar-k:_2019}
\citation{hon_chromasig:_2008}
\citation{lai_archalign:_2010}
\citation{nielsen_catchprofiles}
\citation{kundaje_ubiquitous_2012}
\citation{nair_probabilistic_2014}
\@writefile{toc}{\contentsline {chapter}{\numberline {4}SPar-K}{57}{chapter.4}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
\@writefile{loa}{\addvspace {10\p@ }}
\newlabel{spark}{{4}{57}{SPar-K}{chapter.4}{}}
\@writefile{chapter}{\contentsline {toc}{SPar-K}{57}{chapter.4}}
\@writefile{toc}{\contentsline {section}{\numberline {4.1}Algorithm}{57}{section.4.1}}
\citation{arthur_k-means++:_2007}
\citation{groux_spar-k:_2019}
\citation{groux_spar-k:_2019}
\citation{groux_spar-k:_2019}
\citation{groux_spar-k:_2019}
\citation{groux_spar-k:_2019}
\citation{groux_spar-k:_2019}
\@writefile{toc}{\contentsline {section}{\numberline {4.2}Implementation}{58}{section.4.2}}
\citation{leisch_toolbox_2006}
\citation{nair_probabilistic_2014}
\citation{nair_probabilistic_2014}
\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Synthethic datasets : \textbf {A} The class signal densities. \textbf {B} A synthetic dataset with a mean coverage of a 100 reads per region in average ($c$=100) and 0\% noise ($p_{s}$=1, $p_{b}$=0) and \textbf {C} one of the corresponding SPar-K partition, with shifting and flipping. The color ribbons on the side indicate the cluster assignments. \textbf {D} A synthetic dataset with a mean coverage of a 100 reads per region in average ($c$=100) and 90\% noise ($p_{s}$=0.1, $p_{b}$=0.9) and \textbf {E} one of the corresponding SPar-K partition, with shifting and flipping.\relax }}{59}{figure.caption.25}}
\newlabel{spark_simulated_data}{{4.1}{59}{Synthethic datasets : \textbf {A} The class signal densities. \textbf {B} A synthetic dataset with a mean coverage of a 100 reads per region in average ($c$=100) and 0\% noise ($p_{s}$=1, $p_{b}$=0) and \textbf {C} one of the corresponding SPar-K partition, with shifting and flipping. The color ribbons on the side indicate the cluster assignments. \textbf {D} A synthetic dataset with a mean coverage of a 100 reads per region in average ($c$=100) and 90\% noise ($p_{s}$=0.1, $p_{b}$=0.9) and \textbf {E} one of the corresponding SPar-K partition, with shifting and flipping.\relax }{figure.caption.25}{}}
\@writefile{toc}{\contentsline {section}{\numberline {4.3}Benchmarking}{59}{section.4.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3.1}K-means}{59}{subsection.4.3.1}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces \textbf {Clustering accuracy using random seeding :} to compare the clustering accuracies of the different methods, several simulated dataset containing 3 classes, different coverages (10, 50 and 100 reads per region indicated as "cov10", "cov50" and "cov100") and noise proportions (no noise, 10\% noise, 50\% noise and 90\% noise indicated as "0.0", "0.1", "0.5" and "0.9") were generated. Each dataset was clustered 50 times with each method. The Adjusted Rand Index (ARI) was computed for each partition. The ARI values are displayed as boxplots. SPar-K and ChIPPartitioning were run allowing flipping and shifting. The ARI was measured on each of the resulting data partitions. For SPar-K, "smooth" indicates outlier smoothing. For the regular K-means, "eucl." and "corr." refer to the euclidean and correlation distances. "R" stands for "random" and indicates the ARI values obtained when comparing the true cluster labels with a randomly shuffled version of it, 100 times. Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }}{60}{figure.caption.26}}
\newlabel{spark_ari}{{4.2}{60}{\textbf {Clustering accuracy using random seeding :} to compare the clustering accuracies of the different methods, several simulated dataset containing 3 classes, different coverages (10, 50 and 100 reads per region indicated as "cov10", "cov50" and "cov100") and noise proportions (no noise, 10\% noise, 50\% noise and 90\% noise indicated as "0.0", "0.1", "0.5" and "0.9") were generated. Each dataset was clustered 50 times with each method. The Adjusted Rand Index (ARI) was computed for each partition. The ARI values are displayed as boxplots. SPar-K and ChIPPartitioning were run allowing flipping and shifting. The ARI was measured on each of the resulting data partitions. For SPar-K, "smooth" indicates outlier smoothing. For the regular K-means, "eucl." and "corr." refer to the euclidean and correlation distances. "R" stands for "random" and indicates the ARI values obtained when comparing the true cluster labels with a randomly shuffled version of it, 100 times. Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }{figure.caption.26}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces \textbf {Median SSE :} for the simulated ChIP-seq dataset containing 3 classes, with coverage 100 and no noise, partitioned into 2 to 5 clusters. To judge whether the elbow method could be used to estimate the optimal number of clusters, this dataset was partitioned with SPar-K, allowing flip and shifting, into 2 to 5 clusters, 50 times for each set of parameters. For each number of clusters, the median SSE is shown, +/- 1 standard deviation (bars). \textbf {A} Seeding done at random, \textbf {B} seeding done at random and outlier smoothing \textbf {C} seeding done with the K-means++ method \textbf {D} seeding done with the K-means++ method and outlier smoothing. In all cases, the optimal number of clusters seemed to be 3 (which was the expected value). Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }}{61}{figure.caption.27}}
\newlabel{spark_sse}{{4.3}{61}{\textbf {Median SSE :} for the simulated ChIP-seq dataset containing 3 classes, with coverage 100 and no noise, partitioned into 2 to 5 clusters. To judge whether the elbow method could be used to estimate the optimal number of clusters, this dataset was partitioned with SPar-K, allowing flip and shifting, into 2 to 5 clusters, 50 times for each set of parameters. For each number of clusters, the median SSE is shown, +/- 1 standard deviation (bars). \textbf {A} Seeding done at random, \textbf {B} seeding done at random and outlier smoothing \textbf {C} seeding done with the K-means++ method \textbf {D} seeding done with the K-means++ method and outlier smoothing. In all cases, the optimal number of clusters seemed to be 3 (which was the expected value). Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }{figure.caption.27}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces \textbf {Running times :} to compare the run times of each program, the synthetic dataset with coverage 100 and no noise was partitioned 20 times with each program. The run times (wall clock) in second were measured. For all SPar-K and the regular K-means, the partitions were initialized using a random and K-means++ (indicated as "k++"). For ChIPPartitioning, only a random seeding was used. The partitions were then optimized for 30 iterations at most. For SPar-K and ChIPPartitioning, a shifting of 71 bins and flipping were allowed. For SPar-K, only one thread was used and "smooth" indicates outlier smoothing. For the regular K-means, "eucl." and "corr." refer to the euclidean and correlation distances. Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }}{61}{figure.caption.28}}
\newlabel{spark_time}{{4.4}{61}{\textbf {Running times :} to compare the run times of each program, the synthetic dataset with coverage 100 and no noise was partitioned 20 times with each program. The run times (wall clock) in second were measured. For all SPar-K and the regular K-means, the partitions were initialized using a random and K-means++ (indicated as "k++"). For ChIPPartitioning, only a random seeding was used. The partitions were then optimized for 30 iterations at most. For SPar-K and ChIPPartitioning, a shifting of 71 bins and flipping were allowed. For SPar-K, only one thread was used and "smooth" indicates outlier smoothing. For the regular K-means, "eucl." and "corr." refer to the euclidean and correlation distances. Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }{figure.caption.28}{}}
\citation{groux_spar-k:_2019}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3.2}ChIPPartitioning}{62}{subsection.4.3.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3.3}Data}{62}{subsection.4.3.3}}
\citation{ambrosini_chip-seq_2016}
\citation{ambrosini_chip-seq_2016}
\citation{groux_spar-k:_2019}
\citation{groux_spar-k:_2019}
\citation{bailey_meme_2009}
\citation{kundaje_ubiquitous_2012}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3.4}Performances}{63}{subsection.4.3.4}}
\@writefile{toc}{\contentsline {section}{\numberline {4.4}Partition of DNase and MNase data}{63}{section.4.4}}
\@writefile{toc}{\contentsline {section}{\numberline {4.5}Conclusions}{63}{section.4.5}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Nucleosome occupancy, determined by MNase-seq, in bins of 10bp, +/- 1000bp around 79'957 CTCF binding sites in GM12878 cells. \textbf {A} MNaseI-seq read density around the CTCF binding sites. ChIP-seq peak summits are aligned at position 0. The regions (rows) are ordered according the their resemblance (correlation) to the overall aggregation pattern. \textbf {B} SPar-K data partition. The number of clusters (4) was determined using the elbow method. The cluster labels are indicated by the color ribbons on the left. Within each cluster, the data have been realigned according to the shift and flip informations returned by SPar-K and the regions have been ordered according the their resemblance (correlation) to the cluster aggregation pattern. Because of the realignment, ChIP-seq peak summits are not anymore aligned at position 0. \textbf {C} Corresponding DNaseI hypersensitivity measured by DNaseI-seq at the same loci and realigned as in B. \textbf {D} CTCF motif occurrences predicted using a motif scan, at the same loci and realigned as in B. Each predicted binding site, +/- 1kb around a peak, is represented as a point. \textbf {E} Transcription start site (TSS) density at the same loci and realigned as in B. \textbf {F} Cluster 1 (red) aggregation profiles. The original peak coordinates were modified accordingly to the shift and flip values returned by SPar-K and the read densities the different data types were measured using ChIP-Cor \citep {ambrosini_chip-seq_2016}. For the TSSs and the transcription initiation (CAGE), only the data mapping on the negative strand were used to monitor transcription firing towards the nucleosome array (towards the left). \textbf {G} Proportions of regions having at least one CTCF motif +/- 1kb (same motifs as in D), for each cluster. \textbf {H} Proportions of regions having at least one TSS +/- 1kb (same TSSs as in E), for each cluster.\relax }}{64}{figure.caption.29}}
\newlabel{spark_ctcf}{{4.5}{64}{Nucleosome occupancy, determined by MNase-seq, in bins of 10bp, +/- 1000bp around 79'957 CTCF binding sites in GM12878 cells. \textbf {A} MNaseI-seq read density around the CTCF binding sites. ChIP-seq peak summits are aligned at position 0. The regions (rows) are ordered according the their resemblance (correlation) to the overall aggregation pattern. \textbf {B} SPar-K data partition. The number of clusters (4) was determined using the elbow method. The cluster labels are indicated by the color ribbons on the left. Within each cluster, the data have been realigned according to the shift and flip informations returned by SPar-K and the regions have been ordered according the their resemblance (correlation) to the cluster aggregation pattern. Because of the realignment, ChIP-seq peak summits are not anymore aligned at position 0. \textbf {C} Corresponding DNaseI hypersensitivity measured by DNaseI-seq at the same loci and realigned as in B. \textbf {D} CTCF motif occurrences predicted using a motif scan, at the same loci and realigned as in B. Each predicted binding site, +/- 1kb around a peak, is represented as a point. \textbf {E} Transcription start site (TSS) density at the same loci and realigned as in B. \textbf {F} Cluster 1 (red) aggregation profiles. The original peak coordinates were modified accordingly to the shift and flip values returned by SPar-K and the read densities the different data types were measured using ChIP-Cor \citep {ambrosini_chip-seq_2016}. For the TSSs and the transcription initiation (CAGE), only the data mapping on the negative strand were used to monitor transcription firing towards the nucleosome array (towards the left). \textbf {G} Proportions of regions having at least one CTCF motif +/- 1kb (same motifs as in D), for each cluster. \textbf {H} Proportions of regions having at least one TSS +/- 1kb (same TSSs as in E), for each cluster.\relax }{figure.caption.29}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Partitioning of DNaseI hypersensitivity profiles around SP1 binding sites in K562 cells. The optimal number of clusters was determined using the elbow method. \textbf {A.} Input data based on peak summits provided by ENCODE. \textbf {B.} Same regions clustered, re-aligned and oriented by SPar-K. Clusters 1, 2 and 3 are indicated by colored bars in red, blue, and green, respectively. \textbf {C.} MNase-seq read densities for the same regions, ordered, aligned and oriented as in B. \textbf {D.} Predicted SP1 binding motifs for the same regions, ordered, aligned and oriented as in B. \textbf {E.} Proportion of binding sites within each cluster having a confirmed promoter-associated TSS within +/- 300bp. \textbf {F.} Aggregations profiles for DNase-seq (red), MNase-seq (blue), promoter TSS (green) and CAGE-seq data (violet) for cluster 2 (aligned and oriented as in B). \textbf {G.} Motifs found by MEME-ChIP and Tomtom in the narrow footprints of each cluster. (*) known SP1 interactor, (c) central enrichment. Cluster 2 left and right refer to the left and right footprints seen in \textbf {B}. Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }}{65}{figure.caption.30}}
\newlabel{spark_dnase}{{4.6}{65}{Partitioning of DNaseI hypersensitivity profiles around SP1 binding sites in K562 cells. The optimal number of clusters was determined using the elbow method. \textbf {A.} Input data based on peak summits provided by ENCODE. \textbf {B.} Same regions clustered, re-aligned and oriented by SPar-K. Clusters 1, 2 and 3 are indicated by colored bars in red, blue, and green, respectively. \textbf {C.} MNase-seq read densities for the same regions, ordered, aligned and oriented as in B. \textbf {D.} Predicted SP1 binding motifs for the same regions, ordered, aligned and oriented as in B. \textbf {E.} Proportion of binding sites within each cluster having a confirmed promoter-associated TSS within +/- 300bp. \textbf {F.} Aggregations profiles for DNase-seq (red), MNase-seq (blue), promoter TSS (green) and CAGE-seq data (violet) for cluster 2 (aligned and oriented as in B). \textbf {G.} Motifs found by MEME-ChIP and Tomtom in the narrow footprints of each cluster. (*) known SP1 interactor, (c) central enrichment. Cluster 2 left and right refer to the left and right footprints seen in \textbf {B}. Figure and legend taken and adapted from \citep {groux_spar-k:_2019}.\relax }{figure.caption.30}{}}
\@setckpt{main/ch_spark}{
\setcounter{page}{66}
\setcounter{equation}{1}
\setcounter{enumi}{8}
\setcounter{enumii}{0}
\setcounter{enumiii}{0}
\setcounter{enumiv}{0}
\setcounter{footnote}{0}
\setcounter{mpfootnote}{0}
\setcounter{part}{0}
\setcounter{chapter}{4}
\setcounter{section}{5}
\setcounter{subsection}{0}
\setcounter{subsubsection}{0}
\setcounter{paragraph}{0}
\setcounter{subparagraph}{0}
\setcounter{figure}{6}
\setcounter{table}{0}
\setcounter{NAT@ctr}{0}
\setcounter{FBcaption@count}{0}
\setcounter{ContinuedFloat}{0}
\setcounter{KVtest}{0}
\setcounter{subfigure}{0}
\setcounter{subfigure@save}{0}
\setcounter{lofdepth}{1}
\setcounter{subtable}{0}
\setcounter{subtable@save}{0}
\setcounter{lotdepth}{1}
\setcounter{lips@count}{2}
\setcounter{lstnumber}{1}
\setcounter{Item}{8}
\setcounter{Hfootnote}{0}
\setcounter{bookmark@seq@number}{0}
\setcounter{AM@survey}{0}
\setcounter{ttlp@side}{0}
\setcounter{myparts}{0}
\setcounter{parentequation}{0}
\setcounter{AlgoLine}{28}
\setcounter{algocfline}{1}
\setcounter{algocfproc}{1}
\setcounter{algocf}{1}
\setcounter{float@type}{8}
\setcounter{nlinenum}{0}
\setcounter{lstlisting}{0}
\setcounter{section@level}{0}
}

Event Timeline