diff --git a/main/ch_pwmscan.aux b/main/ch_pwmscan.aux index a365724..ae0c159 100644 --- a/main/ch_pwmscan.aux +++ b/main/ch_pwmscan.aux @@ -1,91 +1,91 @@ \relax \providecommand\hyper@newdestlabel[2]{} \citation{ambrosini_pwmscan:_2018} \@writefile{toc}{\contentsline {chapter}{\numberline {6}PWMScan}{77}{chapter.6}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} \@writefile{loa}{\addvspace {10\p@ }} \newlabel{pwmscan}{{6}{77}{PWMScan}{chapter.6}{}} \@writefile{chapter}{\contentsline {toc}{PWMScan}{77}{chapter.6}} \@writefile{toc}{\contentsline {section}{\numberline {6.1}Algorithms}{77}{section.6.1}} \citation{pizzi_fast_2008} \citation{pizzi_fast_2008} \citation{langmead_ultrafast_2009} \@writefile{toc}{\contentsline {subsection}{\numberline {6.1.1}Scanner algorithm}{78}{subsection.6.1.1}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.1.2}Matches enumeration and mapping}{78}{subsection.6.1.2}} \citation{ambrosini_pwmscan:_2018} \citation{ambrosini_pwmscan:_2018} \citation{langmead_ultrafast_2009} \citation{bailey_meme_2009} \citation{ambrosini_pwmscan:_2018} \@writefile{toc}{\contentsline {section}{\numberline {6.2}PMWScan architecture}{79}{section.6.2}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces \textbf {PWMScan workflow :} the input is composed of a PWM and a score threshold specifying the minimum score for a sequence to achieved to be considered as a match. Letter probability matrices or count matrices are also accepted and are converted into PWMs. The score threshold can also be given as a p-value or a percentage of the maximum score, in which case it is converted into a threshold score. Based on the length of the PWM, Bowtie or pwm\_scan can be used to find the matches on the genome. If Bowtie is used, the set of k-mers achieving a better score than the threshold score is computed using branch-and-bound algorithm (mba) and mapped on the genome. On the other hand, if matrix\_scan is used, the PWM is used to score every possible sub-sequence in the genome. The regions corresponding to the sequences achieving a score at least as good as the threshold score are then returned under BED format. Figure and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }}{80}{figure.caption.32}} -\newlabel{lab_resources_pwmscan_pipeline}{{6.1}{80}{\textbf {PWMScan workflow :} the input is composed of a PWM and a score threshold specifying the minimum score for a sequence to achieved to be considered as a match. Letter probability matrices or count matrices are also accepted and are converted into PWMs. The score threshold can also be given as a p-value or a percentage of the maximum score, in which case it is converted into a threshold score. Based on the length of the PWM, Bowtie or pwm\_scan can be used to find the matches on the genome. If Bowtie is used, the set of k-mers achieving a better score than the threshold score is computed using branch-and-bound algorithm (mba) and mapped on the genome. On the other hand, if matrix\_scan is used, the PWM is used to score every possible sub-sequence in the genome. The regions corresponding to the sequences achieving a score at least as good as the threshold score are then returned under BED format. Figure and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }{figure.caption.32}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces \textbf {PWMScan workflow :} the input is composed of a PWM and a score threshold specifying the minimum score for a sequence to achieved to be considered as a match. LPMs or LFMs are also accepted and are converted into PWMs. The score threshold can also be given as a p-value or a percentage of the maximum score, in which case it is converted into a threshold score. Based on the length of the PWM, Bowtie or pwm\_scan can be used to find the matches on the genome. If Bowtie is used, the set of k-mers achieving a better score than the threshold score is computed using branch-and-bound algorithm (mba) and mapped on the genome. On the other hand, if matrix\_scan is used, the PWM is used to score every possible sub-sequence in the genome. The regions corresponding to the sequences achieving a score at least as good as the threshold score are then returned under BED format. Figure and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }}{80}{figure.caption.32}} +\newlabel{lab_resources_pwmscan_pipeline}{{6.1}{80}{\textbf {PWMScan workflow :} the input is composed of a PWM and a score threshold specifying the minimum score for a sequence to achieved to be considered as a match. LPMs or LFMs are also accepted and are converted into PWMs. The score threshold can also be given as a p-value or a percentage of the maximum score, in which case it is converted into a threshold score. Based on the length of the PWM, Bowtie or pwm\_scan can be used to find the matches on the genome. If Bowtie is used, the set of k-mers achieving a better score than the threshold score is computed using branch-and-bound algorithm (mba) and mapped on the genome. On the other hand, if matrix\_scan is used, the PWM is used to score every possible sub-sequence in the genome. The regions corresponding to the sequences achieving a score at least as good as the threshold score are then returned under BED format. Figure and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }{figure.caption.32}{}} \citation{ambrosini_chip-seq_2016} \citation{ambrosini_signal_2003} \citation{ambrosini_pwmscan:_2018} \citation{ambrosini_pwmscan:_2018} \citation{ambrosini_pwmscan:_2018} \citation{ambrosini_pwmscan:_2018} \citation{hertz_identification_1990} \citation{beckstette_fast_2006} \citation{turatsinze_using_2008} \citation{heinz_simple_2010} \citation{grant_fimo:_2011} \citation{schones_statistical_2007} \@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces \textbf {Benchmark :} PWMScan speed performances were measured and compared with 6 other well known genome scanners. In all cases, the h19 genome sequence was scanned with a 19bp CTCF matrix and a 11bp STAT1 matrix, 10 times. The run times are represented as boxplots. For PWMScan, both pwm\_scan and Bowtie strategies were run. Figure and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }}{81}{figure.caption.33}} \newlabel{lab_resources_pwmscan_benchmark}{{6.2}{81}{\textbf {Benchmark :} PWMScan speed performances were measured and compared with 6 other well known genome scanners. In all cases, the h19 genome sequence was scanned with a 19bp CTCF matrix and a 11bp STAT1 matrix, 10 times. The run times are represented as boxplots. For PWMScan, both pwm\_scan and Bowtie strategies were run. Figure and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }{figure.caption.33}{}} \@writefile{toc}{\contentsline {section}{\numberline {6.3}Benchmark}{81}{section.6.3}} \citation{aerts_toucan:_2003} \citation{fu_motifviz:_2004} \citation{zhao_tred:_2005} -\@writefile{lot}{\contentsline {table}{\numberline {6.1}{\ignorespaces \textbf {Motif scanning software comparison}. The performances of matrix\_scan were assessed by comparing how many of the regions listed by matrix\_scan were also returned by other programs and if the region scores were comparable. For the percentage of overlap with the match list returned by matrix\_scan, the shorter of the two lists always serves as the reference (100\%). For the score correlations with matrix\_scan scores, the Spearman correlation was used. Table and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }}{82}{table.caption.34}} -\newlabel{lab_resources_pwmscan_benchmark_table}{{6.1}{82}{\textbf {Motif scanning software comparison}. The performances of matrix\_scan were assessed by comparing how many of the regions listed by matrix\_scan were also returned by other programs and if the region scores were comparable. For the percentage of overlap with the match list returned by matrix\_scan, the shorter of the two lists always serves as the reference (100\%). For the score correlations with matrix\_scan scores, the Spearman correlation was used. Table and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }{table.caption.34}{}} +\@writefile{lot}{\contentsline {table}{\numberline {6.1}{\ignorespaces \textbf {Motif scanning software comparison}. The performances of matrix\_scan were assessed by comparing how many of the regions returned by matrix\_scan were also returned by other programs and if the region scores were comparable. For the percentage of overlap with the match list returned by matrix\_scan, the shortest of the two lists always served as the reference (100\%). For the score correlations with matrix\_scan scores, the Spearman correlation was used. Table and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }}{82}{table.caption.34}} +\newlabel{lab_resources_pwmscan_benchmark_table}{{6.1}{82}{\textbf {Motif scanning software comparison}. The performances of matrix\_scan were assessed by comparing how many of the regions returned by matrix\_scan were also returned by other programs and if the region scores were comparable. For the percentage of overlap with the match list returned by matrix\_scan, the shortest of the two lists always served as the reference (100\%). For the score correlations with matrix\_scan scores, the Spearman correlation was used. Table and legend taken and adapted from \citep {ambrosini_pwmscan:_2018}.\relax }{table.caption.34}{}} \citation{ambrosini_pwmscan:_2018} \@writefile{toc}{\contentsline {section}{\numberline {6.4}Conclusions}{83}{section.6.4}} \@setckpt{main/ch_pwmscan}{ \setcounter{page}{84} \setcounter{equation}{0} \setcounter{enumi}{8} \setcounter{enumii}{0} \setcounter{enumiii}{0} \setcounter{enumiv}{0} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{0} \setcounter{chapter}{6} \setcounter{section}{4} \setcounter{subsection}{0} \setcounter{subsubsection}{0} \setcounter{paragraph}{0} \setcounter{subparagraph}{0} \setcounter{figure}{2} \setcounter{table}{1} \setcounter{NAT@ctr}{0} \setcounter{FBcaption@count}{0} \setcounter{ContinuedFloat}{0} \setcounter{KVtest}{0} \setcounter{subfigure}{0} \setcounter{subfigure@save}{0} \setcounter{lofdepth}{1} \setcounter{subtable}{0} \setcounter{subtable@save}{0} \setcounter{lotdepth}{1} \setcounter{lips@count}{0} \setcounter{lstnumber}{1} \setcounter{Item}{8} \setcounter{Hfootnote}{0} \setcounter{bookmark@seq@number}{0} \setcounter{AM@survey}{0} \setcounter{ttlp@side}{0} \setcounter{myparts}{0} \setcounter{parentequation}{0} \setcounter{AlgoLine}{28} \setcounter{algocfline}{1} \setcounter{algocfproc}{1} \setcounter{algocf}{1} \setcounter{float@type}{8} \setcounter{nlinenum}{0} \setcounter{lstlisting}{0} \setcounter{section@level}{0} } diff --git a/main/ch_pwmscan.tex b/main/ch_pwmscan.tex index d2760e2..f2c0c0b 100644 --- a/main/ch_pwmscan.tex +++ b/main/ch_pwmscan.tex @@ -1,159 +1,159 @@ \cleardoublepage \chapter{PWMScan} \label{pwmscan} \markboth{PWMScan}{PWMScan} \addcontentsline{chapter}{toc}{PWMScan} -The problem of pattern-matching using a matrix is considered an important problem and many algorithms and program have been developed. The challenge comes from the fact that this problem should be solved in a time and memory efficient manner in order to allow large eukaryotic genomes to be scanned with complex patterns. +The problem of pattern-matching using a matrix is considered an important problem and many algorithms and programs have been developed. The challenge comes from the fact that this problem should be solved in a time and memory efficient manner in order to allow large eukaryotic genomes to be scanned with complex patterns. -In this chapter, I describe PWMScan, a software that has been developed to predict TF binding sites given a binding specificity model. PWMScan is a web-server for rapid scanning of large genomes for high-scoring matches to a user-supplied or server-resident PWM. Compared to other web-based PWM scanning tools, PWMScan is unique in that it scans server-resident whole genomes rather than user-uploaded DNA sequences. Other key features are: i) menu-driven access to genomes of >30 model organisms; ii) menu-driven access to >300 public PWM libraries; iii) support of various PWM representations and formats; iv) cut-off values can be specified as match scores or P-values; v) output in BEDdetail format with match scores and P-values; vi) links to UCSC genome browser for visualization of results; and vii) action buttons to transfer match lists to analysis tools. +In this chapter, I describe PWMScan, a software that has been developed to predict TF binding sites given a genome and a binding specificity model. PWMScan is a web-server for rapid scanning of large genomes for high-scoring matches to a user-supplied or server-resident PWM. Compared to other web-based PWM scanning tools, PWMScan is unique in that it scans server-resident whole genomes rather than user-uploaded DNA sequences. Other key features are: i) menu-driven access to genomes of >30 model organisms, ii) menu-driven access to >300 public PWM libraries, iii) support of various PWM representations and formats, iv) cut-off values can be specified as match scores or P-values, v) output in BEDdetail format with match scores and P-values, vi) links to UCSC genome browser for visualization of results, and vii) action buttons to transfer match lists to analysis tools. This work has been conducted in close collaboration with Giovana Ambrosini, a senior scientist of the laboratory. Some parts of this chapter have been taken and adapted - with the author permission - from the original article that presents this work \citep{ambrosini_pwmscan:_2018}. For this project, I contributed to the development of the matrix\_scan program, the development of the necessary Python code for an efficient parallelization of matrix\_scan on the server backend and I set up and executed the entire benchmarking procedure. Furthermore, I produced all the figures presented in this chapter. % article Figure 1 % \begin{figure}[!htbp] % \begin{center} % \includegraphics[scale=0.4]{images/ch_lab_resources/pwmscan_fig1.png} % \captionof{figure}{Screen shot of the PWMScan graphical user interface.} % \label{lab_resources_gui} % \end{center} % \end{figure} \section{Algorithms} -From an algorithmic perspective, PWMScan is dual. It implements a regular genome scanner and also implements a novel approach that uses short read alignments to find PWM matches. A description of both algorithms follows. +From an algorithmic perspective, PWMScan is dual. It implements a regular genome scanner and also implements a novel approach that uses short read alignment to find PWM matches. \subsection{Scanner algorithm} -The program matrix\_scan implements a scanner (or window sliding) searching algorithm. It takes as input a PWM of dimensions $L\times4$, a DNA sequence $S$ of length $L' \geq L$ and a threshold score $t$. For each possible offset $i = 1, 2, ..., L'-L+1$, the sub-sequence $S_{i} = S_{i,i+1,...,i+L-1}$ is given a score $score(S_{i})$ using equation \ref{intro_eq_score_pwm}. If $score(S_{i}) \geq t$ then the sub-sequence $S_{i}$ is predicted to be a binding site. This naive approach is $\theta((L'-L+1)*L)$ in time. Even though this is not bad in terms of asymptotics analysis, many unnecessary computations are performed. Indeed, during the score computations for a sub-sequence $S_{i}$, it is possible to define a point after which the current score has become so bad that it will automatically turn to be $