% Page MenuHomec4science
%
% optimization.tex
% No OneTemporary
%
% File Metadata
%
% Created
% Tue, May 28, 13:53
%
% optimization.tex

\renewcommand{\FIGREP}{src/optimization/figures}
\section{Single-core optimization}
\label{sec:optimization}
\intersec{deneb}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Goal of this section}
\begin{itemize}
\item Better grasp how programming can influence performance
\item We first review some basic optimization principles to keep in mind
\item Deeper understanding of the working principles of the CPU
\begin{itemize}
\item How data transfers are handled
\item Concept of vectorization
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Basic optimization concepts}
\label{sec:basic_optimization}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Basic optimization techniques}
\begin{itemize}
\item Often, very simple changes to the code lead to significant performance
improvements
\item The following may seem trivial, but you would be surprised how often
they could be used in scientific codes
\item The main problem is that we often make a one-to-one mapping between
the equations and the algorithm
\end{itemize}
\vfill
\textbf{Do less work}\\
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
for (int i = 0; i < N; ++i) {
a[i] = (alpha + sin(x)) * b[i];
}
\end{cxxcode}
\end{minipage}
\hfill
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
double tmp = alpha + sin(x);
for (int i = 0; i < N; ++i) {
a[i] = tmp * b[i];
}
\end{cxxcode}
\end{minipage}
\begin{itemize}
\item Constant term is re-computed at every iteration of the loop
\item Can be taken out of the loop and computed once
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item In very simple cases like here, the compiler is smart enough to do it
for you
\item The main point is that the compiler will do most of the optimization
job. Our goal is to write code that expresses our intention in a clear way
so that the compiler can optimize it.
\end{itemize}
}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Basic optimization techniques}
\textbf{Avoid branches}\\
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
for (i = 0; i < N; ++i) {
for (j = 0; j < N; ++j) {
if (j >= i) {
sign = 1.0;
} else {
sign = -1.0;
}
b[j] = sign * a[i][j];
}
}
\end{cxxcode}
\end{minipage}
\hfill
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
for (i = 0; i < N; ++i) {
for (j = i; j < N; ++j) {
b[j] = a[i][j];
}
for (j = 0; j < i; ++j) {
b[j] = -a[i][j];
}
}
\end{cxxcode}
\end{minipage}
\begin{itemize}
\item Avoid conditional branches in loops
\item They can often be written differently or taken out of the loop
\end{itemize}
\end{frame}
\subsection{Memory hierarchy}
\label{sec:memory_hierarchy}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Tale of a smart librarian}
\begin{itemize}
\item To better understand the concepts behind caching, let's take the
example of a librarian
\item The first customer enters and asks for a book. The librarian goes into
the huge storeroom and returns with the book when he finds it
\item After some time, the client returns the book and the librarian puts it
back into the storeroom
\item A second customer enters and asks for the same book...
\item This workflow can take a lot of time depending on how many customers
want to read the same book
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Tale of a smart librarian}
\begin{itemize}
\item Our librarian is a bit lazy, but clever. Since a lot of customers ask
for the same book, he decides to put a small shelf behind his desk to
temporarily store the books he retrieves.
\item This way he can quickly grab the book instead of going to the
storeroom.
\item When a customer asks for a book, he will first look on his shelf. If
he finds the book, it's a \textit{cache hit} and he returns it to the customer.
If not, it's a \textit{cache miss} and he must go back to the storeroom.
\item This is a very clever system, especially if there is \textit{temporal
locality}, i.e.\ if the customers often ask for the same books.
\item Can he do better?
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Tale of a smart librarian}
\begin{itemize}
\item Oftentimes, our librarian sees that people taking one book will come back
and ask for the sequels of the book
\item He decides to change his workflow a bit. Now, when he goes into the
storeroom to retrieve a book, he comes back with a few of them, all from
the same shelf
\item This way, when the customer brings back a book and asks for the
sequel, it is already present on the librarian's shelf
\item This workflow works well when there is \textit{spatial locality}, i.e.\
when you ask for a book, there is a significant chance that you will
read the sequel
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Data loading}
\begin{itemize}
\item Now, what is the link between our librarian and the CPU? They work in a similar fashion!
\item When a load instruction is issued the L1 cache logic checks if data is
already present. If yes, this is a \textit{cache hit} and data can be
retrieved very quickly. If no, this is a \textit{cache miss} and the
next memory levels are checked.
\item If the data is nowhere to be found, then it is loaded from the main
memory
\item As for our librarian, not only is the required data loaded on each
cache miss, but a whole \textit{cache line}
\end{itemize}
\end{frame}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Example: vector multiplication with a scalar}
\begin{itemize}
\item Simple vector/scalar multiplication
\item Focus on data loading (\code{b[i]})
\item Assume only one level of cache with a cache line of two doubles (16 bytes)
\end{itemize}
\begin{cxxcode}{}
for (int i = 0; i < N; ++i) {
a[i] = alpha * b[i];
}
\end{cxxcode}
\onslide<1>\addimage[width=7cm]{\FIGREP/data_loading_1}{5cm}{0.5cm}
\onslide<2>\addimage[width=7cm]{\FIGREP/data_loading_2}{5cm}{0.5cm}
\onslide<3>\addimage[width=7cm]{\FIGREP/data_loading_3}{5cm}{0.5cm}
\onslide<4>\addimage[width=7cm]{\FIGREP/data_loading_4}{5cm}{0.5cm}
\onslide<5>\addimage[width=7cm]{\FIGREP/data_loading_5}{5cm}{0.5cm}
\onslide<6>\addimage[width=7cm]{\FIGREP/data_loading_6}{5cm}{0.5cm}
\end{frame}
\begin{frame}[t]
\frametitle{Single-core optimization}
\framesubtitle{Memory layout and data access}
\begin{itemize}
\item How do we store ND arrays into memory?
\item Memory is a linear storage. Arrays are stored contiguously, one
element after the other.
\item We have to choose a convention. Row major (C/C++) or column major (Fortran).
\item Row major means that elements are stored contiguously according to the
last index of the array. In column-major order, they are stored according to
the first index.
\end{itemize}
\onslide<1>\addimage[width=6cm]{\FIGREP/row_major_1}{5cm}{0.5cm}
\onslide<2>\addimage[width=6cm]{\FIGREP/row_major_2}{5cm}{0.5cm}
\end{frame}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Example: matrix/vector multiplication}
\begin{itemize}
\item Focus on data loading (\code{a[i][j]})
\item Assume only one level of cache with a cache line of two doubles (16 bytes)
\end{itemize}
\begin{cxxcode}{}
for (int j = 0; j < N; ++j) {
for (int i = 0; i < N; ++i) {
c[i] += a[i][j] * b[j];
}
}
\end{cxxcode}
\onslide<1>\addimage[width=7cm]{\FIGREP/matrix_vector_1}{5cm}{0.5cm}
\onslide<2>\addimage[width=7cm]{\FIGREP/matrix_vector_2}{5cm}{0.5cm}
\onslide<3>\addimage[width=7cm]{\FIGREP/matrix_vector_3}{5cm}{0.5cm}
\onslide<4>\addimage[width=7cm]{\FIGREP/matrix_vector_4}{5cm}{0.5cm}
\onslide<5>\addimage[width=7cm]{\FIGREP/matrix_vector_5}{5cm}{0.5cm}
\onslide<6>\addimage[width=7cm]{\FIGREP/matrix_vector_6}{5cm}{0.5cm}
\onslide<7>\addimage[width=7cm]{\FIGREP/matrix_vector_7}{5cm}{0.5cm}
\vspace{5cm}
\begin{itemize}
\item<4> Non contiguous data accesses are detrimental for performance!
\end{itemize}
\end{frame}
\begin{frame}[t]
\frametitle{Single-core optimization}
\framesubtitle{Early conclusions}
\begin{itemize}
\item Caches are small, but very fast memories
\item Their purpose is to alleviate long latency and limited bandwidth of
the RAM
\item Data is fetched by group, called cache line, and stored into the
different levels of cache
\item In order to fully exploit caches, data in caches must be re-used as
much as possible
\end{itemize}
\vfill
\begin{itemize}
\item Avoid random memory accesses that cause many cache misses and prefer contiguous accesses
\item Be careful of the data types you use and how they are mapped onto
memory
\end{itemize}
\end{frame}
\subsection{Single Instruction Multiple Data}
\label{sec:simd}
\begin{frame}[t]
\frametitle{Single-core optimization}
\framesubtitle{Single Instruction Multiple Data}
\begin{itemize}
\item Modern CPUs can apply the same operation to multiple data
\item Special registers \cmd{xmm}, \cmd{ymm} and \cmd{zmm} holding 2, 4 or 8
doubles
\end{itemize}
\onslide<1>\addimage[width=7cm]{\FIGREP/vectorization_1}{5cm}{1.2cm}
\onslide<2>\addimage[width=7cm]{\FIGREP/vectorization_2}{5cm}{1.2cm}
\end{frame}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End:

% Event Timeline