% Page MenuHomec4science
%
% optimization.tex
% No OneTemporary
%
% File Metadata
%
% Created
% Tue, May 28, 13:53
%
% optimization.tex

\renewcommand{\FIGREP}{src/optimization/figures}
\section{Single-core optimization}
\label{sec:optimization}
\intersec{deneb}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Goal of this section}
\begin{itemize}
\item Better grasp how programming can influence performance
\item We first review some basic optimization principles to keep in mind
\item Deeper understanding of the working principles of the CPU
\begin{itemize}
\item How data transfers are handled
\item Concept of vectorization
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Basic optimization concepts}
\label{sec:basic_optimization}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Basic optimization techniques}
\begin{itemize}
\item Often, very simple changes to the code lead to significant performance
improvements
\item The following may seem trivial, but you would be surprised how often
they could be used in scientific codes
\item The main problem is that we often make a one-to-one mapping between
the equations and the algorithm
\end{itemize}
\vfill
\textbf{Do less work}\\
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
for (int i = 0; i < N; ++i) {
a[i] = (alpha + sin(x)) * b[i];
}
\end{cxxcode}
\end{minipage}
\hfill
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
double tmp = alpha + sin(x);
for (int i = 0; i < N; ++i) {
a[i] = tmp * b[i];
}
\end{cxxcode}
\end{minipage}
\begin{itemize}
\item Constant term is re-computed at every iteration of the loop
\item Can be taken out of the loop and computed once
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item In very simple cases like here, the compiler is smart enough to do it
for you
\item The main point is that the compiler will do most of the optimization
job. Our goal is to write code that expresses our intention in a clear way
so that the compiler can optimize it.
\end{itemize}
}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Basic optimization techniques}
\textbf{Avoid branches}\\
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
for (i = 0; i < N; ++i) {
for (j = 0; j < N; ++j) {
if (j >= i) {
sign = 1.0;
} else {
sign = -1.0;
}
b[j] = sign * a[i][j];
}
}
\end{cxxcode}
\end{minipage}
\hfill
\begin{minipage}[t]{0.475\linewidth}
\begin{cxxcode}{}
for (i = 0; i < N; ++i) {
for (j = i; j < N; ++j) {
b[j] = a[i][j];
}
for (j = 0; j < i; ++j) {
b[j] = -a[i][j];
}
}
\end{cxxcode}
\end{minipage}
\begin{itemize}
\item Avoid conditional branches in loops
\item They can often be written differently or taken out of the loop
\end{itemize}
\end{frame}
\subsection{Memory hierarchy}
\label{sec:memory_hierarchy}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Tale of a smart librarian}
\begin{itemize}
\item To better understand the concepts behind caching, let's take the
example of a librarian
\item The first customer enters and asks for a book. The librarian goes into
the huge storeroom and returns with the book when he finds it
\item After some time, the client returns the book and the librarian puts it
back into the storeroom
\item A second customer enters and asks for the same book...
\item This workflow can take a lot of time depending on how many customers
want to read the same book
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Tale of a smart librarian}
\begin{itemize}
\item Our librarian is a bit lazy, but clever. Since a lot of customers ask
for the same book, he decides to put a small shelf behind his desk to
temporarily store the books he retrieves.
\item This way he can quickly grab the book instead of going to the
storeroom.
\item When a customer asks for a book, he will first look on his shelf. If
he finds the book, it's a \textit{cache hit} and he returns it to the customer.
If not, it's a \textit{cache miss} and he must go back to the storeroom.
\item This is a very clever system, especially if there is \textit{temporal
locality}, i.e.\ if the customers often ask for the same books.
\item Can he do better?
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Tale of a smart librarian}
\begin{itemize}
\item Oftentimes, our librarian sees that people taking one book will come back
and ask for the sequels of the book
\item He decides to change his workflow a bit. Now, when he goes into the
storeroom to retrieve a book, he comes back with a few of them, all from
the same shelf
\item This way, when the customer brings back a book and asks for the
sequel, it is already present on the librarian's shelf
\item This workflow works well when there is \textit{spatial locality}, i.e.\
when you ask for a book, there is a significant chance that you will
read the sequel
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Single-core optimization}
\framesubtitle{Data loading}
\begin{itemize}
\item Now, what is the link between our librarian and the CPU? They work in a similar fashion!
\item When a load instruction is issued the L1 cache logic checks if data is
already present. If yes, this is a \textit{cache hit} and data can be
retrieved very quickly. If no, this is a \textit{cache miss} and the
next memory levels are checked.
\item If the data is nowhere to be found, then it is loaded from the main
memory
\item As for our librarian, not only is the required data loaded on each
cache miss, but a whole \textit{cache line}
\end{itemize}
\end{frame}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Example: vector multiplication with a scalar}
\begin{itemize}
\item Simple vector/scalar multiplication
\item Focus on data loading (\code{b[i]})
\item Assume only one level of cache with a cache line of two doubles (16 bytes)
\end{itemize}
\begin{cxxcode}{}
for (int i = 0; i < N; ++i) {
a[i] = alpha * b[i];
}
\end{cxxcode}
\onslide<1>\addimage[width=7cm]{\FIGREP/data_loading_1}{5cm}{0.5cm}
\onslide<2>\addimage[width=7cm]{\FIGREP/data_loading_2}{5cm}{0.5cm}
\onslide<3>\addimage[width=7cm]{\FIGREP/data_loading_3}{5cm}{0.5cm}
\onslide<4>\addimage[width=7cm]{\FIGREP/data_loading_4}{5cm}{0.5cm}
\onslide<5>\addimage[width=7cm]{\FIGREP/data_loading_5}{5cm}{0.5cm}
\onslide<6>\addimage[width=7cm]{\FIGREP/data_loading_6}{5cm}{0.5cm}
\end{frame}
\begin{frame}[t]
\frametitle{Single-core optimization}
\framesubtitle{Memory layout and data access}
\begin{itemize}
\item How do we store ND arrays into memory?
\item Memory is a linear storage. Arrays are stored contiguously, one
element after the other.
\item We have to choose a convention. Row major (C/C++) or column major (Fortran).
\item Row major means that elements are stored contiguously according to the
last index of the array. In column-major order, they are stored according to
the first index.
\end{itemize}
\onslide<1>\addimage[width=6cm]{\FIGREP/row_major_1}{5cm}{0.5cm}
\onslide<2>\addimage[width=6cm]{\FIGREP/row_major_2}{5cm}{0.5cm}
\end{frame}
\begin{frame}[t,fragile]
\frametitle{Single-core optimization}
\framesubtitle{Example: matrix/vector multiplication}
\begin{itemize}
\item Focus on data loading (\code{a[i][j]})
\item Assume only one level of cache with a cache line of two doubles (16 bytes)
\end{itemize}
\begin{cxxcode}{}
for (int j = 0; j < N; ++j) {
for (int i = 0; i < N; ++i) {
c[i] += a[i][j] * b[j];
}
}
\end{cxxcode}
\onslide<1>\addimage[width=7cm]{\FIGREP/matrix_vector_1}{5cm}{0.5cm}
\onslide<2>\addimage[width=7cm]{\FIGREP/matrix_vector_2}{5cm}{0.5cm}
\onslide<3>\addimage[width=7cm]{\FIGREP/matrix_vector_3}{5cm}{0.5cm}
\onslide<4>\addimage[width=7cm]{\FIGREP/matrix_vector_4}{5cm}{0.5cm}
\onslide<5>\addimage[width=7cm]{\FIGREP/matrix_vector_5}{5cm}{0.5cm}
\onslide<6>\addimage[width=7cm]{\FIGREP/matrix_vector_6}{5cm}{0.5cm}
\onslide<7>\addimage[width=7cm]{\FIGREP/matrix_vector_7}{5cm}{0.5cm}
\vspace{5cm}
\begin{itemize}
\item<4> Non contiguous data accesses are detrimental for performance!
\end{itemize}
\end{frame}
\begin{frame}[t]
\frametitle{Single-core optimization}
\framesubtitle{Early conclusions}
\begin{itemize}
\item Caches are small, but very fast memories
\item Their purpose is to alleviate long latency and limited bandwidth of
the RAM
\item Data is fetched by group, called cache line, and stored into the
different levels of cache
\item In order to fully exploit caches, data in caches must be re-used as
much as possible
\end{itemize}
\vfill
\begin{itemize}
\item Avoid random memory accesses that cause many cache misses and prefer contiguous accesses
\item Be careful of the data types you use and how they are mapped onto
memory
\end{itemize}
\end{frame}
\subsection{Single Instruction Multiple Data}
\label{sec:simd}
\begin{frame}[t]
\frametitle{Single-core optimization}
\framesubtitle{Single Instruction Multiple Data}
\begin{itemize}
\item Modern CPUs can apply the same operation to multiple data
\item Special registers \cmd{xmm}, \cmd{ymm} and \cmd{zmm} holding 2, 4 or 8
doubles
\end{itemize}
\onslide<1>\addimage[width=7cm]{\FIGREP/vectorization_1}{5cm}{1.2cm}
\onslide<2>\addimage[width=7cm]{\FIGREP/vectorization_2}{5cm}{1.2cm}
\end{frame}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End:

% Event Timeline