\renewcommand{\FIGREP}{src/performance_measurement/figures}
\section{Performance measurement}
\label{sec:performance_measurement}
\begin{frame}
\frametitle{Goal of this section}
\framesubtitle{}
\begin{itemize}
\item Key concepts to understand and quantify performance
\item Roofline model
\item Using a profiler
\end{itemize}
\end{frame}
\subsection{Performance metrics}
\label{sec:metrics}
\begin{frame}
\frametitle{Performance metrics}
\framesubtitle{}
\begin{itemize}
\item How can we quantify performance?
\item We need to define a means to measure it
\item Plethora of metrics, but for scientific codes a few are
particularly interesting
\end{itemize}
\vfill
\pause
\begin{itemize}
\item The first that comes to mind is \textit{time}
\item It gives a good indication about how long you have to wait
\end{itemize}
\vfill
\pause
\begin{itemize}
\item Scientific codes do computations on floating point numbers
\item A second metric is the number of \textit{floating-point operations per second}
(\unit{\flops})
\end{itemize}
\vfill
\pause
\begin{itemize}
\item Finally, the \textit{memory bandwidth} indicates how much data your code
transfers per unit of time
\end{itemize}
\end{frame}
\note{
\begin{itemize}
\item My code is super fast, it runs in $\qty{2.5}{\ns}$!
\item It seems fast, but is it? How fast can your hardware go?
\item To really understand how well your code exploits the hardware, we use
the \unit{\flops} and memory BW
\item Your hardware has theoretical maximum values for those
\item You can compare the values from your code to the max to see how well
you use the hardware
\end{itemize}
}
\begin{frame}
\frametitle{\unit{\flops} and memory bandwidth}
\framesubtitle{}
\begin{itemize}
\item How to measure \unit{\flops}?
\begin{itemize}
\item By hand, dividing the number of operations by the running time
\item Using tools such as PAPI, Tau, likwid, Intel Amplxe, etc.
\end{itemize}
\end{itemize}
\vfill
\begin{itemize}
\item Memory bandwidth measures the amount of data transferred by unit of
time
[\unit{\byte\per\second}, \unit{\kibi\byte\per\second}, \unit{\mebi\byte\per\second}, \unit{\gibi\byte\per\second}, ...]
\item How to measure it?
\begin{itemize}
\item By hand, dividing the amount of data transferred by the running
time
\item Using tools such as STREAM, PAPI, Tau, Intel Amplxe, etc.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[t,fragile]
\frametitle{Performance measurement}
\framesubtitle{A simple DAXPY example}
\begin{itemize}
\item Assume Intel Xeon Gold 6132 (Gacrux), no cache
\end{itemize}
\vfill
\cxxfile[%
title={daxpy.cpp},
minted options app={
% highlightlines={2, 7},
firstline=19,
lastline=21,
% firstnumber=1,
}]{examples/optimization/daxpy.cpp}
\begin{itemize}
\item My code runs in $\qty{147.25}{\ms}$. It is amazingly fast!
\end{itemize}
\pause
\vfill
\begin{itemize}
\item Each iteration has 2 FLOP (1 add and 1 mul) and there are \cmd{N = 1e8}
iterations
\item Our hardware can achieve a theoretical peak performance of $\qty{1.16}{\tera\flops}$
\item Our code achieves $\qty{2d8}{\flop} / \qty{147.25d-3}{\second} \approx \qty{0.001}{\tera\flops}$...
\end{itemize}
\end{frame}
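\begin{frame}[fragile]
\frametitle{Performance measurement}
\framesubtitle{Timing by hand: a sketch}
\begin{itemize}
\item A minimal, self-contained sketch of the ``by hand'' measurement
(initial values and names are made up; the per-iteration FLOP and byte
counts are those of the DAXPY loop above)
\end{itemize}
\begin{cxxcode}{}
#include <chrono>
#include <cstdio>
#include <vector>

int main() {
  const int N = 1e8;
  const double alpha = 2.0;
  std::vector<double> a(N, 1.0), b(N, 2.0), c(N);

  auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < N; ++i)
    c[i] = a[i] + alpha * b[i];
  auto stop = std::chrono::high_resolution_clock::now();
  double t = std::chrono::duration<double>(stop - start).count();

  // 2 FLOP and 3 x 8 B of memory traffic per iteration
  std::printf("%.2f GFLOP/s, %.2f GB/s\n",
              2.0 * N / t / 1e9, 24.0 * N / t / 1e9);
}
\end{cxxcode}
\end{frame}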
\subsection{Scalings}
\label{sec:scalings}
\begin{frame}
\frametitle{Scalings}
\framesubtitle{}
\begin{itemize}
\item Scalings are a way to assess how well a program performs when adding
computational resources
\item Two types of scalings:
\begin{description}
\item[Strong scaling] Add resources, keep amount of work constant
\item[Weak scaling] Add both resources and amount of work
\end{description}
\end{itemize}
\vfill
\begin{itemize}
\item Compare timings with $n$ processes, $T_{n}$, against the one-process
case, $T_{1}$
\end{itemize}
\vfill
\begin{minipage}{0.47\linewidth}
\begin{center}
\textbf{Speedup}
\end{center}
\begin{equation*}
S(n) = \frac{T_{1}}{T_{n}}
\end{equation*}
\end{minipage}
\hfill
\begin{minipage}{0.47\linewidth}
\begin{center}
\textbf{Efficiency}
\end{center}
\begin{equation*}
E(n) = \frac{S(n)}{n}
\end{equation*}
\end{minipage}
\vfill
\begin{itemize}
\item We want $S(n)$ as close to $n$ and $E(n)$ as close to 1 as possible
\end{itemize}
\end{frame}
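\begin{frame}
\frametitle{Scalings}
\framesubtitle{A worked example}
\begin{itemize}
\item Hypothetical timings, just to illustrate the definitions: $T_{1} = \qty{100}{\second}$ and $T_{8} = \qty{20}{\second}$
\end{itemize}
\begin{equation*}
S(8) = \frac{T_{1}}{T_{8}} = \frac{100}{20} = 5, \qquad
E(8) = \frac{S(8)}{8} \approx 0.63
\end{equation*}
\begin{itemize}
\item A speedup of 5 on 8 processes: the resources are used at only $63\%$ of
the ideal
\end{itemize}
\end{frame}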
\subsection{Amdahl's law}
\label{sec:amdahl}
\begin{frame}[t]
\frametitle{Amdahl's law}
\framesubtitle{}
\begin{itemize}
\item Amdahl's law gives an upper bound on the achievable speedup (for a
fixed problem size)
\pause
\item Assume a fraction $p$ of your code is (perfectly) parallel and that the timing
with 1 process is $T_{1}$
\item Timing with $n$ processes is
\begin{equation*}
T_{n} = (1-p) T_{1} + \frac{p}{n}T_{1} = \left[ (1-p) + \frac{p}{n}\right] T_{1}
\end{equation*}
\pause
\item Speedup becomes
\begin{equation*}
S(n) = \frac{T_{1}}{T_{n}} = \frac{1}{(1-p) + \frac{p}{n}}
\end{equation*}
\end{itemize}
\vspace{2cm}
\begin{itemize}
\pause
\item In the limit of infinite resources
\begin{equation*}
\lim_{n\rightarrow\infty}S(n) = \frac{1}{1-p}
\end{equation*}
\end{itemize}
\onslide<2->\addimage{\FIGREP/amdahl_illustration}{4.5cm}{6.75cm}{2.0cm}
\end{frame}
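\begin{frame}
\frametitle{Amdahl's law}
\framesubtitle{A worked example}
\begin{itemize}
\item An illustrative calculation: assume $p = 0.99$, i.e. only $1\%$ of the
code is serial
\end{itemize}
\begin{equation*}
S(100) = \frac{1}{(1-0.99) + \frac{0.99}{100}} \approx 50, \qquad
\lim_{n\rightarrow\infty}S(n) = \frac{1}{1-0.99} = 100
\end{equation*}
\begin{itemize}
\item Even with $99\%$ parallel code, 100 processes run at only $50\%$
efficiency, and no amount of resources gives more than $100\times$
\end{itemize}
\end{frame}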
\begin{frame}[b]
\frametitle{Amdahl's law}
\framesubtitle{}
\begin{itemize}
\item Limited by the serial part (very sensitive)!
\item Does this mean we cannot exploit large HPC machines?
\end{itemize}
\addimage{\FIGREP/amdahl}{8.cm}{4cm}{1.7cm}
\end{frame}
\subsection{Gustafson's law}
\label{sec:gustafson}
\begin{frame}[t]
\frametitle{Gustafson's law}
\framesubtitle{}
\begin{itemize}
\item A variation of Amdahl's law where the problem size is not fixed
\pause
\item In general, the serial part does not scale much with problem size,
e.g. reading data from disk
\item Increase the problem size by a factor of $n$
\pause
\item Timings with 1 process and $n$ processes are
\begin{equation*}
T_{1} = (1-p) + np, \qquad T_{n} = (1-p) + \frac{np}{n}
\end{equation*}
\pause
\item Speedup becomes
\begin{equation*}
S(n) = \frac{T_{1}}{T_{n}} = (1-p) + np
\end{equation*}
\item In the limit of infinite resources
\begin{equation*}
\lim_{n\rightarrow\infty}S(n) = \infty
\end{equation*}
\end{itemize}
\end{frame}
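\begin{frame}
\frametitle{Gustafson's law}
\framesubtitle{A worked example}
\begin{itemize}
\item Same illustrative fraction as before, $p = 0.99$, but now the amount of
work grows with the resources
\end{itemize}
\begin{equation*}
S(100) = (1-0.99) + 100 \cdot 0.99 = 99.01
\end{equation*}
\begin{itemize}
\item When the problem is scaled, the serial $1\%$ barely matters: this is
why weak scaling lets us exploit large HPC machines
\end{itemize}
\end{frame}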
\subsection{Pareto principle}
\label{sec:pareto}
\begin{frame}
\frametitle{Pareto principle}
\framesubtitle{The 80/20 rule}
\begin{itemize}
\item General principle stating that 80\% of the effects come from 20\%
of the causes
\item Applies in many domains and especially in optimization
\item 80\% of the time is spent in 20\% of your code
\item Concentrate on those 20\% and don't arbitrarily optimize
\end{itemize}
\end{frame}
\subsection{Roofline model}
\label{sec:roofline}
\begin{frame}[t]
\frametitle{Roofline model}
\framesubtitle{}
\begin{itemize}
\item How well am I exploiting the hardware resources?
\item The roofline model is a performance model that provides an estimate
of the answer to this question
\end{itemize}
\vspace{1cm}
\pause
\begin{itemize}
\item Key concept: the \textit{arithmetic intensity}, $AI$, of an algorithm is its number of \unit{\flop} per byte of data loaded
\item It measures data reuse
\end{itemize}
\addimage{\FIGREP/ai}{8.cm}{4cm}{0.5cm}
\end{frame}
\begin{frame}[t]
\frametitle{Roofline model}
\framesubtitle{}
\begin{itemize}
\item The roofline model is plotted on a \textbf{log-log scale}
\begin{itemize}
\item x-axis is the AI
\item y-axis is \unit{\flops}
\end{itemize}
\pause
\item The hardware limits are defined by
\begin{equation*}
P = \min(P_{\text{max}}, I\cdot b_{s})
\end{equation*}
\begin{itemize}
\item $P_{\text{max}}$ is the CPU peak \unit{\flops}
\item $I$ is the arithmetic intensity
\item $b_{s}$ is the memory BW
\end{itemize}
\end{itemize}
\onslide<1>\addimage{\FIGREP/roofline_1}{5cm}{5.5cm}{0.5cm}
\onslide<2>\addimage{\FIGREP/roofline_2}{5cm}{5.5cm}{0.5cm}
\onslide<3>\addimage{\FIGREP/roofline_3}{5cm}{5.5cm}{0.5cm}
\end{frame}
\begin{frame}[t]
\frametitle{Roofline model}
\framesubtitle{}
\begin{itemize}
\item Refinements can be made to the roofline model:
\begin{itemize}
\item Adding a memory hierarchy with caches
\item Adding different levels of DLP
\end{itemize}
\item They give you hints on what to optimize for
\end{itemize}
\addimage{\FIGREP/roofline_extended}{7cm}{4.5cm}{0.5cm}
\end{frame}
\begin{frame}[t]
\frametitle{Roofline model}
\framesubtitle{How to find the peak performance}
\begin{itemize}
\item Theoretical peak performance
\begin{align*}
P_{\text{max}} = & \textcolor{white}{\times} \text{Number of FP ports (Superscalar)} \\
& \times \text{flops} / \text{cycle (e.g. 2 for FMA)} \\
& \times \text{vector size (DLP)} \\
& \times \text{frequency (in GHz)} \\
& \times \text{number of cores (TLP)}
\end{align*}
\item Example: \href{https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)}{Intel Xeon Gold 6132}
\begin{align*}
P_{\text{max}} = & \textcolor{white}{\times} 2 \text{ (Superscalar)} \\
& \times 2 \text{ (e.g. 2 for FMA)} \\
& \times 8 \text{ (DLP)} \\
& \times 2.6 \text{ (frequency)} \\
& \times 14 \text{ (TLP)} \\
= & \qty{1.16}{\tera\flops}
\end{align*}
\pause
\item Or use software that estimates it
\end{itemize}
\end{frame}
\begin{frame}[t]
\frametitle{Roofline model}
\framesubtitle{How to find the memory bandwidth}
\begin{itemize}
\item Theoretical memory bandwidth can be found from the RAM spec.
\begin{align*}
\text{BW}_{\text{max}} = & \textcolor{white}{\times} \text{RAM frequency} \\
& \times \text{Number of transfers per clock cycle} \\
& \times \text{Bus width} \\
& \times \text{Number of interfaces}
\end{align*}
\item In general, we assume that the RAM matches the CPU bandwidth (found in the CPU spec. list)
\item Example: \href{https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)}{Intel Xeon Gold 6132}
\begin{itemize}
\item $\qty{21.33}{\gibi\byte\per\second}$ per channel
\item Maximum of $\qty{128}{\gibi\byte\per\second}$ with 6 channels
\end{itemize}
\pause
\item Or use software that estimates it
\end{itemize}
\begin{itemize}
\item A corollary of ``theoretical'' is that it is not achievable in practice!
\end{itemize}
\end{frame}
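\begin{frame}
\frametitle{Roofline model}
\framesubtitle{Memory bandwidth: a worked example}
\begin{itemize}
\item Plugging in numbers for the Intel Xeon Gold 6132 (assuming DDR4-2666
with 6 channels per socket)
\end{itemize}
\begin{align*}
\text{BW}_{\text{max}} = & \textcolor{white}{\times} \qty{1.333}{\giga\hertz} \text{ (RAM frequency)} \\
& \times 2 \text{ (transfers per clock cycle)} \\
& \times \qty{8}{\byte} \text{ (bus width)} \\
& \times 6 \text{ (interfaces)} \\
= & \qty{128}{\gibi\byte\per\second}
\end{align*}
\begin{itemize}
\item Consistent with the per-channel figure: $6 \times \qty{21.33}{\gibi\byte\per\second} = \qty{128}{\gibi\byte\per\second}$
\end{itemize}
\end{frame}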
\begin{frame}[t,fragile]
\frametitle{Roofline model}
\framesubtitle{How to find arithmetic intensity}
\begin{itemize}
\item For very simple algorithms, you can compute the AI
\item Let's go back to the DAXPY example
\begin{cxxcode}{}
const int N = 1e8;
for (int i = 0; i < N; ++i) {
c[i] = a[i] + alpha * b[i];
}
\end{cxxcode}
\item There are 2 floating-point operations (1 add and 1 mul)
\item There are three 8-byte memory operations (2 loads and 1 store), i.e. $\qty{24}{\byte}$
\item The AI is then $2/24 = 1/12$ \unit{\flop\per\byte}
\pause
\item For more complex algorithms, use a tool, e.g. Intel Advisor
\end{itemize}
\end{frame}
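\begin{frame}
\frametitle{Roofline model}
\framesubtitle{DAXPY on the roofline}
\begin{itemize}
\item Putting the pieces together with the Gacrux numbers from the previous
slides (a back-of-the-envelope estimate, still ignoring caches)
\end{itemize}
\begin{equation*}
P = \min(P_{\text{max}}, I \cdot b_{s})
= \min\left(\qty{1.16}{\tera\flops}, \frac{1}{12} \cdot \qty{128}{\gibi\byte\per\second}\right)
\approx \qty{11}{\giga\flops}
\end{equation*}
\begin{itemize}
\item DAXPY is strongly memory bound: even a perfect implementation stays
far below $P_{\text{max}}$
\item The $\approx \qty{1.4}{\giga\flops}$ measured earlier should be compared
to this ceiling, not to the $\qty{1.16}{\tera\flops}$ peak
\end{itemize}
\end{frame}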
\subsection{Profiling}
\label{sec:profiling}
\begin{frame}
\frametitle{Profiling}
\framesubtitle{A precious ally for optimization}
\begin{itemize}
\item Where is my application spending most of its time?
\begin{itemize}
\item (bad) measure time ``by hand'' using timers and prints
\item (good) use a tool made for this, e.g. Intel Amplifier, Score-P,
gprof
\end{itemize}
\end{itemize}
\vfill
\begin{itemize}
\item In addition to timings, profilers give you a lot more information on
\begin{itemize}
\item Memory usage
\item Hardware counters
\item CPU activity
\item MPI communications
\item etc.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Profiling}
\framesubtitle{Interactive demonstration}
\begin{itemize}
\item For the purpose of this exercise, we will use MiniFE
\begin{itemize}
\item 3D implicit finite-elements on an unstructured mesh
\item C++ mini application
\item \url{https://github.com/Mantevo/miniFE}
\end{itemize}
\item We will use Intel VTune, part of the \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html\#base-kit}{OneAPI Base toolkit (free)}
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Profiling}
\framesubtitle{Compile MiniFE}
\begin{itemize}
\item Download miniFE
\begin{bashcode}
$> git clone https://github.com/Mantevo/miniFE.git
$> cd miniFE
\end{bashcode}
\item Compile the basic version found in \cmd{ref/src}
\begin{itemize}
\item You will need to load a compiler and an MPI library
\begin{bashcode}
$> module load intel intel-mpi
\end{bashcode}
\item Change the \cmd{Makefile} to set \cmd{CXX=mpiicpc} and \cmd{CC=mpiicc} and compile
\begin{bashcode}
$> make
\end{bashcode}
\item Make sure to compile your code with \cmd{-g -O3}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Profiling}
\framesubtitle{Profile MiniFE}
\begin{itemize}
\item Profile the code using
\begin{bashcode}
$> srun -n 1 amplxe-cl -collect hotspots -r prof_results -- ./miniFE.x -nx 128 -ny 128 -nz 128
\end{bashcode}
\item This will profile for the ``hotspots'' and store the timings in \cmd{prof\_results}
\item You can have more info on the types of analysis with
\begin{bashcode}
$> amplxe-cl -h collect
\end{bashcode}
\item Open Intel VTune and select your timings
\begin{bashcode}
$> amplxe-gui prof_results/prof_results.amplxe
\end{bashcode} %$
\item Play around and find the 5 most time-consuming functions
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Profiling}
\framesubtitle{What do we learn?}
\begin{itemize}
\item 50.0\% of the time spent in matrix/vector multiplications
\item 12.5\% of time spent imposing boundary conditions
\item etc.
\item Does the problem size influence the timings?
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Profiling}
\framesubtitle{Smaller problem}
\begin{itemize}
\item This time, we profile a problem of size $(16, 16, 16)$
\item 13.6\% of the time is spent opening libraries
\item 13.6\% of the time is spent initializing MPI
\item etc.
\item Depending on the problem size, different parts of the code will dominate
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Profiling}
\framesubtitle{Some tips and tricks}
\begin{itemize}
\item Profile a code without bugs!
\item Choose the right problem size (representative of your simulations)
\item Focus on the functions taking the most time first
\item If the profile is not explicit enough, try refactoring into smaller functions
\begin{itemize}
\item Some profilers, e.g. Score-P, let you define custom regions (see the sketch on the next slide)
\end{itemize}
\end{itemize}
\end{frame}
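\begin{frame}[fragile]
\frametitle{Profiling}
\framesubtitle{Custom regions with Score-P: a sketch}
\begin{itemize}
\item A minimal sketch of a user-defined region (the function and region
names are made up; check the Score-P manual for the macros of your version)
\end{itemize}
\begin{cxxcode}{}
#include <scorep/SCOREP_User.h>

void assemble() {
  // Declare a handle, then mark the region of interest
  SCOREP_USER_REGION_DEFINE(assembly_handle)
  SCOREP_USER_REGION_BEGIN(assembly_handle, "assembly",
                           SCOREP_USER_REGION_TYPE_COMMON)
  // ... code to be reported as a separate entry in the profile ...
  SCOREP_USER_REGION_END(assembly_handle)
}
\end{cxxcode}
\begin{itemize}
\item Build with the Score-P wrapper so the macros are active, e.g.
\end{itemize}
\begin{bashcode}
$> scorep --user mpiicpc -g -O3 -c assemble.cpp
\end{bashcode}
\end{frame}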
\begin{frame}[fragile,exercise,t]
\frametitle{Profiling}
\framesubtitle{Optimization}
\begin{itemize}
\item We now have a pretty good idea of which part of the code to optimize
\item Different options are possible (in order of complexity)
\begin{enumerate}
\item Compiler and linker flags
\item Optimized external libraries
\item Handmade optimization (loop reordering, better data access,
etc.)
\item Algorithmic changes
\end{enumerate}
\pause
\item Example: matrix/matrix multiplication. The graph shows the complexity ($\mathcal{O}(n^{\omega})$) of
different algorithms
\end{itemize}
\onslide<2>\addimage{\FIGREP/matmul}{7cm}{4.5cm}{0.5cm}
\end{frame}
\begin{frame}[fragile,exercise,t]
\frametitle{Parallelization}
\framesubtitle{When to parallelize}
\begin{itemize}
\item Only when your code has \textit{no bugs} and is \textit{optimized}
\item Are you ready to parallelize?
\begin{enumerate}
\item Is it worth parallelizing my code? Does my algorithm scale?
\item Performance prediction?
\item Profiling?
\item Bottlenecks?
\item Which parallel paradigm should I use? What is the target architecture
(SMP, cluster, GPU, hybrid, etc)?
\end{enumerate}
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise,t]
\frametitle{Parallelization}
\framesubtitle{When to parallelize}
In 1991, David H. Bailey published a famous paper: \href{https://www.davidhbailey.com/dhbpapers/twelve-ways.pdf}{Twelve ways to fool
the masses when giving performance results on parallel computers}
\vspace{1cm}
\textit{6: Compare your results against scalar, unoptimized code on Crays.}
\addimage{\FIGREP/dhb}{7cm}{4.5cm}{0.5cm}
\end{frame}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End:
