diff --git a/src/performance_measurement/figures/roofline.svg b/src/performance_measurement/figures/roofline.svg index 6069a47..f33400c 100644 --- a/src/performance_measurement/figures/roofline.svg +++ b/src/performance_measurement/figures/roofline.svg @@ -1,368 +1,368 @@ image/svg+xml - + 1/8 1/4 1/2 1 2 4 8 16 32 64 I FLOPS Pmax I bs only ~20% of HW capabilities + style="font-size:2.11667px;fill:#dc0000;fill-opacity:1;stroke-width:0.264583">Lot of potential improvement! diff --git a/src/performance_measurement/figures/roofline_3.pdf b/src/performance_measurement/figures/roofline_3.pdf index 813b65d..26fe06f 100644 Binary files a/src/performance_measurement/figures/roofline_3.pdf and b/src/performance_measurement/figures/roofline_3.pdf differ diff --git a/src/performance_measurement/figures/roofline_extended.pdf b/src/performance_measurement/figures/roofline_extended.pdf new file mode 100644 index 0000000..ac7d4a9 Binary files /dev/null and b/src/performance_measurement/figures/roofline_extended.pdf differ diff --git a/src/performance_measurement/figures/roofline.svg b/src/performance_measurement/figures/roofline_extended.svg similarity index 68% copy from src/performance_measurement/figures/roofline.svg copy to src/performance_measurement/figures/roofline_extended.svg index 6069a47..1eba7f8 100644 --- a/src/performance_measurement/figures/roofline.svg +++ b/src/performance_measurement/figures/roofline_extended.svg @@ -1,368 +1,490 @@ + sodipodi:docname="roofline_extended.svg"> image/svg+xml - + + + + + Pmax + + + + + L3 cache + + L2 cache + L1 cache + RAM + + + + No FMA + FMA + AVX + + - + 1/8 1/4 1/2 1 2 4 8 16 32 64 I FLOPS - - - - - Pmax - I bs - - - - - - only ~20% of HW capabilities - diff --git a/src/performance_measurement/performance_measurement.tex b/src/performance_measurement/performance_measurement.tex index 84c26f0..a3706a3 100644 --- a/src/performance_measurement/performance_measurement.tex +++ 
b/src/performance_measurement/performance_measurement.tex @@ -1,355 +1,398 @@ \renewcommand{\FIGREP}{src/performance_measurement/figures} \section{Performance measurement} \label{sec:performance_measurement} \begin{frame} \frametitle{Goal of this section} \framesubtitle{} \begin{itemize} \item Key concepts to understand and quantify performance \item Using a profiler \item Roofline model \end{itemize} \end{frame} \subsection{Performance metrics} \label{sec:metrics} \begin{frame} \frametitle{Performance metrics} \framesubtitle{} \begin{itemize} \item How can we quantify performance? \item We need to define a means to measure it \item Plethora of metrics, but for scientific codes a few are particularly interesting \end{itemize} \vfill \pause \begin{itemize} \item The first that comes to mind is \textit{time} \item It gives a good indication about how long you have to wait \end{itemize} \vfill \pause \begin{itemize} \item Scientific codes do computations on floating point numbers \item A second metric is the number of \textit{floating-point operations per second} (FLOPS) \end{itemize} \vfill \pause \begin{itemize} \item Finally, the \textit{memory bandwidth} indicates how much data your code transfers per unit of time \end{itemize} \end{frame} \note{ \begin{itemize} \item My code is super fast, it runs in 2.5\,ns! \item It seems fast, but is it? How fast can your hardware go? \item To really understand how much your code exploits the hardware, we use the FLOPS and memory BW \item Your hardware has theoretical maximum values for those \item You can compare the values from your code to the max to see how well you use the hardware \end{itemize} } \begin{frame} \frametitle{FLOPS and memory bandwidth} \framesubtitle{} \begin{itemize} \item How to measure FLOPS? \begin{itemize} \item By hand, dividing the number of operations by the running time \item Using tools such as PAPI, Tau, likwid, Intel Amplxe, etc.
\end{itemize} \end{itemize} \vfill \begin{itemize} \item Memory bandwidth measures the amount of data transferred per unit of time [(k,M,G,T)byte / s] \item How to measure it? \begin{itemize} \item By hand dividing the amount of data transferred by the running time \item Using tools such as STREAM, PAPI, Tau, Intel Amplxe, etc. \end{itemize} \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Performance measurement} \framesubtitle{A simple DAXPY example} \begin{itemize} \item Assume Intel Xeon 6132 (Fidis), no cache \end{itemize} \vfill \begin{cxxcode} N = 1e8; for (int i = 0; i < N; ++i) { c[i] = a[i] + alpha * b[i]; } \end{cxxcode} \begin{itemize} \item My code runs in 174.25\,ms. It is amazingly fast! \end{itemize} \pause \vfill \begin{itemize} \item Each iteration has 2 FLOP (1 add and 1 mul) \item Our hardware can achieve a theoretical peak performance of 1.16\,TFLOPS \item Our code $2 \cdot 10^{8} / (174.25 \cdot 10^{-3}) = 0.001$\,TFLOPS... \end{itemize} \end{frame} \subsection{Scalings} \label{sec:scalings} \begin{frame} \frametitle{Scalings} \framesubtitle{} \begin{itemize} \item Scalings are a way to assess how well a program performs when adding computational resources \item Two types of scalings: \begin{description} \item[Strong scaling] Add resources, keep amount of work constant \item[Weak scaling] Add both resources and amount of work \end{description} \end{itemize} \vfill \begin{itemize} \item Compare timings with $n$ processes, $T_{n}$, against the one-process case, $T_{1}$ \end{itemize} \vfill \begin{minipage}{0.47\linewidth} \begin{center} \textbf{Speedup} \end{center} \begin{equation*} S(n) = \frac{T_{1}}{T_{n}} \end{equation*} \end{minipage} \hfill \begin{minipage}{0.47\linewidth} \begin{center} \textbf{Efficiency} \end{center} \begin{equation*} E(n) = \frac{S(n)}{n} \end{equation*} \end{minipage} \vfill \begin{itemize} \item We want $S(n)$ as close to $n$ and $E(n)$ as close to 1 as possible \end{itemize} \end{frame}
\subsection{Amdahl's law} \label{sec:amdahl} \begin{frame}[t] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Amdahl's law gives you an upper bound on the achievable speedup (for a fixed problem size) \pause \item Assume a fraction $p$ of your code is (perfectly) parallel and timing with 1 process is $T_{1}$ \item Timing with $n$ processes is \begin{equation*} T_{n} = (1-p) T_{1} + \frac{p}{n}T_{1} = \left[ (1-p) + \frac{p}{n}\right] T_{1} \end{equation*} \pause \item Speedup becomes \begin{equation*} S(n) = \frac{T_{1}}{T_{n}} = \frac{1}{(1-p) + \frac{p}{n}} \end{equation*} \end{itemize} \vspace{2cm} \begin{itemize} \pause \item In the limit of infinite resources \begin{equation*} \lim_{n\rightarrow\infty}S(n) = \frac{1}{1-p} \end{equation*} \end{itemize} \onslide<2->\addimage{\FIGREP/amdahl_illustration}{4.5cm}{6.75cm}{2.0cm} \end{frame} \begin{frame}[b] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Limited by the serial part (very sensitive)! \item Does this mean we cannot exploit large HPC machines? \end{itemize} \addimage{\FIGREP/amdahl}{8.cm}{4cm}{1.7cm} \end{frame} \subsection{Gustafson's law} \label{sec:gustafson} \begin{frame}[t] \frametitle{Gustafson's law} \framesubtitle{} \begin{itemize} \item Variation of Amdahl where the problem size is not fixed \pause \item In general, the serial part does not scale much with problem size, e.g.
reading data from disk \item Increase problem size by $n$ \pause \item Timings with 1 process and $n$ processes are \begin{equation*} T_{1} = (1-p) + np, \qquad T_{n} = (1-p) + \frac{np}{n} \end{equation*} \pause \item Speedup becomes \begin{equation*} S(n) = \frac{T_{1}}{T_{n}} = (1-p) + np \end{equation*} \item In the limit of infinite resources \begin{equation*} \lim_{n\rightarrow\infty}S(n) = \infty \end{equation*} \end{itemize} \end{frame} \subsection{Pareto principle} \label{sec:pareto} \begin{frame} \frametitle{Pareto principle} \framesubtitle{The 80/20 rule} \begin{itemize} \item General principle that states that 80\% of the effect comes from 20\% of causes \item Applies in many domains and especially in optimization \item 80\% of the time is spent in 20\% of your code \item Concentrate on those 20\% and don't arbitrarily optimize \end{itemize} \end{frame} \subsection{Profiling} \label{sec:profiling} \begin{frame} \frametitle{Profiling} - \framesubtitle{} + \framesubtitle{A precious ally for optimization} + \begin{itemize} + \item Where is my application spending most of its time? + \begin{itemize} + \item (bad) measure time ``by hand'' using timings and prints + \item (good) use a tool made for this, e.g. Intel Amplifier, Scorep, + gprof + \end{itemize} + \end{itemize} + \vfill + \begin{itemize} + \item In addition to timings, profilers give you a lot more information on + \begin{itemize} + \item Memory usage + \item Hardware counters + \item CPU activity + \item MPI communications + \item etc. + \end{itemize} + \end{itemize} \end{frame} \subsection{Roofline model} \label{sec:roofline} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} \item How well am I exploiting the hardware resources? 
\item The roofline model is a performance model allowing to have an estimate to this question \end{itemize} \vspace{1cm} \pause \begin{itemize} \item Key concept: the arithmetic intensity, $AI$, of an algorithm is \# FLOP / byte of data loaded \item It measures data reuse \end{itemize} \addimage{\FIGREP/ai}{8.cm}{4cm}{0.5cm} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} - \item Roofline model is plotted on log-log scale + \item Roofline model is plotted on \textbf{log-log scale} \begin{itemize} \item x-axis is the AI \item y-axis is FLOPS \end{itemize} \pause \item The hardware limits are defined by \begin{equation*} P = \min(P_{\text{max}}, I\cdot b_{s}) \end{equation*} \begin{itemize} \item $P_{\text{max}}$ is the CPU peak FLOPS \item$I$ is the intensity \item $b_{s}$ is the memory BW \end{itemize} \end{itemize} \onslide<1>\addimage{\FIGREP/roofline_1}{5cm}{5.5cm}{0.5cm} \onslide<2>\addimage{\FIGREP/roofline_2}{5cm}{5.5cm}{0.5cm} \onslide<3>\addimage{\FIGREP/roofline_3}{5cm}{5.5cm}{0.5cm} \end{frame} +\begin{frame}[t] + \frametitle{Roofline model} + \framesubtitle{} + + \begin{itemize} + \item Refinements can be made to the Roofline model + \item Adding a memory hierarchy with caches + \item Adding different levels of DLP + \item They give you hint on what to optimize for + \end{itemize} + \addimage{\FIGREP/roofline_extended}{7cm}{4.5cm}{0.5cm} +\end{frame} + \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{How to find the peak performance} \begin{itemize} \item Theoretical peak performance \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} \text{Number of FP ports (Superscalar)} \\ & \times \text{flops} / \text{cycles (e.g. 
2 for FMA)} \\ & \times \text{vector size (DLP)} \\ & \times \text{frequency (in GHz)} \\ & \times \text{number of cores (TLP)} \end{align*} \item Example: \href{https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)}{Intel Xeon 6132} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} 2 \text{ (Superscalar)} \\ & \times 2 \text{ (e.g. 2 for FMA)} \\ & \times 8 \text{ (DLP)} \\ & \times 2.3 \text{ (frequency)} \\ & \times 14 \text{ (TLP)} \\ = & 1.16\,\text{TFLOPS} \end{align*} \pause \item Or use a software that estimates it \end{itemize} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{How to find the memory bandwidth} \begin{itemize} - \item Theoretical memory bandwidth can be found on the CPU spec. list + \item Theoretical memory bandwidth can be found from the RAM spec. + \begin{align*} + \text{BW}_{\text{max}} = & \textcolor{white}{\times} \text{RAM frequency} \\ + & \times \text{Number of transfers per clock cycle} \\ + & \times \text{Bus width} \\ + & \times \text{Number of interfaces} + \end{align*} + \item In general, we suppose that RAM matches CPU bandwidth (found on the CPU spec. list) \item Example: \href{https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)}{Intel Xeon 6132} \begin{itemize} \item 21.33\,GB/s for 1 channel \item Maximum of 128\,GB/s \end{itemize} \pause \item Or use a software that estimates it \end{itemize} + + \begin{itemize} + \item A corollary from ``theoretical'' is that it is not achievable in practice! 
+ \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Roofline model} \framesubtitle{How to find arithmetic intensity} \begin{itemize} \item For very simple algorithms, you can compute the AI \item Let's take back the DAXPY example \begin{cxxcode} N = 1e8; for (int i = 0; i < N; ++i) { c[i] = a[i] + alpha * b[i]; } \end{cxxcode} \item There are 2 operations (1 add and 1 mul) \item Three 8-byte memory operations (2 loads and 1 store) \item The AI is then $2/24 = 1/12$ \pause \item For more complex algorithms, use a tool, e.g. Intel Advisor \end{itemize} \end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: