diff --git a/src/performance_measurement/performance_measurement.tex b/src/performance_measurement/performance_measurement.tex
index 1ba6ce4..b661dfd 100644
--- a/src/performance_measurement/performance_measurement.tex
+++ b/src/performance_measurement/performance_measurement.tex
@@ -1,598 +1,598 @@
\renewcommand{\FIGREP}{src/performance_measurement/figures}

\section{Performance measurement}
\label{sec:performance_measurement}

\intersec{helvetios}

\begin{frame}
  \frametitle{Goal of this section}
  \framesubtitle{}

  \begin{itemize}
  \item Key concepts to quantify performance
    \begin{itemize}
    \item Metrics
+    \item Using a profiler
    \item Scalings, speedup, efficiency
    \end{itemize}
  \item Roofline model
-  \item Using a profiler
  \end{itemize}
\end{frame}

\subsection{Performance metrics}
\label{sec:metrics}

\begin{frame}
  \frametitle{Performance metrics}
  \framesubtitle{}

  \begin{itemize}
  \item How can we quantify performance?
  \item We need to define a means to measure it
  \item We will focus on the most interesting metrics for HPC
  \end{itemize}
  \vfill
  \pause
  \begin{itemize}
  \item The first that comes to mind is \textit{time}, e.g. time-to-solution
  \item Derived metrics: speedup and efficiency
  \end{itemize}
  \vfill
  \pause
  \begin{itemize}
  \item Scientific codes do computations on floating point numbers
  \item A second metric is the number of \textit{floating-point operations per second} (\si{\flops})
  \end{itemize}
  \vfill
  \pause
  \begin{itemize}
  \item Finally, the \textit{memory bandwidth} indicates how much data your code transfers per unit of time
  \end{itemize}
\end{frame}

\note{
  \begin{itemize}
  \item My code is super fast, it runs in $2.5\si{\ns}$!
  \item It seems fast, but is it? How fast can your hardware go?
  \item To really understand how well your code exploits the hardware, we use the \si{\flops} and memory BW
  \item Your hardware has theoretical maximum values for those
  \item You can compare the values from your code to the max to see how well you use the hardware
  \end{itemize}
}
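+% NOTE: illustrative addition -- a minimal timing sketch; assumes a `cxxcode`
+% minted environment analogous to the `bashcode` one used elsewhere in this deck.
+\begin{frame}[fragile]
+  \frametitle{Performance metrics}
+  \framesubtitle{Measuring time-to-solution (sketch)}
+
+  \begin{itemize}
+  \item A minimal sketch of how a time-to-solution measurement could look in C++
+  \end{itemize}
+  \begin{cxxcode}
+    #include <chrono>
+
+    // Wrap only the region of interest, not initialization or I/O
+    auto start = std::chrono::steady_clock::now();
+    compute();  // hypothetical kernel to be measured
+    auto stop = std::chrono::steady_clock::now();
+
+    std::chrono::duration<double> elapsed = stop - start;
+    // Derived metrics divide the work done by elapsed.count() (seconds)
+  \end{cxxcode}
+\end{frame}
+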
+\subsection{Profiling}
+\label{sec:profiling}
+
+\begin{frame}
+  \frametitle{Profiling}
+  \framesubtitle{A precious ally for optimization}
+
+  \begin{itemize}
+  \item Where is my application spending most of its time?
+    \begin{itemize}
+    \item (bad) measure time ``by hand'' using timings and prints
+    \item (good) use a tool made for this, e.g. Intel Amplifier, Score-P, gprof
+    \end{itemize}
+  \end{itemize}
+  \vfill
+  \begin{itemize}
+  \item In addition to timings, profilers give you a lot more information on
+    \begin{itemize}
+    \item Memory usage
+    \item Hardware counters
+    \item CPU activity
+    \item MPI communications
+    \item etc.
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile,exercise]
+  \frametitle{Profiling}
+  \framesubtitle{Interactive demonstration}
+
+  \begin{itemize}
+  \item For the purpose of this exercise, we will use MiniFE
+    \begin{itemize}
+    \item 3D implicit finite-elements on an unstructured mesh
+    \item C++ mini application
+    \item \url{https://github.com/Mantevo/miniFE}
+    \item You don't need to understand what the code does!
+    \end{itemize}
+  \item We will use Intel VTune, part of the \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html\#base-kit}{OneAPI Base toolkit (free)}
+  \end{itemize}
+  \vfill
+  \begin{itemize}
+  \item Download miniFE
+  \item Compile the basic version found in \cmd{ref/src}
+  \item Profile the code using the hotspot analysis
+  \item Open Intel VTune and select your timings
+  \item Play around and find the 5 most time-consuming functions
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Compile MiniFE}
+
+  \begin{itemize}
+  \item Download miniFE
+    \begin{bashcode}
+      $> git clone https://github.com/Mantevo/miniFE.git
+      $> cd miniFE
+    \end{bashcode}
+  \item Compile the basic version found in \code{ref/src}
+    \begin{itemize}
+    \item You will need to load a compiler and an MPI library
+      \begin{bashcode}
+        $> module load intel intel-mpi intel-vtune
+      \end{bashcode}%$
+    \item Change the \cmd{Makefile} to set \cmd{CXX=mpiicpc} and \cmd{CC=mpiicc} and compile
+      \begin{bashcode}
+        $> make
+      \end{bashcode}%$
+    \item Make sure to compile your code with \cmd{-g -O3}
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Profile MiniFE}
+
+  \begin{itemize}
+  \item Profile the code using
+    \begin{bashcode}
+      $> srun -n 1 amplxe-cl -collect hotspots -r prof_results -- ./miniFE.x -nx 128 -ny 128 -nz 128
+    \end{bashcode}%$
+  \item This will profile for the ``hotspots'' and store the timings in \cmd{prof\_results}
+  \item You can get more info on the types of analysis with
+    \begin{bashcode}
+      $> amplxe-cl -h collect
+    \end{bashcode}%$
+  \item Open Intel VTune and select your timings
+    \begin{bashcode}
+      $> amplxe-gui prof_results/prof_results.amplxe
+    \end{bashcode}%$
+  \item Play around and find the 5 most time-consuming functions
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{What do we learn?}
+
+  \begin{itemize}
+  \item 50.0\% of the time spent in matrix/vector multiplications
+  \item 12.5\% of the time spent imposing boundary conditions
+  \item etc.
+  \item Does the problem size influence the timings?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Smaller problem}
+
+  \begin{itemize}
+  \item This time, we profile a problem of size $(16, 16, 16)$
+  \item 13.6\% of the time is spent opening libraries
+  \item 13.6\% of the time is spent initializing MPI
+  \item etc.
+  \item Depending on the problem size, different parts of the code will dominate
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Some tips and tricks}
+
+  \begin{itemize}
+  \item Profile a code without bugs!
+  \item Choose the right problem size (representative of your simulations)
+  \item Focus on the functions taking the most time first
+  \item If the profile is not clear enough, try refactoring into smaller functions
+    \begin{itemize}
+    \item Some profilers, e.g. Score-P, let you define custom regions (see the sketch on the next slide)
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
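+% NOTE: illustrative addition -- a minimal sketch of Score-P user regions,
+% assuming the `cxxcode` environment and Score-P's user instrumentation API;
+% the code must be built with the `scorep --user` wrapper.
+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Custom regions with Score-P (sketch)}
+
+  \begin{itemize}
+  \item A minimal sketch, assuming Score-P's user instrumentation API
+  \end{itemize}
+  \begin{cxxcode}
+    #include <scorep/SCOREP_User.h>
+
+    void solve() {
+      // Declare a handle, then mark the region of interest
+      SCOREP_USER_REGION_DEFINE(assembly);
+      SCOREP_USER_REGION_BEGIN(assembly, "assembly",
+                               SCOREP_USER_REGION_TYPE_COMMON);
+      assemble_matrix();  // hypothetical function to be timed
+      SCOREP_USER_REGION_END(assembly);
+    }
+  \end{cxxcode}
+\end{frame}
+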
\subsection{Scalings, speedup and efficiency}
\label{sec:scalings}

\begin{frame}
  \frametitle{Speedup and efficiency}
  \framesubtitle{}

  \begin{itemize}
  \item Two important metrics are derived from timings
  \item Compare timings with $n$ processes, $T_{n}$, against the reference timing, $T_\text{ref}$
  \end{itemize}
  \vfill
  \begin{minipage}{0.3\linewidth}
    \begin{center}
      \textbf{Speedup}
    \end{center}
    \begin{equation*}
      S(n) = \frac{T_\text{ref}}{T_{n}}
    \end{equation*}
  \end{minipage}
  \hspace{0.5cm}
  \begin{minipage}{0.3\linewidth}
    \begin{center}
      \textbf{Efficiency}
    \end{center}
    \begin{equation*}
      E(n) = \frac{S(n)}{n}
    \end{equation*}
  \end{minipage}
  \vfill
  \begin{itemize}
  \item We want $S(n)$ as close to $n$ and $E(n)$ as close to 1 (100\%) as possible
  \end{itemize}
\end{frame}

\begin{frame}[t]
  \frametitle{Strong scaling}
  \framesubtitle{}

  \begin{itemize}
  \item Scalings are a way to assess how well a program performs when adding computational resources
  \item Strong scaling: add resources, keep the total amount of work constant
    \begin{equation*}
      S(n) = \frac{T_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{nT_{n}}
    \end{equation*}
  \item Strong scaling is an indication of how profitable it is to add resources to solve your problem
  \end{itemize}

  \addimage[width=6cm]{\FIGREP/strong_scaling}{5cm}{1cm}
\end{frame}

\begin{frame}[t]
  \frametitle{Weak scaling}
  \framesubtitle{}

  \begin{itemize}
  \item Weak scaling: add resources and keep the amount of work per resource constant
    \begin{equation*}
      S(n) = \frac{nT_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{T_{n}}
    \end{equation*}
  \item Weak scaling is an indication of how well your code will perform on a bigger machine (and with a bigger problem)
  \item These scalings are always required for a proposal (worked example on the next slide)
    \begin{itemize}
    \item For strong scalings the metric is speedup (how do I improve performance)
    \item For weak scalings the metric is efficiency (how well performance is kept)
    \end{itemize}
  \end{itemize}

  \addimage[width=6cm]{\FIGREP/weak_scaling}{5cm}{1cm}
\end{frame}

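+% NOTE: illustrative addition -- a worked example with made-up timings to
+% ground the speedup/efficiency definitions above.
+\begin{frame}
+  \frametitle{Speedup and efficiency}
+  \framesubtitle{A worked example (hypothetical timings)}
+
+  \begin{itemize}
+  \item Strong scaling: $T_{1} = \SI{100}{\second}$ and $T_{8} = \SI{16}{\second}$
+    \begin{equation*}
+      S(8) = \frac{T_{1}}{T_{8}} = \frac{100}{16} = 6.25, \qquad
+      E(8) = \frac{S(8)}{8} \approx 0.78
+    \end{equation*}
+  \item Weak scaling ($8\times$ larger problem on 8 processes): $T_{8} = \SI{110}{\second}$
+    \begin{equation*}
+      E(8) = \frac{T_{1}}{T_{8}} = \frac{100}{110} \approx 0.91
+    \end{equation*}
+  \item The ideal values would be $S(8) = 8$ and $E(8) = 1$
+  \end{itemize}
+\end{frame}
+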
\subsection{Amdahl's law}
\label{sec:amdahl}

\begin{frame}[t]
  \frametitle{Amdahl's law}
  \framesubtitle{}

  \begin{itemize}
  \item Amdahl's law gives you an upper bound to the achievable speedup for a fixed problem size
  \item By definition it is a strong scaling analysis
    \vfill
    \pause
  \item Assume a fraction $p$ of your code is (perfectly) parallel and the timing with 1 process is $T_{1}$
  \item The timing with $n$ processes is
    \begin{equation*}
      T_{n} = (1-p) T_{1} + \frac{p}{n}T_{1} = \left[ (1-p) + \frac{p}{n}\right] T_{1}
    \end{equation*}
    \pause
  \item The speedup becomes
    \begin{equation*}
      S(n) = \frac{T_{1}}{T_{n}} = \frac{1}{(1-p) + \frac{p}{n}}
    \end{equation*}
    \vfill
    \pause
  \item In the limit of infinite resources
    \begin{equation*}
      \lim_{n\rightarrow\infty}S(n) = \frac{1}{1-p}
    \end{equation*}
  \end{itemize}

  \onslide<2->\addimage[width=3cm]{\FIGREP/amdahl_illustration}{12.5cm}{1.0cm}
\end{frame}

\begin{frame}[b]
  \frametitle{Amdahl's law}
  \framesubtitle{}

  \begin{itemize}
  \item Limited by the serial part (very sensitive)!
  \item Does this mean we cannot exploit large HPC machines?
    \pause
  \item No, in general with more resources, we simulate larger systems $\Rightarrow$ weak scaling (see \href{https://en.wikipedia.org/wiki/Gustafson\%27s_law}{Gustafson's law})
  \end{itemize}

  \onslide<1->\addimage[width=8.cm]{\FIGREP/amdahl_speedup}{4cm}{2cm}
\end{frame}

\begin{frame}
  \frametitle{\si{\flops} and memory bandwidth}
  \framesubtitle{}

  \begin{itemize}
  \item FLOPs are floating point operations, e.g. $+, -, \times, \div$
  \item Can be evaluated by hand, dividing the number of operations by the running time
    \vfill
  \item Memory bandwidth measures the amount of data transferred per unit of time [\si{\byte\per\second}, \si{\kibi\byte\per\second}, \si{\mebi\byte\per\second}, \si{\gibi\byte\per\second}, ...]
  \item Can be measured by hand, dividing the amount of data transferred by the running time
    \vfill
  \item In both cases, one generally uses tools such as PAPI, Tau, likwid, Intel Amplxe, STREAM, etc.
  \end{itemize}
\end{frame}

\begin{frame}[t,fragile]
  \frametitle{Performance measurement}
  \framesubtitle{A simple DAXPY example}

  \begin{itemize}
  \item Assume \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132} (Gacrux)
  \end{itemize}
  \cxxfile[%
  title={optimization/daxpy.cc},
  minted options app={
%    highlightlines={2, 7},
    firstline=25,
    lastline=27,
    firstnumber=1,
  }]{examples/optimization/daxpy.cc}
  \begin{itemize}
  \item My code runs in \SI{174.25}{\ms}. It is amazingly fast!
  \end{itemize}
  \pause
  \vfill
  \begin{itemize}
  \item Each iteration has 2 FLOP (1 add and 1 mul) and there are \cmd{N = 1e8} iterations
-  \item Our hardware can achieve a theoretical peak performance of $\SI{1.16}{\tera\flops}$
-  \item Our code $\SI{2d8}{\flop} / \SI{174.25d-3}{\second} = \SI{0.001}{\tera\flops}$...
+  \item Our code achieves $\SI{2d8}{\flop} / \SI{174.25d-3}{\second} = \SI{0.001}{\tera\flops}$
+  \item Our hardware can achieve a theoretical peak performance of $\SI{1.16}{\tera\flops}$...
  \end{itemize}
  \pause
  \vfill
  \begin{itemize}
  \item Each iteration has 3 memory operations (2 loads and 1 store)
-  \item Our hardware can achieve a theoretical memory bandwidth of $\SI{125}{\gibi\byte\per\second}$
-  \item Our code $\SI{2.23}{\gibi\byte} / \SI{174.25d-3}{\second} = \SI{12.82}{\gibi\byte\per\second}$...
+  \item Our code achieves $\SI{2.23}{\gibi\byte} / \SI{174.25d-3}{\second} = \SI{12.82}{\gibi\byte\per\second}$
+  \item Our hardware can achieve a theoretical memory bandwidth of $\SI{125}{\gibi\byte\per\second}$...
  \end{itemize}
\end{frame}

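+% NOTE: illustrative addition -- a minimal sketch (assuming the `cxxcode`
+% environment) of the by-hand arithmetic behind the DAXPY metrics above.
+\begin{frame}[fragile]
+  \frametitle{Performance measurement}
+  \framesubtitle{Computing the DAXPY metrics by hand (sketch)}
+
+  \begin{itemize}
+  \item A minimal sketch of the arithmetic from the previous slide
+  \end{itemize}
+  \begin{cxxcode}
+    const std::size_t N = 100'000'000;        // 1e8 iterations
+    double elapsed = 174.25e-3;               // measured time [s]
+
+    double flop  = 2.0 * N;                   // 1 add + 1 mul per iteration
+    double bytes = 3.0 * sizeof(double) * N;  // 2 loads + 1 store per iteration
+
+    double flops = flop / elapsed;               // ~1.15e9 flop/s = 0.001 Tflop/s
+    double bw    = bytes / elapsed / (1 << 30);  // ~12.8 GiB/s
+  \end{cxxcode}
+\end{frame}
+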
\subsection{Roofline model}
\label{sec:roofline}

\begin{frame}[t]
  \frametitle{Roofline model}
  \framesubtitle{}

  \begin{itemize}
  \item How well am I exploiting the hardware resources?
  \item The roofline model is a performance model that provides an estimate to answer this question
  \end{itemize}
  \vspace{1cm}
  \pause
  \begin{itemize}
  \item Key concept: the arithmetic intensity, $AI$, of an algorithm is the number of \si{\flop} per byte of data transferred
  \item It measures data reuse
  \end{itemize}

  \addimage[width=8.cm]{\FIGREP/ai}{4cm}{0.5cm}
\end{frame}

+\begin{frame}[t,fragile]
+  \frametitle{Roofline model}
+  \framesubtitle{How to find the arithmetic intensity}
+  \begin{itemize}
+  \item For very simple algorithms, you can compute the AI by hand
+  \item Let's revisit the DAXPY example
+    \cxxfile[%
+    title={optimization/daxpy.cc},
+    minted options app={
+%      highlightlines={2, 7},
+      firstline=25,
+      lastline=27,
+      firstnumber=1,
+    }]{examples/optimization/daxpy.cc}
+  \item There are 2 operations (1 add and 1 mul)
+  \item There are three 8-byte memory operations (2 loads and 1 store)
+  \item The AI is then $2/24 = 1/12$
+    \pause
+  \item For more complex algorithms, use a tool, e.g. Intel Advisor
+  \end{itemize}
+\end{frame}
+
\begin{frame}[t]
  \frametitle{Roofline model}
-  \framesubtitle{}
+  \framesubtitle{Building the model}

  \begin{itemize}
  \item The roofline model is plotted on a \textbf{log-log scale}
    \begin{itemize}
    \item x-axis is the $AI$
    \item y-axis is \si{\flops}
    \end{itemize}
    \pause
  \item The hardware limits are defined by
    \begin{equation*}
      P = \min(P_{\text{max}}, b_{s} \cdot AI)
    \end{equation*}
    \begin{itemize}
    \item $P_{\text{max}}$ is the CPU peak \si{\flops}
    \item $AI$ is the intensity
    \item $b_{s}$ is the memory BW
    \end{itemize}
  \end{itemize}

  \onslide<1>\addimage[width=5cm]{\FIGREP/roofline_1}{5.5cm}{0.5cm}
  \onslide<2>\addimage[width=5cm]{\FIGREP/roofline_2}{5.5cm}{0.5cm}
  \onslide<3>\addimage[width=5cm]{\FIGREP/roofline_3}{5.5cm}{0.5cm}
\end{frame}

\begin{frame}[t]
  \frametitle{Roofline model}
-  \framesubtitle{}
+  \framesubtitle{Building the model}

  \begin{itemize}
  \item Refinements can be made to the roofline model
  \item Adding a memory hierarchy with caches
  \item Adding different levels of DLP (Data-Level Parallelism)
  \item They give you hints on what to optimize for
  \end{itemize}

  \addimage[width=7cm]{\FIGREP/roofline_extended}{4.5cm}{0.5cm}
\end{frame}

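+% NOTE: illustrative addition -- a worked application of the roofline formula
+% above, using the peak values derived on the following slides.
+\begin{frame}
+  \frametitle{Roofline model}
+  \framesubtitle{Placing DAXPY on the roofline}
+
+  \begin{itemize}
+  \item With $P_{\text{max}} \approx \SI{1.16}{\tera\flops}$, $b_{s} \approx \SI{119}{\gibi\byte\per\second} \approx \SI{128d9}{\byte\per\second}$ and $AI = 1/12$
+    \begin{equation*}
+      P = \min(P_{\text{max}}, b_{s} \cdot AI)
+        \approx \min\left(\SI{1.16d12}{\flops},\ \SI{128d9}{\byte\per\second} \cdot \tfrac{1}{12}\right)
+        \approx \SI{10.7}{\giga\flops}
+    \end{equation*}
+  \item DAXPY sits far below the ridge point: it is \textbf{memory bound}
+  \item More compute will not help; only a higher $AI$ or memory BW will
+  \end{itemize}
+\end{frame}
+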
\begin{frame}[fragile,t]
  \frametitle{Roofline model}
  \framesubtitle{How to find the peak performance}
  \begin{itemize}
  \item Theoretical peak performance\\
    \vspace{-2ex}
    \begin{minipage}{.4\linewidth}
      \begin{align*}
        P_{\text{max}} = & \textcolor{white}{\times} \text{Number of FP ports (ILP)} \\
                         & \times \text{flops} / \text{cycle (e.g. 2 for FMA)} \\
                         & \times \text{vector size (DLP)} \\
                         & \times \text{frequency (in GHz)} \\
                         & \times \text{number of cores (TLP)}
      \end{align*}
    \end{minipage}
    \vspace{3ex}
  \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132}\\
    \vspace{-2ex}
    \begin{minipage}{.4\linewidth}
      \begin{align*}
        P_{\text{max}} = & \textcolor{white}{\times} 2 \text{ (ports)} \\
                         & \times \SI{2}{\flop\per\cycle} \text{ (2 for FMA)} \\
                         & \times \frac{\SI{512}{\bit} \text{ (AVX512)} }{\SI{64}{\bit}\text{ (double)}} \\
                         & \times \SI{2.3}{\giga\hertz} \\
                         & \times 14 \text{ (cores)} \\
        =                & \SI{1.16}{\tera\flops}
      \end{align*}
    \end{minipage}
    \addimage[width=6cm]{\FIGREP/skylake_server_block_diagram}{9cm}{0.8cm}
    \pause
    \vspace{3ex}
  \item Or use a software tool that estimates it
  \end{itemize}
\end{frame}

\begin{frame}[t]
  \frametitle{Roofline model}
  \framesubtitle{How to find the memory bandwidth}
  \begin{itemize}
  \item Theoretical memory bandwidth of the memory
    \begin{align*}
      \text{BW}_{\text{max}} = &\textcolor{white}{\times} \text{Number of transfers per second} \\
                               & \times \text{Bus width} \\
                               & \times \text{Number of interfaces}
    \end{align*}
  \item In general, we suppose that the RAM matches the CPU bandwidth (found in the CPU specifications)
  \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132}
    \begin{align*}
      \text{BW}_{\text{max}} = &\textcolor{white}{\times} \SI{2666}{\mega\transfer\per\second} \text{ (DDR4 2666)} \\
                               & \times \SI{8}{\byte\per\transfer} \text{ (64-bit bus)}\\
                               & \times 6 \text{ (channels)}
    \end{align*}
    \begin{itemize}
    \item $\SI{19.86}{\gibi\byte\per\second}$ for 1 channel
    \item Maximum of $\SI{119.18}{\gibi\byte\per\second}$
    \end{itemize}
    \pause
  \item Or use a software tool that estimates it
  \end{itemize}

  \begin{itemize}
  \item A corollary of ``theoretical'' is that it is not achievable in practice! (see the measurement sketch on the next slide)
  \end{itemize}
\end{frame}

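+% NOTE: illustrative addition -- a STREAM-like triad sketch (assuming the
+% `cxxcode` environment) to measure the *sustainable* memory bandwidth;
+% the official STREAM benchmark is the reference tool.
+\begin{frame}[fragile]
+  \frametitle{Roofline model}
+  \framesubtitle{Measuring sustainable bandwidth (sketch)}
+
+  \begin{itemize}
+  \item A minimal STREAM-like triad, as a sketch of what such tools measure
+  \end{itemize}
+  \begin{cxxcode}
+    #include <chrono>
+    #include <vector>
+
+    std::size_t N = 1 << 27;  // 1 GiB per array, large enough to defeat the caches
+    std::vector<double> a(N), b(N, 1.), c(N, 2.);
+
+    auto start = std::chrono::steady_clock::now();
+    for (std::size_t i = 0; i < N; ++i)
+      a[i] = b[i] + 3. * c[i];  // triad: 2 loads + 1 store per iteration
+    auto stop = std::chrono::steady_clock::now();
+
+    double seconds = std::chrono::duration<double>(stop - start).count();
+    double bw = 3. * sizeof(double) * N / seconds / (1 << 30);  // GiB/s
+  \end{cxxcode}
+\end{frame}
+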
-\begin{frame}[t,fragile]
-  \frametitle{Roofline model}
-  \framesubtitle{How to find arithmetic intensity}
-  \begin{itemize}
-  \item For very simple algorithms, you can compute the AI
-  \item Let's take back the DAXPY example
-    \cxxfile[%
-    title={optimization/daxpy.cc},
-    minted options app={
-%      highlightlines={2, 7},
-      firstline=25,
-      lastline=27,
-      firstnumber=1,
-    }]{examples/optimization/daxpy.cc}
-  \item There are 2 operations (1 add and 1 mul)
-  \item Three 8-byte memory operations (2 loads and 1 store)
-  \item The AI is then $2/24 = 1/12$
-    \pause
-  \item For more complex algorithms, use a tool, e.g. Intel Advisor
-  \end{itemize}
-\end{frame}
-
-\subsection{Profiling}
-\label{sec:profiling}
-\begin{frame}
-  \frametitle{Profiling}
-  \framesubtitle{A precious ally for optimization}
-
-  \begin{itemize}
-  \item Where is my application spending most of its time?
-    \begin{itemize}
-    \item (bad) measure time ``by hand'' using timings and prints
-    \item (good) use a tool made for this, e.g. Intel Amplifier, Score-P,
-      gprof
-    \end{itemize}
-  \end{itemize}
-  \vfill
-  \begin{itemize}
-  \item In addition to timings, profilers give you a lot more information on
-    \begin{itemize}
-    \item Memory usage
-    \item Hardware counters
-    \item CPU activity
-    \item MPI communications
-    \item etc.
-    \end{itemize}
-  \end{itemize}
-\end{frame}
-
-\begin{frame}[fragile,exercise]
-  \frametitle{Profiling}
-  \framesubtitle{Interactive demonstration}
-
-  \begin{itemize}
-  \item For the purpose of this exercise, we will use MiniFE
-    \begin{itemize}
-    \item 3D implicit finite-elements on an unstructured mesh
-    \item C++ mini application
-    \item \url{https://github.com/Mantevo/miniFE}
-    \item You don't need to understand what the code does!
-    \end{itemize}
-  \item We will use Intel VTune, part of the \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html\#base-kit}{OneAPI Base toolkit (free)}
-  \end{itemize}
-  \vfill
-  \begin{itemize}
-  \item Download miniFE
-  \item Compile the basic version found in \cmd{ref/src}
-  \item Profile the code using the hotspot analysis
-  \item Open Intel VTune and select your timings
-  \item Play around and find the 5 most time-consuming functions
-  \end{itemize}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Profiling}
-  \framesubtitle{Compile MiniFE}
-
-  \begin{itemize}
-  \item Download miniFE
-    \begin{bashcode}
-      $> git clone https://github.com/Mantevo/miniFE.git
-      $> cd miniFE
-    \end{bashcode}
-  \item Compile the basic version found in \code{ref/src}
-    \begin{itemize}
-    \item You will need to load a compiler and an MPI library
-      \begin{bashcode}
-        $> module load intel intel-mpi intel-vtune
-      \end{bashcode}%$
-    \item Change the \cmd{Makefile} to set \cmd{CXX=mpiicpc} and \cmd{CC=mpiicc} and compile
-      \begin{bashcode}
-        $> make
-      \end{bashcode}%$
-    \item Make sure to compile your code with \cmd{-g -O3}
-    \end{itemize}
-  \end{itemize}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Profiling}
-  \framesubtitle{Profile MiniFE}
-
-  \begin{itemize}
-  \item Profile the code using
-    \begin{bashcode}
-      $> srun -n 1 amplxe-cl -collect hotspots -r prof_results -- ./miniFE.x -nx 128 -ny 128 -nz 128
-    \end{bashcode}%$
-  \item This will profile for the ``hotspots'' and store the timings in \cmd{prof\_results}
-  \item You can have more info on the types of analysis with
-    \begin{bashcode}
-      $> amplxe-cl -h collect
-    \end{bashcode}%$
-  \item Open Intel VTune and select your timings
-    \begin{bashcode}
-      $> amplxe-gui prof_results/prof_results.amplxe
-    \end{bashcode}%$
-  \item Play around and find the 5 most time-consuming functions
-  \end{itemize}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Profiling}
-  \framesubtitle{What do we learn?}
-
-  \begin{itemize}
-  \item 50.0\% of the time spent in matrix/vector multiplications
-  \item 12.5\% of time spent imposing boundary conditions
-  \item etc.
-  \item Does the problem size influence the timings?
-  \end{itemize}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Profiling}
-  \framesubtitle{Smaller problem}
-
-  \begin{itemize}
-  \item This time, we profile a problem of size $(16, 16, 16)$
-  \item 13.6\% of the time is spent opening libraries
-  \item 13.6\% of the time is spent initializing MPI
-  \item etc.
-  \item Depending on the problem size, different parts of the code will dominate
-  \end{itemize}
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Profiling}
-  \framesubtitle{Some tips and tricks}
-
-  \begin{itemize}
-  \item Profile a code without bugs!
-  \item Choose the right problem size (representative of your simulations)
-  \item Focus on the functions taking the most time first
-  \item If the profile is not explicit, try refactoring into smaller functions
-    \begin{itemize}
-    \item Some profilers, e.g. ScoreP, let you define custom regions
-    \end{itemize}
-  \end{itemize}
-\end{frame}
-
\begin{frame}[fragile,t]
-  \frametitle{Profiling}
-  \framesubtitle{Optimization}
+  \frametitle{Optimization}
+  \framesubtitle{}

  \begin{itemize}
  \item We now have a pretty good idea of which part of the code to optimize
  \item Different options are possible (by order of complexity)
    \begin{enumerate}
    \item Compiler and linker flags
    \item Optimized external libraries (see the sketch on the next slide)
    \item Handmade optimizations (loop reordering, better data access, etc.)
    \item Algorithmic changes
    \end{enumerate}
    \pause
  \item Example of matrix/matrix multiplication. The graph shows the complexity ($\mathcal{O}(n^{\omega})$) of different algorithms
  \end{itemize}

  \onslide<2>\addimage[width=7cm]{\FIGREP/matmul}{4.5cm}{0.5cm}
\end{frame}

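+% NOTE: illustrative addition -- a hedged sketch of option 2 above: replacing
+% hand-written loops with an optimized library (a CBLAS implementation such as
+% MKL or OpenBLAS is assumed available); uses the `cxxcode` environment.
+\begin{frame}[fragile]
+  \frametitle{Optimization}
+  \framesubtitle{Optimized external libraries (sketch)}
+
+  \begin{itemize}
+  \item A minimal sketch: the naive triple loop vs. a BLAS call
+  \end{itemize}
+  \begin{cxxcode}
+    #include <cblas.h>  // any optimized BLAS (MKL, OpenBLAS, ...)
+
+    // Naive O(n^3) triple loop, usually far from peak performance
+    for (int i = 0; i < n; ++i)
+      for (int j = 0; j < n; ++j)
+        for (int k = 0; k < n; ++k)
+          C[i * n + j] += A[i * n + k] * B[k * n + j];
+
+    // Same operation through an optimized library: C = 1.0 * A * B + 1.0 * C
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n, n, n, 1.0, A, n, B, n, 1.0, C, n);
+  \end{cxxcode}
+\end{frame}
+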
\begin{frame}[fragile,t]
  \frametitle{Parallelization}
  \framesubtitle{When to parallelize}

  \begin{itemize}
  \item Only when your code has \textit{no bugs} and is \textit{optimized}
  \item Are you ready to parallelize?
    \begin{enumerate}
    \item Is it worth parallelizing my code? Does my algorithm scale?
    \item Performance prediction?
    \item Profiling?
    \item Bottlenecks?
    \item Which parallel paradigm should I use? What is the target architecture (SMP, cluster, GPU, hybrid, etc.)?
    \end{enumerate}
  \end{itemize}
\end{frame}

\begin{frame}[fragile,t]
  \frametitle{Parallelization}
  \framesubtitle{When to parallelize}

  In 1991, David H. Bailey published a famous paper: \href{https://www.davidhbailey.com/dhbpapers/twelve-ways.pdf}{Twelve ways to fool the masses when giving performance results on parallel computers}

  \vspace{1cm}

  \textit{6: Compare your results against scalar, unoptimized code on Crays.}

  \addimage[width=7cm]{\FIGREP/dhb}{4.5cm}{0.5cm}
\end{frame}

\subsection{Pareto principle}
\label{sec:pareto}

\begin{frame}
  \frametitle{Pareto principle}
  \framesubtitle{The 80/20 rule}

  \begin{itemize}
  \item General principle stating that 80\% of the effect comes from 20\% of the causes
  \item It applies in many domains and especially in optimization
  \item 80\% of the time is spent in 20\% of your code
  \item Concentrate on those 20\% and don't optimize arbitrarily
  \end{itemize}
\end{frame}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End: