diff --git a/src/performance_measurement/figures/Roofline.png b/src/performance_measurement/figures/Roofline.png new file mode 100644 index 0000000..60b1191 Binary files /dev/null and b/src/performance_measurement/figures/Roofline.png differ diff --git a/src/performance_measurement/figures/benchmark_gacrux.csv b/src/performance_measurement/figures/benchmark_gacrux.csv new file mode 100644 index 0000000..d645da2 --- /dev/null +++ b/src/performance_measurement/figures/benchmark_gacrux.csv @@ -0,0 +1,19 @@ +name,iterations,real_time,cpu_time,time_unit,bytes_per_second,items_per_second,label,error_occurred,error_message,"nthreads","size_kb" +"MyFixture/copy/2",19698087,35.5945,35.5095,ns,1.1535e+11,,,,,0,2 +"MyFixture/copy/4",9323085,75.168,74.9886,ns,1.09243e+11,,,,,0,4 +"MyFixture/copy/8",4846012,144.839,144.494,ns,1.13388e+11,,,,,0,8 +"MyFixture/copy/16",2472819,284.517,283.839,ns,1.15446e+11,,,,,0,16 +"MyFixture/copy/32",801724,834.516,832.528,ns,7.87192e+10,,,,,0,32 +"MyFixture/copy/64",447004,1566.36,1562.64,ns,8.38785e+10,,,,,0,64 +"MyFixture/copy/128",263861,2622.89,2616.66,ns,1.00183e+11,,,,,0,128 +"MyFixture/copy/256",105416,6688.54,6672.6,ns,7.85733e+10,,,,,0,256 +"MyFixture/copy/512",30704,22852.8,22798.3,ns,4.59935e+10,,,,,0,512 +"MyFixture/copy/1024",8836,79613.3,79423.6,ns,2.64046e+10,,,,,0,1024 +"MyFixture/copy/2048",4394,159566,159184,ns,2.63487e+10,,,,,0,2048 +"MyFixture/copy/4096",2191,319712,318950,ns,2.63007e+10,,,,,0,4096 +"MyFixture/copy/8192",1006,697645,695979,ns,2.41059e+10,,,,,0,8192 +"MyFixture/copy/16384",309,2.27814e+06,2.2727e+06,ns,1.47642e+10,,,,,0,16384 +"MyFixture/copy/32768",144,4.85367e+06,4.84201e+06,ns,1.38597e+10,,,,,0,32768 +"MyFixture/copy/65536",70,9.94862e+06,9.92484e+06,ns,1.35234e+10,,,,,0,65536 +"MyFixture/copy/131072",35,2.0252e+07,2.02036e+07,ns,1.32865e+10,,,,,0,131072 +"MyFixture/copy/262144",17,4.09358e+07,4.08381e+07,ns,1.31463e+10,,,,,0,262144 diff --git a/src/performance_measurement/figures/plot.py 
b/src/performance_measurement/figures/plot.py new file mode 100644 index 0000000..b89cb94 --- /dev/null +++ b/src/performance_measurement/figures/plot.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +"""Script to visualize google-benchmark output""" +from __future__ import print_function +import argparse +import sys +import logging +import json +import pandas as pd +import matplotlib.pyplot as plt +import pathlib + +logging.basicConfig(format="[%(levelname)s] %(message)s") + +METRICS = [ + "real_time", + "cpu_time", + "bytes_per_second", + "items_per_second", + "iterations", +] +TRANSFORMS = {"": lambda x: x, "inverse": lambda x: 1.0 / x} + + +def get_default_ylabel(args): + """Compute default ylabel for commandline args""" + label = "" + if args.transform == "": + label = args.metric + else: + label = args.transform + "(" + args.metric + ")" + if args.relative_to is not None: + label += " relative to %s" % args.relative_to + return label + + +def parse_args(): + """Parse commandline arguments""" + parser = argparse.ArgumentParser(description="Visualize google-benchmark output") + parser.add_argument( + "-f", + metavar="FILE", + type=argparse.FileType("r"), + default=sys.stdin, + dest="file", + help="path to file containing the csv or json benchmark data", + ) + parser.add_argument( + "-m", + metavar="METRIC", + choices=METRICS, + default=METRICS[0], + dest="metric", + help="metric to plot on the y-axis, valid choices are: %s" % ", ".join(METRICS), + ) + parser.add_argument( + "-t", + metavar="TRANSFORM", + choices=TRANSFORMS.keys(), + default="", + help="transform to apply to the chosen metric, valid choices are: %s" + % ", ".join(list(TRANSFORMS)), + dest="transform", + ) + parser.add_argument( + "-r", + metavar="RELATIVE_TO", + type=str, + default=None, + dest="relative_to", + help="plot metrics relative to this label", + ) + parser.add_argument( + "--xlabel", type=str, default="input size", help="label of the x-axis" + ) + parser.add_argument("--ylabel", type=str, 
help="label of the y-axis") + parser.add_argument("--title", type=str, default="", help="title of the plot") + parser.add_argument( + "--logx", action="store_true", help="plot x-axis on a logarithmic scale" + ) + parser.add_argument( + "--logy", action="store_true", help="plot y-axis on a logarithmic scale" + ) + parser.add_argument( + "--output", type=str, default="", help="File in which to save the graph" + ) + + args = parser.parse_args() + if args.ylabel is None: + args.ylabel = get_default_ylabel(args) + return args + + +def parse_input_size(name): + splits = name.split("/") + if len(splits) == 1: + return 1 + return int(splits[-1]) + + +def read_data(args): + """Read and process dataframe using commandline args""" + extension = pathlib.Path(args.file.name).suffix + try: + if extension == ".csv": + data = pd.read_csv(args.file, usecols=["name", args.metric]) + elif extension == ".json": + json_data = json.load(args.file) + data = pd.DataFrame(json_data["benchmarks"]) + else: + logging.error("Unsupported file extension '{}'".format(extension)) + exit(1) + except ValueError: + logging.error( + 'Could not parse the benchmark data. Did you forget "--benchmark_format=[csv|json] when running the benchmark"?' 
+ ) + exit(1) + data["label"] = data["name"].apply(lambda x: x.split("/")[-2]) + data["input"] = data["name"].apply(parse_input_size) + data[args.metric] = data[args.metric].apply(TRANSFORMS[args.transform]) + return data + + +def plot_groups(label_groups, args): + """Display the processed data""" + for label, group in label_groups.items(): + plt.plot(group["input"], group[args.metric], label=label, marker=".") + if args.logx: + plt.xscale("log", base=2) + if args.logy: + plt.yscale("log") + plt.xlabel(args.xlabel) + plt.ylabel(args.ylabel) + plt.title(args.title) + plt.legend() + if args.output: + logging.info("Saving to %s" % args.output) + plt.savefig(args.output) + else: + plt.show() + + +def main(): + """Entry point of the program""" + args = parse_args() + data = read_data(args) + label_groups = {} + for label, group in data.groupby("label"): + label_groups[label] = group.set_index("input", drop=False) + if args.relative_to is not None: + try: + baseline = label_groups[args.relative_to][args.metric].copy() + except KeyError as key: + msg = "Key %s is not present in the benchmark output" + logging.error(msg, str(key)) + exit(1) + + if args.relative_to is not None: + for label in label_groups: + label_groups[label][args.metric] /= baseline + plot_groups(label_groups, args) + + +if __name__ == "__main__": + main() diff --git a/src/performance_measurement/performance_measurement.tex b/src/performance_measurement/performance_measurement.tex index a60ac41..ba7fa24 100644 --- a/src/performance_measurement/performance_measurement.tex +++ b/src/performance_measurement/performance_measurement.tex @@ -1,670 +1,693 @@ \renewcommand{\FIGREP}{src/performance_measurement/figures} \section{Performance measurement} \label{sec:performance_measurement} \intersec{helvetios} \begin{frame} \frametitle{Goal of this section} \framesubtitle{} \begin{itemize} \item Key concepts to quantify performance \begin{itemize} \item Metrics \item Using a profiler \item Scalings, speedup, 
efficiency \end{itemize} \item Roofline model \end{itemize} \end{frame} \subsection{Performance metrics} \label{sec:metrics} \begin{frame} \frametitle{Performance metrics} \framesubtitle{} \begin{itemize} \item How can we quantify performance? \item We need to define a means to measure it \item We will focus on the most interesting metrics for HPC \end{itemize} \vfill \pause \begin{itemize} \item The first that comes in mind is \textit{time}, e.g. time-to-solution \item Derived metrics: speedup and efficiency \end{itemize} \vfill \pause \begin{itemize} \item Scientific codes do computations on floating point numbers \item A second metric is the number of \textit{floating-point operations per second} (\si{\flops}) \end{itemize} \vfill \pause \begin{itemize} \item Finally, the \textit{memory bandwidth} indicates how much data does your code transfers per unit of time \end{itemize} \end{frame} \note{ \begin{itemize} \item My code is super fast, it runs in $2.5\si{\ns}$! \item It seems fast, but is it? How fast can your hardware go? \item To really understand how much your code exploit the hardware, we use the \si{\flops} and memory BW \item Your hardware has theoretical maximum values for those \item You can compare the values from your code to the max to see how well you use the hardware \end{itemize} } \subsection{Profiling} \label{sec:profiling} \begin{frame} \frametitle{Profiling} \framesubtitle{A tool to measure various timings} \begin{itemize} \item Where is my application spending most of its time? \begin{itemize} \item (bad) measure time ``by hand'' using timings and prints \item (good) use a tool made for this, e.g. 
Intel Amplifier, Score-P, gprof \end{itemize} \end{itemize} \vfill \begin{itemize} \item There are two types of profiling techniques \begin{itemize} \item Sampling: you stop the code every now and then and check in which function you are \item Code instrumentation: instructions are added at compile time to trigger measurements \end{itemize} \end{itemize} \vfill \begin{itemize} \item In addition to timings, profilers give you a lot more information on \begin{itemize} \item Memory usage \item Hardware counters \item CPU activity \item MPI communications \item etc. \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile,exercise] \frametitle{Profiling} \framesubtitle{Interactive demonstration} \begin{itemize} \item For the purpose of this exercise, we will use MiniFE \begin{itemize} \item 3D implicit finite-elements on an unstructured mesh \item C++ mini application \item \url{https://github.com/Mantevo/miniFE} \item You don't need to understand what the code does! \end{itemize} \item We will use Intel VTune, part of the \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html\#base-kit}{OneAPI Base toolkit (free)} \end{itemize} \vfill \begin{itemize} \item Download miniFE \item Compile the basic version found in \cmd{ref/src} \item Profile the code using the hotspot analysis \item Open Intel VTune and select your timings \item Play around and find the 5 most time-consuming functions \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Compile MiniFE} \begin{itemize} \item Download miniFE \begin{bashcode} $> git clone https://github.com/Mantevo/miniFE.git $> cd miniFE \end{bashcode} \item Compile the basic version found in \code{ref/src} \begin{itemize} \item You will need to load a compiler and an MPI library \begin{bashcode} $> module load intel intel-mpi intel-vtune \end{bashcode}%$ \item Change the \cmd{Makefile} to set \cmd{CXX=mpiicpc} and \cmd{CC=mpiicc} and compile \begin{bashcode} $> make 
\end{bashcode}%$ \item Make sure to compile your code with \cmd{-g -O3} \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Profile MiniFE} \begin{itemize} \item Profile the code using \begin{bashcode} $> srun -n 1 amplxe-cl -collect hotspots -r prof_results -- ./miniFE.x -nx 128 -ny 128 -nz 128 \end{bashcode}%$ \item This will profile for the ``hotspots'' and store the timings in \cmd{prof\_results} \item You can have more info on the types of analysis with \begin{bashcode} $> amplxe-cl -h collect \end{bashcode}%$ \item Open Intel VTune and select your timings \begin{bashcode} $> amplxe-gui prof_results/prof_results.amplxe \end{bashcode}%$ \item Play around and find the 5 most time-consuming functions \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{What do we learn?} \begin{itemize} \item 50.0\% of the time spent in matrix/vector multiplications \item 12.5\% of time spent imposing boundary conditions \item etc. \item Does the problem size influence the timings? \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Smaller problem} \begin{itemize} \item This time, we profile a problem of size $(16, 16, 16)$ \item 13.6\% of the time is spent opening libraries \item 13.6\% of the time is spent initializing MPI \item etc. \item Depending on the problem size, different parts of the code will dominate \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Some tips and tricks} \begin{itemize} \item Profile a code without bugs! \item Choose the right problem size (representative of your simulations) \item Focus on the functions taking the most time first \item If the profile is not explicit, try refactoring into smaller functions \begin{itemize} \item Some profilers, e.g. 
ScoreP, let you define custom regions \end{itemize} \end{itemize} \end{frame} \subsection{Scalings, speedup and efficiency} \label{sec:scalings} \begin{frame} \frametitle{Speedup and efficiency} \framesubtitle{} \begin{itemize} \item Two important metrics are derived from timings \item Compare timings with $n$ processes, $T_{n}$, against the reference timing, $T_\text{ref}$ \end{itemize} \vfill \begin{minipage}{0.3\linewidth} \begin{center} \textbf{Speedup} \end{center} \begin{equation*} S(n) = \frac{T_\text{ref}}{T_{n}} \end{equation*} \end{minipage} \hspace{0.5cm} \begin{minipage}{0.3\linewidth} \begin{center} \textbf{Efficiency} \end{center} \begin{equation*} E(n) = \frac{S(n)}{n} \end{equation*} \end{minipage} \vfill \begin{itemize} \item We want $S(n)$ as close to $n$ and $E(n)$ as close to 1 (100\%) as possible \end{itemize} \end{frame} \begin{frame}[t] \frametitle{Strong scaling} \framesubtitle{} \begin{itemize} \item Scalings are a way to assess how well a program performs when adding computational resources \item Strong scaling: add resources, keep total amount of work constant \begin{equation*} S(n) = \frac{T_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{nT_{n}} \end{equation*} \item Strong scaling is an indication on how much profitable it is to add resources to solve your problem \end{itemize} \addimage[width=6cm]{\FIGREP/strong_scaling}{5cm}{1cm} \end{frame} \begin{frame}[t] \frametitle{Weak scaling} \framesubtitle{} \begin{itemize} \item Weak scaling: add resources and maintain amount of work per resource constant \begin{equation*} S(n) = \frac{nT_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{T_{n}} \end{equation*} \item Weak scalings are an indication on how well your code will perform on a bigger machine (and with a bigger problem) \item These scalings are always required for a proposal \begin{itemize} \item For strong scalings the metric is speedup (how do I improve performance) \item For weak scalings the metric is efficiency 
(how well performance is kept) \end{itemize} \end{itemize} \addimage[width=6cm]{\FIGREP/weak_scaling}{5cm}{1cm} \end{frame} \subsection{Amdahl's law} \label{sec:amdahl} \begin{frame}[t] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Amdahl's law gives you an upper bound to the achievable speedup for a fixed problem size \item By definition it is a strong scaling analysis \vfill \pause \item Assume a fraction $p$ of your code is (perfectly) parallel and timing with 1 process is $T_{1}$ \item Timing with $n$ processes is \begin{equation*} T_{n} = (1-p) T_{1} + \frac{p}{n}T_{1} = \left[ (1-p) + \frac{p}{n}\right] T_{1} \end{equation*} \pause \item Speedup becomes \begin{equation*} S(n) = \frac{T_{1}}{T_{n}} = \frac{1}{(1-p) + \frac{p}{n}} \end{equation*} \vfill \pause \item In the limit of infinite resources \begin{equation*} \lim_{n\rightarrow\infty}S(n) = \frac{1}{1-p} \end{equation*} \end{itemize} \onslide<2->\addimage[width=3cm]{\FIGREP/amdahl_illustration}{12.5cm}{1.0cm} \end{frame} \begin{frame}[b] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Limited by the serial part (very sensitive)! \item Does this mean we cannot exploit large HPC machines? \pause \item No, in general with more resources, we simulate larger systems $\Rightarrow$ weak scaling (see \href{https://en.wikipedia.org/wiki/Gustafson\%27s_law}{Gustafson law}) \end{itemize} \onslide<1->\addimage[width=8.cm]{\FIGREP/amdahl_speedup}{4cm}{2cm} \end{frame} \begin{frame} \frametitle{\si{\flops} and memory bandwidth} \framesubtitle{} \begin{itemize} \item FLOPs are floating point operations, e.g. $+, -, \times, \div$ \item Can be evaluated by hand, dividing the number of operations by the running time \vfill \item Memory bandwidth measures the amount of data transferred by unit of time [\si{\byte\per\second}, \si{\kibi\byte\per\second}, \si{\mebi\byte\per\second}, \si{\gibi\byte\per\second}, ...] 
\item Can be measured by hand dividing the amount of data transferred by the running time \vfill \item In both cases, generally use tools such as PAPI, Tau, likwid, Intel Amplxe, STREAM, etc. \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Performance measurement} \framesubtitle{A simple DAXPY example} \begin{itemize} \item Assume \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132} (Gacrux) \end{itemize} \cxxfile[% title={optimization/daxpy.cc}, minted options app={ % highlightlines={2, 7}, firstline=25, lastline=27, firstnumber=1, }]{examples/optimization/daxpy.cc} \begin{itemize} \item My code runs in \SI{174.25}{\ms}. It is amazingly fast! \end{itemize} \pause \vfill \begin{itemize} \item Each iteration has 2 FLOP (1 add and 1 mul) and there are \cmd{N = 1e8} iterations \item Our code $\SI{2d8}{\flop} / \SI{174.25d-3}{\second} = \SI{0.001}{\tera\flops}$ \item Our hardware can achieve a theoretical peak performance of $\SI{1.16}{\tera\flops}$... \end{itemize} \pause \vfill \begin{itemize} \item Each iteration has 3 memory operations (2 loads and 1 store) \item Our code $\SI{2.23}{\gibi\byte} / \SI{174.25d-3}{\second} = \SI{12.82}{\gibi\byte\per\second}$ \item Our hardware can achieve a theoretical memory bandwidth of $\SI{125}{\gibi\byte\per\second}$... \end{itemize} \end{frame} \subsection{Roofline model} \label{sec:roofline} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} \item How well am I exploiting the hardware resources? 
\item The roofline model is a performance model allowing to have an estimate to this question \end{itemize} \vspace{1cm} \pause \begin{itemize} \item Key concept: the arithmetic intensity, $AI$, of an algorithm is \# \si{\flop\per\byte} of data transferred \item It measures data reuse \end{itemize} \addimage[width=8.cm]{\FIGREP/ai}{4cm}{0.5cm} \end{frame} \begin{frame}[t,fragile] \frametitle{Roofline model} \framesubtitle{How to find arithmetic intensity} \begin{itemize} \item For very simple algorithms, you can compute the AI \item Let's take back the DAXPY example \cxxfile[% title={optimization/daxpy.cc}, minted options app={ % highlightlines={2, 7}, firstline=25, lastline=27, firstnumber=1, }]{examples/optimization/daxpy.cc} \item There are 2 operations (1 add and 1 mul) \item Three 8-byte memory operations (2 loads and 1 store) \item The AI is then $2/24 = 1/12$ \pause \item For more complex algorithms, use a tool, e.g. Intel Advisor \end{itemize} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{Building the model} \begin{itemize} \item Roofline model is plotted on \textbf{log-log scale} \begin{itemize} \item x-axis is the $AI$ \item y-axis is \si{\flops} \end{itemize} \pause \item The hardware limits are defined by \begin{equation*} P = \min(P_{\text{max}}, b_{s} \cdot AI) \end{equation*} \begin{itemize} \item $P_{\text{max}}$ is the CPU peak \si{\flops} \item$AI$ is the intensity \item $b_{s}$ is the memory BW \end{itemize} \end{itemize} \onslide<1>\addimage[width=5cm]{\FIGREP/roofline_1}{5.5cm}{0.5cm} \onslide<2>\addimage[width=5cm]{\FIGREP/roofline_2}{5.5cm}{0.5cm} \onslide<3>\addimage[width=5cm]{\FIGREP/roofline_3}{5.5cm}{0.5cm} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{Building the model} \begin{itemize} \item Refinements can be made to the Roofline model \item Adding a memory hierarchy with caches \item Adding different levels of DLP (Data-Level parallelism) \item They give you hint on what to optimize 
for \end{itemize} \addimage[width=7cm]{\FIGREP/roofline_extended}{4.5cm}{0.5cm} \end{frame} \begin{frame}[fragile,t] \frametitle{Roofline model} \framesubtitle{How to find the peak performance} \begin{itemize} \item Theoretical peak performance\\ \vspace{-2ex} \begin{minipage}{.4\linewidth} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} \text{Number of FP ports (ILP)} \\ & \times \text{flops} / \text{cycles (e.g. 2 for FMA)} \\ & \times \text{vector size (DLP)} \\ & \times \text{frequency (in GHz)} \\ & \times \text{number of cores (TLP)} \end{align*} \end{minipage} \vspace{3ex} \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132}\\ \vspace{-2ex} \begin{minipage}{.4\linewidth} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} 2 \text{ (ports)} \\ & \times \SI{2}{\flop\per\cycle} \text{ (2 for FMA)} \\ & \times \frac{\SI{512}{\bit} \text{ (AVX512)} }{\SI{64}{\bit}\text{ (double)}} \\ & \times \SI{2.3}{\giga\hertz} \\ & \times 14 \text{ (cores)} \\ = & \SI{1.16}{\tera\flops} \end{align*} \end{minipage} \addimage[width=6cm]{\FIGREP/skylake_server_block_diagram}{9cm}{0.8cm} \pause \vspace{3ex} \item Or use a software that estimates it \end{itemize} \end{frame} -\begin{frame}[t] +\begin{frame}[fragile,t] \frametitle{Roofline model} \framesubtitle{How to find the memory bandwidth} \begin{itemize} \item Theoretical memory bandwidth of the memory \begin{align*} \text{BW}_{\text{max}} = &\textcolor{white}{\times} \text{Number of transfers per second} \\ & \times \text{Bus width} \\ & \times \text{Number of interfaces} \end{align*} \item In general, we suppose that RAM matches CPU bandwidth (found on the CPU spec. 
list) \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132} \begin{align*} \text{BW}_{\text{max}} = &\textcolor{white}{\times} \SI{2666}{\mega\transfer\per\second} \text{ (DDR4 2666)} \\ & \times \SI{8}{\byte\per\transfer} \text{ (64bit bus)}\\ & \times 6 \end{align*} \begin{itemize} \item $\SI{19.86}{\gibi\byte\per\second}$ for 1 channel \item Maximum of $\SI{119.18}{\gibi\byte\per\second}$ \end{itemize} \pause \item Or use a software that estimates it \end{itemize} \begin{itemize} \item A corollary from ``theoretical'' is that it is not achievable in practice! \end{itemize} \end{frame} +\begin{frame}[fragile] + \frametitle{Roofline model} + \framesubtitle{How to measure the actual values} + \begin{itemize} + \item Peak performance measurement + \begin{itemize} + \item Using a compute bound kernel + \item Using dgemm:\\ + 1 core: $\SI{98.0}{\giga\flops}$\\ + 14 cores: $\SI{965.0}{\giga\flops}$ + \end{itemize} + \item Bandwidth measurement + \begin{itemize} + \item Using a memory bound kernel + \item Using stream (triad):\\ + 1 core: $\SI{12.7}{\gibi\byte\per\second}$\\ + 6 core: $\SI{70.1}{\gibi\byte\per\second}$\\ + 9 core: $\SI{82.7}{\gibi\byte\per\second}$ + \addimage[width=6cm]{\FIGREP/skylake_server_block_diagram}{9cm}{0.8cm} + \end{itemize} + \end{itemize} +\end{frame} + \begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{} \begin{itemize} \item We now have a pretty good idea of which part of the code to optimize \item Different options are possible (by order of complexity) \begin{enumerate} \item Compiler and linker flags \item Optimized external libraries \item Handmade optimization (loop reordering, better data access, etc.) 
\item Algorithmic changes \end{enumerate} \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Compiler flags} \begin{itemize} \item Compilers have a set of optimizations they can do (if possible) \item You can find a list of \href{https://gcc.gnu.org/onlinedocs/gcc/gcc-command-options/options-that-control-optimization.html}{options for GNU compilers on their doc} \pause \item Common options are: \begin{itemize} \item \cmd{-O0}, \cmd{-O1}, \cmd{-O2}, \cmd{-O3}: from almost no optimizations to most optimizations \pause \item \cmd{-Ofast}: activate more aggressive options, \eg{} \cmd{-ffast-math} (but can produce wrong results in some particular cases) \end{itemize} \pause \item Test your program with different options (\cmd{-O3} does not necessarily lead to faster programs) \item Note that the more optimization the longer the compilation time \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Optimized libraries} \begin{itemize} \item Do not re-invent the wheel! \item A lot of optimized libraries exist with different purposes (solvers, data structures, I/O, etc.). A few examples: \begin{itemize} \item Solvers: PETSc, MUMPS, LAPACK, scaLAPACK, PARDISO, etc. \item I/O: HDF5, ADIOS, etc. \item Math libraries: FFTW, BLAS, etc. \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Handmade optimizations} \begin{itemize} \item Sometimes, we cannot rely on compiler options or libraries and we must optimize ``by hand'' \item Usually, the goal is to rewrite the code in such a way that the compiler can optimize it \item Start by having a correct program before trying to optimize \item ``Premature optimization is the root of all evil'', D.
Knuth \end{itemize} \end{frame} \subsection{Pareto principle} \label{sec:pareto} \begin{frame} \frametitle{Pareto principle} \framesubtitle{The 80/20 rule} \begin{itemize} \item General principle that states that 80\% of the effect comes from 20\% of causes \item Applies in many domains and especially in optimization \item 80\% of the time is spent in 20\% of your code \item Concentrate on those 20\% and don't arbitrarily optimize \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Algorithmic optimizations} \begin{itemize} \item Example of matrix/matrix multiplication. Graph shows complexity ($\mathcal{O}(n^{\omega})$) for different algorithms \end{itemize} \onslide<2>\addimage[width=7cm]{\FIGREP/matmul}{4.5cm}{0.5cm} \end{frame} \begin{frame}[fragile,t] \frametitle{Parallelization} \framesubtitle{When to parallelize} \begin{itemize} \item Only when your code has \textit{no bugs} and is \textit{optimized} \item Are you ready to parallelize? \begin{enumerate} \item Is it worth parallelizing my code? Does my algorithm scale? \item Performance prediction? \item Profiling? \item Bottlenecks? \item Which parallel paradigm should I use? What is the target architecture (SMP, cluster, GPU, hybrid, etc)? \end{enumerate} \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Parallelization} \framesubtitle{When to parallelize} In 1991, David H. Bailey published a famous paper: \href{https://www.davidhbailey.com/dhbpapers/twelve-ways.pdf}{Twelve ways to fool the masses when giving performance results on parallel computers} \vspace{1cm} \textit{6: Compare your results against scalar, unoptimized code on Crays.} \addimage[width=7cm]{\FIGREP/dhb}{4.5cm}{0.5cm} \end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: