diff --git a/examples/mpi/datatypes.cc b/examples/mpi/datatypes.cc
index c4fef20..7b9962d 100644
--- a/examples/mpi/datatypes.cc
+++ b/examples/mpi/datatypes.cc
@@ -1,53 +1,53 @@
 #include <mpi.h>
 #include <array>
 #include <iostream>
 #include <vector>

 int main() {
   MPI_Init(NULL, NULL);

   int prank;
   MPI_Comm_rank(MPI_COMM_WORLD, &prank);

   MPI_Count lb, extent, size;

   struct Test_t {
     double d[2];
     int i;
   };

   std::vector<Test_t> foo(100);

   std::array<int, 2> block_lengths = {2, 1};
   std::array<MPI_Aint, 2> displacements;
   std::array<MPI_Datatype, 2> old_types = {MPI_DOUBLE, MPI_INT};

   MPI_Aint addr0, addr1;
   MPI_Get_address(&foo[0], &addr0);
-  MPI_Get_address(&foo[0].d, &displacements[0]);
+  MPI_Get_address(&foo[0].d[0], &displacements[0]);
   MPI_Get_address(&foo[0].i, &displacements[1]);

   // displacements of the members relative to the start of the struct
   displacements[0] = MPI_Aint_diff(displacements[0], addr0);
   displacements[1] = MPI_Aint_diff(displacements[1], addr0);

   MPI_Datatype mpi_test_t, mpi_test_vector_t;
   MPI_Type_create_struct(2, block_lengths.data(), displacements.data(),
                          old_types.data(), &mpi_test_t);

   // resize the struct type so that its extent matches the actual spacing
   // of two consecutive vector elements (including any padding)
   MPI_Get_address(&foo[1], &addr1);
   addr1 = MPI_Aint_diff(addr1, addr0);
   MPI_Type_create_resized(mpi_test_t, 0, addr1, &mpi_test_vector_t);
   MPI_Type_commit(&mpi_test_vector_t);

   MPI_Type_get_extent_x(mpi_test_t, &lb, &extent);
   MPI_Type_size_x(mpi_test_t, &size);

   std::cout << "MPI Datatype: mpi_test_t, size: " << size
             << " extent: " << extent << std::endl;
   std::cout << "C++ Datatype: Test_t, size: " << sizeof(Test_t) << std::endl;

   MPI_Type_free(&mpi_test_vector_t);

   MPI_Finalize();

   return 0;
 }
diff --git a/phys_743_parallel_programming.tex b/phys_743_parallel_programming.tex
index c14d467..cf00c3f 100644
--- a/phys_743_parallel_programming.tex
+++ b/phys_743_parallel_programming.tex
@@ -1,279 +1,279 @@
\documentclass[8pt,aspectratio=169]{beamer} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage[english]{babel} \usepackage[most, minted]{tcolorbox} \usepackage{xcolor} \usepackage{graphicx} \usepackage{fancyvrb} \usepackage{tikz} \usepackage{colortbl} \usepackage{booktabs} \usepackage[super]{nth} \usepackage{amssymb} \usepackage[binary-units=true]{siunitx} \usepackage{booktabs} \usepackage{pgfpages}
-%\setbeameroption{show notes on second screen=left}
+\setbeameroption{show notes on second screen=left}
\usemintedstyle{emacs} \makeatletter % Define commands to select the folder where the Beamer theme lies in \def\beamer@calltheme#1#2#3{% \def\beamer@themelist{#2} \@for\beamer@themename:=\beamer@themelist\do {\usepackage[{#1}]{\beamer@themelocation/#3\beamer@themename}}} \def\usefolder#1{ \def\beamer@themelocation{#1} } \def\beamer@themelocation{} % Patch Table of Contents to insert fixed spaces between items instead of vfills \patchcmd{\beamer@sectionintoc} {\vfill} {\vskip\itemsep} {} {} % New counter for line numbers \newcounter{verbatim@cnt} % This is for color band on the linenos in listings \AtEndEnvironment{Verbatim}{% \stepcounter{verbatim@cnt}% \protected@write\@auxout{}{% \global\protect\@namedef{verbatim@numwidth@\the\c@verbatim@cnt}{% \ifnum\c@FancyVerbLine>999 7.5mm% \else\ifnum\c@FancyVerbLine>99 6mm% \else 4mm% \fi\fi }% }% } \def\minted@auto@numwidth#1{% \ifcsname verbatim@numwidth@\the\numexpr\c@verbatim@cnt#1\relax\endcsname \csname verbatim@numwidth@\the\numexpr\c@verbatim@cnt#1\relax\endcsname \else 4mm% \fi } \tcbset{bashstyle/.style={ colframe=black!70, listing engine=minted, listing only, minted style=colorful, minted language=console, size=fbox, breakable, enhanced, minted options={ autogobble=true, breaklines=true, breakbefore=., numbersep=2mm, }, }} \tcbset{cxx/.style={ colframe=black!70, listing engine=minted, listing
only, minted style=emacs, minted language=C++, size=fbox, breakable, enhanced, minted options={ autogobble=true, linenos, breaklines=true, breakbefore=., numbersep=2mm, escapeinside=||, }, overlay={% \begin{tcbclipinterior} \fill[gray!25] (frame.south west) rectangle ([xshift=\dimexpr\minted@auto@numwidth{}\relax]frame.north west); \end{tcbclipinterior} }, % in "left", \c@verbatim@cnt is not stepped yet, hence the argument "+1" left=\dimexpr\minted@auto@numwidth{+1}\relax, }} % \EscMintinline[options]{}{} \def\EscMintinline{% \FVExtraRobustCommand \RobustEscMintinline \FVExtraUnexpandedReadOArgMArgEscVArg} \NewExpandableDocumentCommand \FVExtraUnexpandedReadOArgMArgEscVArg { o m m } {% \IfNoValueTF{#1} {\FVExtraAlwaysUnexpanded {\FVExtraUnexpandedReadOArgMArgEscVArg{#2}{#3}}} {\FVExtraAlwaysUnexpanded {\FVExtraUnexpandedReadOArgMArgEscVArg[#1]{#2}{#3}}}% } \newrobustcmd\RobustEscMintinline[2][]{% % similar to \mintinline \begingroup \setboolean{minted@isinline}{true}% \minted@configlang{#2}% \setkeys{minted@opt@cmd}{#1}% \minted@fvset \begingroup \@ifnextchar\bgroup {\FVExtraDetokenizeREscVArg{\minted@inline@iii}}% {\PackageError{minted}% {\string\EscMintinline\space delimiters must be paired curly braces in this context}% {Delimit argument with curly braces}}} \makeatother \newtcblisting{bashcode}{% colframe=black!70, width=\linewidth, bashstyle } \newtcblisting{consoleoutput}{% colback=black, colupper=gray!50, colframe=black!70, listing engine=minted, listing only, minted style=monokai, minted language=console, size=fbox, breakable, enhanced, minted options={ autogobble=true, breaklines=true, breakbefore=., numbersep=2mm, } } \newtcblisting{cxxcode}[2][]{ cxx, title={#2}, #1, } \newtcbinputlisting{\cxxfile}[2][]{% cxx, minted options app={ fontsize=\small, }, listing file={#2}, % width=80ex, #1 } \newcommand{\cxxinline}[1]{\EscMintinline{C++}{#1}} \newcommand{\cmd}[1]{\EscMintinline[style=colorful]{console}{#1}} %newmintinline[cmd]{console}{style=colorful,autogobble} \newcommand{\code}[1]{\texttt{\bf #1}} \DeclareSIUnit\flop{FLOP} \DeclareSIUnit\transfer{T} \DeclareSIUnit\cycle{c} \DeclareSIUnit\flops{\flop\per\second} \DeclareSIUnit\chf{CHF} \sisetup{per-mode=symbol} \sisetup{exponent-product = \cdot} \sisetup{group-separator={\mathrm{'}}} \definecolor{blue0}{HTML}{002255} \definecolor{blue1}{HTML}{003380} \definecolor{blue2}{HTML}{0044AA} \definecolor{blue3}{HTML}{0055D4} \definecolor{blue4}{HTML}{0066FF} \definecolor{blue5}{HTML}{2A7FFF} \definecolor{blue6}{HTML}{5599FF} \definecolor{blue7}{HTML}{80B3FF} \definecolor{blue8}{HTML}{AACCFF} \definecolor{blue9}{HTML}{D5E5FF} \definecolor{yellowbrown0}{HTML}{554400} \definecolor{yellowbrown1}{HTML}{806600} \definecolor{yellowbrown2}{HTML}{AA8800} \definecolor{yellowbrown3}{HTML}{D4AA00} \definecolor{yellowbrown4}{HTML}{FFCC00} \definecolor{yellowbrown5}{HTML}{FFD42A} \definecolor{yellowbrown6}{HTML}{FFDD55} \definecolor{yellowbrown7}{HTML}{FFE680} \definecolor{yellowbrown8}{HTML}{FFEEAA} \definecolor{yellowbrown9}{HTML}{FFF6D5} \definecolor{colShellBg}{HTML}{F5EDE4} \definecolor{links}{HTML}{2A1B81} \hypersetup{colorlinks,linkcolor=,urlcolor=links} \usefolder{scitas_theme} \usetheme{scitas} \newcommand{\eg}{\textit{e.g.}} \newcommand{\ie}{\textit{i.e.}} \newcommand{\FIGREP}{figures} \renewcommand{\arraystretch}{1.3} % Remove numbering from the ToC when it's spread on multiple frames \setbeamertemplate{frametitle continuation}{} \title{{\huge Parallel Programming}\\Single-core optimization, MPI, OpenMP, and hybrid programming} \author[N. 
Richart, E. Lanti]{Nicolas Richart \\ Emmanuel Lanti \\ {\scriptsize Course based on V. Keller's lecture notes}} \date{\nth{14} - \nth{18} of November 2022}
\begin{document}
\begin{frame}[plain] \titlepage \end{frame}
% \section{Table of Contents} % \begin{frame}[allowframebreaks=0.8] % \frametitle{Table of Contents} % \tableofcontents%[hideallsubsections] % \end{frame}
% Administration %\input{src/admin/admin}
% Single-core optimization %\input{src/basic_concepts/basic_concepts} %\input{src/cluster_architecture/cluster_architecture} %\input{src/performance_measurement/performance_measurement} %\input{src/optimization/optimization}
% OpenMP \input{src/openmp/openmp}
% MPI %\input{src/mpi/mpi} %\input{src/mpi/mpi_advanced}
% Hybrid programming %\input{src/hybrid/hybrid}
% Recapitulation of the course %\input{src/recap/recap}
% Project description %\input{src/projects/projects}
\end{document}
%%% Local Variables: %%% mode: latex %%% TeX-command-extra-options: "-shell-escape" %%% TeX-master: t %%% End:
diff --git a/src/cluster_architecture/cluster_architecture.tex b/src/cluster_architecture/cluster_architecture.tex
index e9f967d..bacfa1b 100644
--- a/src/cluster_architecture/cluster_architecture.tex
+++ b/src/cluster_architecture/cluster_architecture.tex
@@ -1,310 +1,311 @@
\renewcommand{\FIGREP}{src/cluster_architecture/figures}
\section{Cluster Architecture} \label{sec:cluster_architecture} \intersec{helvetios}
\begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{} \begin{itemize} \item The goal of this section is to understand what's under the cluster's hood \item In order to take full advantage of your computer, you have to understand how it works, what its limits are, etc. \item We'll go from the cluster level down to the core level \end{itemize} \addimage[width=7cm]{\FIGREP/summit}{5.5cm}{0.5cm} \end{frame}
\subsection{Cluster as a whole} \label{sec:cluster}
\begin{frame} \frametitle{Cluster Architecture} \framesubtitle{General presentation} \begin{minipage}{0.5\linewidth} \begin{itemize} \item An HPC cluster is composed of \begin{itemize} \item Login node(s) \item Compute nodes \item Storage system \item High performance interconnect \end{itemize} \item The simulation data is written to the storage system. At SCITAS: \begin{itemize} \item \code{/home}: store source files, input data, small files \item \code{/work}: collaboration space for a group \item \code{/scratch}: temporary huge result files \end{itemize} Please note that only \cmd{/home} and \cmd{/work} have backups! \cmd{/scratch} data can be erased at any moment! \end{itemize} \end{minipage} \addimage[width=6cm]{\FIGREP/abstract_architecture}{9.5cm}{1.5cm} \end{frame}
\note{ \begin{itemize} \item The users connect to the login node \item Backups on \code{/work} are a paid service \end{itemize} }
\subsection{Introduction to SLURM}
\begin{frame} \frametitle{Introduction to SLURM} \framesubtitle{} \begin{minipage}{0.55\linewidth} \begin{itemize} \item Users do not run their calculations directly on the compute nodes \item A \textit{scheduler} is used to ensure fair resource usage \pause \item At SCITAS, we use the SLURM scheduler \pause \item You submit your simulation and the resources you need to SLURM \item SLURM stores it in a queue and assigns it a starting time depending on many parameters \item Your job may not start right away, and that is normal!
\end{itemize} \end{minipage} \onslide<1>\addimage[width=6cm]{\FIGREP/abstract_architecture}{9.5cm}{1.5cm} \onslide<2->\addimage[width=6cm]{\FIGREP/abstract_architecture_slurm}{9.5cm}{1.5cm} \end{frame}
\begin{frame}[fragile] \frametitle{Introduction to SLURM} \framesubtitle{How to submit simulations} \textbf{To submit a job} \begin{bashcode} $> srun -A phys-743 --reservation phys-743 ./my_program \end{bashcode}%$ \vfill \begin{description} \item[-A / -{}-account=]: name of your SLURM account \item[-{}-reservation=]: name of your SLURM reservation \item[-t / -{}-time=]: set a limit on the total run time of the job \item[-N / -{}-nodes=]: request that a minimum of $N$ nodes be allocated to the job \item[-n / -{}-ntasks=]: advise SLURM that this job will launch a maximum of $n$ tasks \item[-c / -{}-cpus-per-task=]: advise SLURM that the job will require \code{ncpus} CPUs per task \item[-{}-mem=]: specify the memory required per node \end{description} \vfill Need more help? Have a look at the \href{https://slurm.schedmd.com/sbatch.html}{documentation} \end{frame}
\begin{frame}[fragile] \frametitle{Introduction to SLURM} \framesubtitle{How to submit simulations} \textbf{Or you can put everything in a file called, e.g. \code{my\_simulation.job}}
\begin{bashcode}
+  #!/bin/bash -l
   #SBATCH --account=phys-743
   #SBATCH --reservation=phys-743
   #SBATCH --time=01:10:00
   #SBATCH --nodes=2
   #SBATCH --ntasks=56
   srun ./my_program
\end{bashcode}
and submit the job with \begin{bashcode} $> sbatch my_simulation.job \end{bashcode}%$ \end{frame}
\begin{frame}[fragile] \frametitle{Introduction to SLURM} \framesubtitle{How to manage simulations} \textbf{To list all your jobs} \begin{bashcode} $> squeue -u <username> \end{bashcode}%$ \vfill \textbf{To cancel a simulation} \begin{bashcode} $> scancel <job_id> \end{bashcode}%$ The \code{<job\_id>} can be found using \code{squeue} \end{frame}
\begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{A few numbers} \textbf{Let's go back to Summit} \begin{itemize} \item Second most powerful HPC cluster in the world according to the \href{https://www.top500.org/lists/top500/list/2021/06/}{Top500 June 2021 list} \item It is composed of \SI{4608}{} compute nodes \item Power consumption of \SI{10096.00}{\kilo\watt} % Annual electricity consumption per inhabitant in Switzerland: ~7500 kWh % https://donnees.banquemondiale.org/indicator/EG.USE.ELEC.KH.PC \item Consumption equivalent to that of a city with $\sim$\SI{13000}{} inhabitants % Price of a kWh in Lausanne: ~20 centimes \item In Lausanne, running Summit would cost $\sim$\SI{50000}{\chf\per\day} for electricity alone! \end{itemize} \addimage[width=5cm]{\FIGREP/summit}{5.5cm}{1.0cm} \end{frame}
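+\note{
+  \begin{itemize}
+  \item Back-of-the-envelope check of these numbers (assuming the $\sim$\SI{0.20}{\chf} per \si{\kilo\watt\hour} Lausanne price noted above): $\SI{10096}{\kilo\watt} \times \SI{24}{\hour} \approx \SI{242000}{\kilo\watt\hour}$ per day, i.e. $\sim$\SI{48000}{\chf\per\day}
+  \item At $\sim$\SI{7500}{\kilo\watt\hour} per inhabitant and per year in Switzerland, this corresponds to $242000 \times 365 / 7500 \approx 11800$ inhabitants
+  \end{itemize}
+}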
\subsection{The compute node} \label{sec:node}
\begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{Let's dive into a compute node!} \begin{itemize} \item The compute node is the basic building block of a cluster \item It is composed of one or more CPUs with RAM (memory) and possibly one or more accelerators, e.g. GPUs \item All the nodes are connected together with an interconnect \end{itemize} \addimage[width=4.5cm]{\FIGREP/node_architecture}{5.75cm}{1.0cm} \end{frame}
\note{ \begin{itemize} \item A compute node is like a personal computer on steroids \end{itemize} }
\subsection{The CPU} \label{sec:cpu}
\begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{Central processing unit} \begin{itemize} \item The CPU is the ``brain'' of the node \item CPUs work in clock cycles; they are the ``heartbeat'' of the CPU \item It is composed of cores and different levels of memory called caches \item There are usually three levels of cache called L1, L2, and L3 \end{itemize} \vspace{0.8cm} \begin{table} \hspace{5cm} \scriptsize \begin{tabular}{@{}llll@{}} \toprule \textbf{Event} & \textbf{Latency} & \textbf{Scaled} & \textbf{Capacity} \\ \midrule 1 CPU cycle & 0.1\,ns & 1\,s & -- \\ L1 cache access & 1\,ns & 10\,s & kB \\ L2 cache access & 1\,ns & 10\,s & MB \\ L3 cache access & 10\,ns & 1\,min & MB \\ RAM access & 100\,ns & 10\,min & GB \\ Solid-state disk access & 100\,$\mu$s & 10 days & TB \\ Hard-disk drive access & 1--10\,ms & 1--12 months & TB \\ \bottomrule \end{tabular} \end{table} \addimage[width=4cm]{\FIGREP/cpu_architecture}{3cm}{1.5cm} \end{frame}
\note{ \begin{itemize} \item Caches are extremely fast memories that are used to hide the latency of other memories (RAM, hard drive, etc.) \item However, they are usually quite small compared to e.g. RAM, hard drive \item L1 is the closest to the core, followed by L2 and L3 \item Some cache levels are private to a core, e.g. here L1 and L2 \end{itemize} }
\subsection{Summary of SCITAS' clusters}
\begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{Summary of SCITAS' clusters} \onslide<2>{\begin{tikzpicture}[overlay,remember picture] \begin{scope}[shift={(current page.south west)}] \draw[red, thick] (2, 1) -- (6.5, 7); \end{scope} \end{tikzpicture}} \begin{minipage}[t]{0.32\linewidth} \begin{center} \textbf{Fidis} \end{center} \begin{itemize} \item CPU cluster \item 336 nodes each with \begin{itemize} \item 2 Intel Xeon E5-2690 @\SI{2.6}{\giga\hertz} with 14 cores each \item \SI{128}{\gibi\byte} of RAM \end{itemize} \item 72 nodes each with \begin{itemize} \item 2 Intel Xeon E5-2690 @\SI{2.6}{\giga\hertz} with 14 cores each \item \SI{256}{\gibi\byte} of RAM \end{itemize} \item 216 nodes each with \begin{itemize} \item 2 Intel Xeon Gold 6132 @\SI{2.6}{\giga\hertz} with 14 cores each \item \SI{192}{\gibi\byte} of RAM \end{itemize} \end{itemize} \end{minipage} \hfill \begin{minipage}[t]{0.32\linewidth} \begin{center} \textbf{Helvetios} \end{center} \begin{itemize} \item CPU cluster \item 287 nodes each with \begin{itemize} \item 2 Intel Xeon Gold 6140 @\SI{2.3}{\giga\hertz} with 18 cores each \item \SI{192}{\gibi\byte} of DDR4 RAM \end{itemize} \end{itemize} \vspace{0.5cm} \onslide<2>{\begin{minipage}[t]{1.0\linewidth} \begin{center} \textbf{Jed} \end{center} \begin{itemize} \item CPU cluster \item 419 nodes, 2 Intel Ice Lake Platinum with 36 cores each \begin{itemize} \item XXX nodes with \SI{512}{\gibi\byte} of DDR4 RAM \item YYY nodes with \SI{1}{\tebi\byte} of DDR4 RAM \end{itemize} \end{itemize} \end{minipage}} \end{minipage} \hfill \begin{minipage}[t]{0.32\linewidth} \begin{center} \textbf{Izar} \end{center} \begin{itemize} \item CPU + GPU cluster \item 64 nodes each with \begin{itemize} \item 2 Intel Xeon Gold 6230 @\SI{2.1}{\giga\hertz} with 20 cores each \item 2 NVIDIA V100 PCIe \SI{32}{\gibi\byte} GPUs \item
\SI{192}{\gibi\byte} of DDR4 RAM \end{itemize} \item 2 nodes each with \begin{itemize} \item 2 Intel Skylake @\SI{2.1}{\giga\hertz} with 20 cores each \item 4 NVIDIA V100 SXM2 \SI{32}{\gibi\byte} GPUs \item \SI{192}{\gibi\byte} of DDR4 RAM \end{itemize} \end{itemize} \end{minipage} \end{frame}
%%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End:
diff --git a/src/performance_measurement/figures/bandwidth.pdf b/src/performance_measurement/figures/bandwidth.pdf
new file mode 100644
index 0000000..1a397f4
Binary files /dev/null and b/src/performance_measurement/figures/bandwidth.pdf differ
diff --git a/src/performance_measurement/figures/bandwidth.svg b/src/performance_measurement/figures/bandwidth.svg
new file mode 100644
index 0000000..1a9451c
--- /dev/null
+++ b/src/performance_measurement/figures/bandwidth.svg
@@ -0,0 +1,1141 @@
+ [1141 lines of SVG markup omitted: Matplotlib v3.5.2 (https://matplotlib.org/) figure generated 2022-11-14, same plot as bandwidth.pdf]
diff --git a/src/performance_measurement/figures/plot.py b/src/performance_measurement/figures/plot.py
index b89cb94..56b9318 100644
--- a/src/performance_measurement/figures/plot.py
+++ b/src/performance_measurement/figures/plot.py
@@ -1,164 +1,177 @@
#!/usr/bin/env python """Script to visualize google-benchmark output""" from __future__ import print_function import argparse import sys import logging import json import pandas as pd import matplotlib.pyplot as plt import pathlib logging.basicConfig(format="[%(levelname)s] %(message)s") METRICS = [ "real_time", "cpu_time", "bytes_per_second", "items_per_second", "iterations", ] TRANSFORMS = {"": lambda x: x, "inverse": lambda x: 1.0 / x} def get_default_ylabel(args): """Compute default ylabel for commandline args""" label = "" if args.transform == "": label = args.metric else: label = args.transform + "(" + args.metric + ")" if args.relative_to is not None: label += " relative to %s" % args.relative_to return label def parse_args(): """Parse commandline arguments""" parser = argparse.ArgumentParser(description="Visualize google-benchmark output") parser.add_argument( "-f", metavar="FILE", type=argparse.FileType("r"), default=sys.stdin, dest="file", help="path to file containing the csv or json benchmark data", ) parser.add_argument( "-m", metavar="METRIC", choices=METRICS, default=METRICS[0], dest="metric", help="metric to plot on the y-axis, valid choices are: %s" % ", ".join(METRICS), ) parser.add_argument( "-t", metavar="TRANSFORM", choices=TRANSFORMS.keys(), default="", help="transform to apply to the chosen metric, valid choices are: %s" % ", ".join(list(TRANSFORMS)), dest="transform", ) parser.add_argument( "-r", metavar="RELATIVE_TO", type=str, default=None,
dest="relative_to", help="plot metrics relative to this label", ) parser.add_argument( "--xlabel", type=str, default="input size", help="label of the x-axis" ) parser.add_argument("--ylabel", type=str, help="label of the y-axis") parser.add_argument("--title", type=str, default="", help="title of the plot") parser.add_argument( "--logx", action="store_true", help="plot x-axis on a logarithmic scale" ) parser.add_argument( "--logy", action="store_true", help="plot y-axis on a logarithmic scale" ) parser.add_argument( "--output", type=str, default="", help="File in which to save the graph" ) args = parser.parse_args() if args.ylabel is None: args.ylabel = get_default_ylabel(args) return args def parse_input_size(name): splits = name.split("/") if len(splits) == 1: return 1 return int(splits[-1]) def read_data(args): """Read and process dataframe using commandline args""" extension = pathlib.Path(args.file.name).suffix try: if extension == ".csv": data = pd.read_csv(args.file, usecols=["name", args.metric]) elif extension == ".json": json_data = json.load(args.file) data = pd.DataFrame(json_data["benchmarks"]) else: logging.error("Unsupported file extension '{}'".format(extension)) exit(1) except ValueError: logging.error( 'Could not parse the benchmark data. Did you forget "--benchmark_format=[csv|json] when running the benchmark"?' ) exit(1) data["label"] = data["name"].apply(lambda x: x.split("/")[-2]) data["input"] = data["name"].apply(parse_input_size) data[args.metric] = data[args.metric].apply(TRANSFORMS[args.transform]) return data def plot_groups(label_groups, args): """Display the processed data""" + fig, ax = plt.subplots() + for label, group in label_groups.items(): - plt.plot(group["input"], group[args.metric], label=label, marker=".") + ax.plot( + group["input"], + group[args.metric] / 1024 / 1024 / 1024, + label=label, + marker=".", + ) if args.logx: - plt.xscale("log", base=2) + ax.set_xscale("log", base=2) if args.logy: - plt.yscale("log") - plt.xlabel(args.xlabel) - plt.ylabel(args.ylabel) - plt.title(args.title) - plt.legend() + ax.set_yscale("log") + ax.set_xlabel(args.xlabel) + ax.set_ylabel(args.ylabel) + ax.set_title(args.title) + ax.legend() + + ax.vlines([32, 1024, 19712], 0, 120, color="gray") + ax.text(16, 115, "L1") + ax.text(512, 115, "L2") + ax.text(19712 / 2, 115, "L3") + if args.output: logging.info("Saving to %s" % args.output) plt.savefig(args.output) else: plt.show() def main(): """Entry point of the program""" args = parse_args() data = read_data(args) label_groups = {} for label, group in data.groupby("label"): label_groups[label] = group.set_index("input", drop=False) if args.relative_to is not None: try: baseline = label_groups[args.relative_to][args.metric].copy() except KeyError as key: msg = "Key %s is not present in the benchmark output" logging.error(msg, str(key)) exit(1) if args.relative_to is not None: for label in label_groups: label_groups[label][args.metric] /= baseline plot_groups(label_groups, args) if __name__ == "__main__": main() diff --git a/src/performance_measurement/performance_measurement.tex b/src/performance_measurement/performance_measurement.tex index ba7fa24..2446979 100644 --- a/src/performance_measurement/performance_measurement.tex +++ b/src/performance_measurement/performance_measurement.tex @@ -1,693 +1,701 @@ \renewcommand{\FIGREP}{src/performance_measurement/figures} \section{Performance measurement} \label{sec:performance_measurement} \intersec{helvetios} \begin{frame} \frametitle{Goal of this section} \framesubtitle{} 
\begin{itemize} \item Key concepts to quantify performance \begin{itemize} \item Metrics \item Using a profiler \item Scalings, speedup, efficiency \end{itemize} \item Roofline model \end{itemize} \end{frame}
\subsection{Performance metrics} \label{sec:metrics}
\begin{frame} \frametitle{Performance metrics} \framesubtitle{} \begin{itemize} \item How can we quantify performance? \item We need to define a means to measure it \item We will focus on the most interesting metrics for HPC \end{itemize} \vfill \pause \begin{itemize} \item The first that comes to mind is \textit{time}, e.g. time-to-solution \item Derived metrics: speedup and efficiency \end{itemize} \vfill \pause \begin{itemize} \item Scientific codes do computations on floating point numbers \item A second metric is the number of \textit{floating-point operations per second} (\si{\flops}) \end{itemize} \vfill \pause \begin{itemize} \item Finally, the \textit{memory bandwidth} indicates how much data your code transfers per unit of time \end{itemize} \end{frame}
\note{ \begin{itemize} \item My code is super fast, it runs in \SI{2.5}{\ns}! \item It seems fast, but is it? How fast can your hardware go? \item To really understand how well your code exploits the hardware, we use the \si{\flops} and memory BW \item Your hardware has theoretical maximum values for those \item You can compare the values from your code to the max to see how well you use the hardware \end{itemize} }
\subsection{Profiling} \label{sec:profiling}
\begin{frame} \frametitle{Profiling} \framesubtitle{A tool to measure various timings} \begin{itemize} \item Where is my application spending most of its time? \begin{itemize} \item (bad) measure time ``by hand'' using timers and prints (see the sketch on the next slide) \item (good) use a tool made for this, e.g. Intel Amplifier, Score-P, gprof \end{itemize} \end{itemize} \vfill \begin{itemize} \item There are two types of profiling techniques \begin{itemize} \item Sampling: you stop the code every now and then and check in which function you are \item Code instrumentation: instructions are added at compile time to trigger measurements \end{itemize} \end{itemize} \vfill \begin{itemize} \item In addition to timings, profilers give you a lot more information on \begin{itemize} \item Memory usage \item Hardware counters \item CPU activity \item MPI communications \item etc. \end{itemize} \end{itemize} \end{frame}
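+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Timing ``by hand''}
+  \begin{itemize}
+  \item For reference, a minimal sketch of the ``by hand'' approach (hypothetical snippet, not part of the course examples); it does not scale to a whole application, which is why profilers exist
+  \end{itemize}
+  \begin{cxxcode}{Manual timing with std::chrono}
+    #include <chrono>
+    #include <iostream>
+    #include <vector>
+
+    int main() {
+      std::vector<double> x(100000000, 1.);
+
+      auto start = std::chrono::steady_clock::now();
+      double sum = 0.;
+      for (auto v : x) sum += v; // kernel to be timed
+      auto stop = std::chrono::steady_clock::now();
+
+      // elapsed wall-clock time in seconds
+      std::chrono::duration<double> elapsed = stop - start;
+      std::cout << "sum = " << sum << " in " << elapsed.count() << " s\n";
+      return 0;
+    }
+  \end{cxxcode}
+\end{frame}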
\begin{frame}[fragile,exercise] \frametitle{Profiling} \framesubtitle{Interactive demonstration} \begin{itemize} \item For the purpose of this exercise, we will use MiniFE \begin{itemize} \item 3D implicit finite-elements on an unstructured mesh \item C++ mini application \item \url{https://github.com/Mantevo/miniFE} \item You don't need to understand what the code does! \end{itemize} \item We will use Intel VTune, part of the \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html\#base-kit}{OneAPI Base toolkit (free)} \end{itemize} \vfill \begin{itemize} \item Download miniFE \item Compile the basic version found in \cmd{ref/src} \item Profile the code using the hotspot analysis \item Open Intel VTune and select your timings \item Play around and find the 5 most time-consuming functions \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Compile MiniFE} \begin{itemize} \item Download miniFE \begin{bashcode} $> git clone https://github.com/Mantevo/miniFE.git $> cd miniFE \end{bashcode} \item Compile the basic version found in \code{ref/src} \begin{itemize} \item You will need to load a compiler and an MPI library \begin{bashcode} $> module load intel intel-mpi intel-vtune \end{bashcode}%$ \item Change the \cmd{Makefile} to set \cmd{CXX=mpiicpc} and \cmd{CC=mpiicc} and compile \begin{bashcode} $> make \end{bashcode}%$ \item Make sure to compile your code with \cmd{-g -O3} \end{itemize} \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Profile MiniFE} \begin{itemize} \item Profile the code using \begin{bashcode} $> srun -n 1 amplxe-cl -collect hotspots -r prof_results -- ./miniFE.x -nx 128 -ny 128 -nz 128 \end{bashcode}%$ \item This will profile for the ``hotspots'' and store the timings in \cmd{prof\_results} \item You can get more info on the types of analysis with \begin{bashcode} $> amplxe-cl -h collect \end{bashcode}%$ \item Open Intel VTune and select your timings \begin{bashcode} $> amplxe-gui prof_results/prof_results.amplxe \end{bashcode}%$ \item Play around and find the 5 most time-consuming functions \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{What do we learn?} \begin{itemize} \item 50.0\% of the time spent in matrix/vector multiplications \item 12.5\% of the time spent imposing boundary conditions \item etc. \item Does the problem size influence the timings? \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Smaller problem} \begin{itemize} \item This time, we profile a problem of size $(16, 16, 16)$ \item 13.6\% of the time is spent opening libraries \item 13.6\% of the time is spent initializing MPI \item etc. \item Depending on the problem size, different parts of the code will dominate \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Some tips and tricks} \begin{itemize} \item Profile a code without bugs! \item Choose the right problem size (representative of your simulations) \item Focus on the functions taking the most time first \item If the profile is hard to interpret, try refactoring into smaller functions \begin{itemize} \item Some profilers, e.g. Score-P, let you define custom regions (see the sketch on the next slide) \end{itemize} \end{itemize} \end{frame}
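+\begin{frame}[fragile]
+  \frametitle{Profiling}
+  \framesubtitle{Custom regions with Score-P}
+  \begin{itemize}
+  \item A sketch of Score-P user instrumentation (from memory, to be checked against the Score-P documentation); the code has to be built through the instrumenter, \eg{} \cmd{scorep --user mpicxx foo.cc}
+  \end{itemize}
+  \begin{cxxcode}{Sketch of a Score-P custom region}
+    #include <scorep/SCOREP_User.h>
+
+    void solve() {
+      // handle identifying this region in the measurements
+      SCOREP_USER_REGION_DEFINE(solver_region)
+      SCOREP_USER_REGION_BEGIN(solver_region, "solver",
+                               SCOREP_USER_REGION_TYPE_COMMON)
+      // ... code to measure ...
+      SCOREP_USER_REGION_END(solver_region)
+    }
+  \end{cxxcode}
+\end{frame}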
\subsection{Scalings, speedup and efficiency} \label{sec:scalings}
\begin{frame} \frametitle{Speedup and efficiency} \framesubtitle{} \begin{itemize} \item Two important metrics are derived from timings \item Compare timings with $n$ processes, $T_{n}$, against the reference timing, $T_\text{ref}$ \end{itemize} \vfill \begin{minipage}{0.3\linewidth} \begin{center} \textbf{Speedup} \end{center} \begin{equation*} S(n) = \frac{T_\text{ref}}{T_{n}} \end{equation*} \end{minipage} \hspace{0.5cm} \begin{minipage}{0.3\linewidth} \begin{center} \textbf{Efficiency} \end{center} \begin{equation*} E(n) = \frac{S(n)}{n} \end{equation*} \end{minipage} \vfill \begin{itemize} \item We want $S(n)$ as close to $n$ and $E(n)$ as close to 1 (100\%) as possible \end{itemize} \end{frame}
\begin{frame}[t] \frametitle{Strong scaling} \framesubtitle{} \begin{itemize} \item Scalings are a way to assess how well a program performs when adding computational resources \item Strong scaling: add resources, keep total amount of work constant \begin{equation*} S(n) = \frac{T_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{nT_{n}} \end{equation*} \item Strong scaling is an indication of how profitable it is to add resources to solve your problem \end{itemize} \addimage[width=6cm]{\FIGREP/strong_scaling}{5cm}{1cm} \end{frame}
\begin{frame}[t] \frametitle{Weak scaling} \framesubtitle{} \begin{itemize} \item Weak scaling: add resources and maintain amount of work per resource constant \begin{equation*} S(n) = \frac{nT_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{T_{n}} \end{equation*} \item Weak scaling is an indication of how well your code will perform on a bigger machine (and with a bigger problem) \item These scalings are always required for a proposal \begin{itemize} \item For strong scalings the metric is speedup (how do I improve performance) \item For weak scalings the metric is efficiency (how well performance is kept) \end{itemize} \end{itemize} \addimage[width=6cm]{\FIGREP/weak_scaling}{5cm}{1cm} \end{frame}
\subsection{Amdahl's law} \label{sec:amdahl}
\begin{frame}[t] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Amdahl's law gives you an upper bound on the achievable speedup for a fixed problem size \item By definition it is a strong scaling analysis \vfill \pause \item Assume a fraction $p$ of your code is (perfectly) parallel and the timing with 1 process is $T_{1}$ \item The timing with $n$ processes is \begin{equation*} T_{n} = (1-p) T_{1} + \frac{p}{n}T_{1} = \left[ (1-p) + \frac{p}{n}\right] T_{1} \end{equation*} \pause \item Speedup becomes \begin{equation*} S(n) = \frac{T_{1}}{T_{n}} = \frac{1}{(1-p) + \frac{p}{n}} \end{equation*} \vfill \pause \item In the limit of infinite resources \begin{equation*} \lim_{n\rightarrow\infty}S(n) = \frac{1}{1-p} \end{equation*} \end{itemize} \onslide<2->\addimage[width=3cm]{\FIGREP/amdahl_illustration}{12.5cm}{1.0cm} \end{frame}
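+\note{
+  \begin{itemize}
+  \item Worked example: with $p = 0.95$ and $n = 64$ processes, $S(64) = 1/(0.05 + 0.95/64) \approx 15.4$
+  \item Even with infinite resources, the speedup can never exceed $1/(1-p) = 20$
+  \end{itemize}
+}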
\begin{frame}[b] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Limited by the serial part (very sensitive)! \item Does this mean we cannot exploit large HPC machines? \pause \item No, in general with more resources, we simulate larger systems $\Rightarrow$ weak scaling (see \href{https://en.wikipedia.org/wiki/Gustafson\%27s_law}{Gustafson's law}) \end{itemize} \onslide<1->\addimage[width=8.cm]{\FIGREP/amdahl_speedup}{4cm}{2cm} \end{frame}
\begin{frame} \frametitle{\si{\flops} and memory bandwidth} \framesubtitle{} \begin{itemize} \item FLOPs are floating point operations, e.g. $+, -, \times, \div$ \item Can be evaluated by hand, dividing the number of operations by the running time \vfill \item Memory bandwidth measures the amount of data transferred per unit of time [\si{\byte\per\second}, \si{\kibi\byte\per\second}, \si{\mebi\byte\per\second}, \si{\gibi\byte\per\second}, ...] \item Can be measured by hand, dividing the amount of data transferred by the running time \vfill \item In both cases, generally use tools such as PAPI, Tau, likwid, Intel Amplxe, STREAM, etc. \end{itemize} \end{frame}
\begin{frame}[t,fragile] \frametitle{Performance measurement} \framesubtitle{A simple DAXPY example} \begin{itemize} \item Assume \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132} (Gacrux) \end{itemize} \cxxfile[% title={optimization/daxpy.cc}, minted options app={ % highlightlines={2, 7}, firstline=25, lastline=27, firstnumber=1, }]{examples/optimization/daxpy.cc} \begin{itemize} \item My code runs in \SI{174.25}{\ms}. It is amazingly fast! \end{itemize} \pause \vfill \begin{itemize} \item Each iteration has 2 FLOP (1 add and 1 mul) and there are \cmd{N = 1e8} iterations \item Our code achieves $\SI{2d8}{\flop} / \SI{174.25d-3}{\second} = \SI{0.001}{\tera\flops}$ \item Our hardware can achieve a theoretical peak performance of $\SI{1.16}{\tera\flops}$... \end{itemize} \pause \vfill \begin{itemize} \item Each iteration has 3 memory operations (2 loads and 1 store) \item Our code achieves $\SI{2.23}{\gibi\byte} / \SI{174.25d-3}{\second} = \SI{12.82}{\gibi\byte\per\second}$ \item Our hardware can achieve a theoretical memory bandwidth of $\SI{125}{\gibi\byte\per\second}$... \end{itemize} \end{frame}
\subsection{Roofline model} \label{sec:roofline}
\begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} \item How well am I exploiting the hardware resources? \item The roofline model is a performance model that gives an estimate of the answer to this question \end{itemize} \vspace{1cm} \pause \begin{itemize} \item Key concept: the arithmetic intensity, $AI$, of an algorithm is the number of \si{\flop\per\byte} of data transferred \item It measures data reuse \end{itemize} \addimage[width=8.cm]{\FIGREP/ai}{4cm}{0.5cm} \end{frame}
\begin{frame}[t,fragile] \frametitle{Roofline model} \framesubtitle{How to find arithmetic intensity} \begin{itemize} \item For very simple algorithms, you can compute the AI \item Let's go back to the DAXPY example \cxxfile[% title={optimization/daxpy.cc}, minted options app={ % highlightlines={2, 7}, firstline=25, lastline=27, firstnumber=1, }]{examples/optimization/daxpy.cc} \item There are 2 operations (1 add and 1 mul) \item Three 8-byte memory operations (2 loads and 1 store) \item The AI is then $2/24 = 1/12$ \pause \item For more complex algorithms, use a tool, e.g.
Intel Advisor \end{itemize} \end{frame}
\begin{frame}[t] \frametitle{Roofline model} \framesubtitle{Building the model} \begin{itemize} \item The roofline model is plotted on a \textbf{log-log scale} \begin{itemize} \item x-axis is the $AI$ \item y-axis is \si{\flops} \end{itemize} \pause \item The hardware limits are defined by \begin{equation*} P = \min(P_{\text{max}}, b_{s} \cdot AI) \end{equation*} \begin{itemize} \item $P_{\text{max}}$ is the CPU peak \si{\flops} \item $AI$ is the arithmetic intensity \item $b_{s}$ is the memory BW \end{itemize} \end{itemize} \onslide<1>\addimage[width=5cm]{\FIGREP/roofline_1}{5.5cm}{0.5cm} \onslide<2>\addimage[width=5cm]{\FIGREP/roofline_2}{5.5cm}{0.5cm} \onslide<3>\addimage[width=5cm]{\FIGREP/roofline_3}{5.5cm}{0.5cm} \end{frame}
\begin{frame}[t] \frametitle{Roofline model} \framesubtitle{Building the model} \begin{itemize} \item Refinements can be made to the Roofline model \item Adding a memory hierarchy with caches \item Adding different levels of DLP (Data-Level parallelism) \item They give you hints on what to optimize for \end{itemize} \addimage[width=7cm]{\FIGREP/roofline_extended}{4.5cm}{0.5cm} \end{frame}
\begin{frame}[fragile,t] \frametitle{Roofline model} \framesubtitle{How to find the peak performance} \begin{itemize} \item Theoretical peak performance\\ \vspace{-2ex} \begin{minipage}{.4\linewidth} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} \text{Number of FP ports (ILP)} \\ & \times \text{flops} / \text{cycles (e.g. 2 for FMA)} \\ & \times \text{vector size (DLP)} \\ & \times \text{frequency (in GHz)} \\ & \times \text{number of cores (TLP)} \end{align*} \end{minipage} \vspace{3ex} \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132}\\ \vspace{-2ex} \begin{minipage}{.4\linewidth} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} 2 \text{ (ports)} \\ & \times \SI{2}{\flop\per\cycle} \text{ (2 for FMA)} \\ & \times \frac{\SI{512}{\bit} \text{ (AVX512)} }{\SI{64}{\bit}\text{ (double)}} \\ & \times \SI{2.6}{\giga\hertz} \\ & \times 14 \text{ (cores)} \\ = & \SI{1.16}{\tera\flops} \end{align*} \end{minipage} \addimage[width=6cm]{\FIGREP/skylake_server_block_diagram}{9cm}{0.8cm} \pause \vspace{3ex} \item Or use software that estimates it \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Roofline model} \framesubtitle{How to find the memory bandwidth} \begin{itemize} \item Theoretical memory bandwidth of the memory \begin{align*} \text{BW}_{\text{max}} = &\textcolor{white}{\times} \text{Number of transfers per second} \\ & \times \text{Bus width} \\ & \times \text{Number of interfaces} \end{align*} \item In general, we suppose that RAM matches CPU bandwidth (found in the CPU specifications) \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132} \begin{align*} \text{BW}_{\text{max}} = &\textcolor{white}{\times} \SI{2666}{\mega\transfer\per\second} \text{ (DDR4 2666)} \\ & \times \SI{8}{\byte\per\transfer} \text{ (64bit bus)}\\ & \times 6 \text{ (channels)} \end{align*} \begin{itemize} \item $\SI{19.86}{\gibi\byte\per\second}$ for 1 channel \item Maximum of $\SI{119.18}{\gibi\byte\per\second}$ \end{itemize} \pause \item Or use software that estimates it \end{itemize} \begin{itemize} \item A corollary of ``theoretical'': these numbers are not achievable in practice! \end{itemize} \end{frame}
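+\note{
+  \begin{itemize}
+  \item Putting the numbers together for DAXPY ($AI = 1/12$): $P = \min(\SI{1.16}{\tera\flops}, \SI{119.18}{\gibi\byte\per\second} \cdot 1/12) \approx \SI{10}{\giga\flops}$, so the kernel is strongly memory bound
+  \item Using the measured \SI{12.82}{\gibi\byte\per\second} instead, the model predicts $\approx \SI{1.1}{\giga\flops}$, consistent with the \SI{0.001}{\tera\flops} measured earlier
+  \end{itemize}
+}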
\begin{frame}[fragile] \frametitle{Roofline model} \framesubtitle{How to measure the actual values} \begin{itemize} \item Peak performance measurement \begin{itemize} \item Using a compute bound kernel \item Using dgemm:\\ 1 core: $\SI{98.0}{\giga\flops}$\\ 14 cores: $\SI{965.0}{\giga\flops}$ \end{itemize} \item Bandwidth measurement \begin{itemize} \item Using a memory bound kernel \item Using stream (triad):\\ 1 core: $\SI{12.7}{\gibi\byte\per\second}$\\ 6 cores: $\SI{70.1}{\gibi\byte\per\second}$\\ 9 cores: $\SI{82.7}{\gibi\byte\per\second}$
- \addimage[width=6cm]{\FIGREP/skylake_server_block_diagram}{9cm}{0.8cm}
\end{itemize} \end{itemize}
+ \addimage[width=6cm]{\FIGREP/bandwidth}{9cm}{1.5cm}
\end{frame}
+\begin{frame}[fragile]
+  \frametitle{Roofline model}
+  \framesubtitle{Intel Amplifier}
+  \addimage[width=12cm]{\FIGREP/Roofline}{1cm}{1cm}
+\end{frame}
+
+
\begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{} \begin{itemize} \item We now have a pretty good idea of which part of the code to optimize \item Different options are possible (in order of complexity) \begin{enumerate} \item Compiler and linker flags \item Optimized external libraries \item Handmade optimization (loop reordering, better data access, etc.) \item Algorithmic changes \end{enumerate} \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Compiler flags} \begin{itemize} \item Compilers have a set of optimizations they can do (if possible) \item You can find a list of \href{https://gcc.gnu.org/onlinedocs/gcc/gcc-command-options/options-that-control-optimization.html}{options for GNU compilers in their documentation} \pause \item Common options are: \begin{itemize} \item \cmd{-O0}, \cmd{-O1}, \cmd{-O2}, \cmd{-O3}: from almost no optimizations to most optimizations \pause \item \cmd{-Ofast}: activates more aggressive options, \eg{} \cmd{-ffast-math} (but can produce wrong results in some particular cases) \end{itemize} \pause \item Test your program with different options (\cmd{-O3} does not necessarily lead to faster programs) \item Note that the more optimizations, the longer the compilation time \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Optimized libraries} \begin{itemize} \item Do not re-invent the wheel! \item A lot of optimized libraries exist with different purposes (solvers, data structures, I/O, etc.). A few examples: \begin{itemize} \item Solvers: PETSc, MUMPS, LAPACK, scaLAPACK, PARDISO, etc. \item I/O: HDF5, ADIOS, etc. \item Math libraries: FFTW, BLAS, etc. \end{itemize} \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Handmade optimizations} \begin{itemize} \item Sometimes, we cannot rely on compiler options or libraries and we must optimize ``by hand'' \item Usually, the goal is to rewrite the code in such a way that the compiler can optimize it (see the sketch on the next slide) \item Start by having a correct program before trying to optimize \item ``Premature optimization is the root of all evil'', D. Knuth \end{itemize} \end{frame}
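+\begin{frame}[fragile]
+  \frametitle{Optimization}
+  \framesubtitle{Handmade optimizations: loop reordering}
+  \begin{itemize}
+  \item A classical illustration (sketch, not from the course examples): traversing a row-major matrix column by column wastes the cache lines that were just loaded
+  \end{itemize}
+  \begin{cxxcode}{Loop reordering for contiguous accesses}
+    #include <vector>
+
+    // a is a dense n x n matrix stored row-major in a single vector
+    double sum_colmajor(const std::vector<double> &a, int n) {
+      double s = 0.;
+      for (int j = 0; j < n; ++j)   // slow: stride-n accesses
+        for (int i = 0; i < n; ++i)
+          s += a[i * n + j];
+      return s;
+    }
+
+    double sum_rowmajor(const std::vector<double> &a, int n) {
+      double s = 0.;
+      for (int i = 0; i < n; ++i)   // fast: contiguous accesses
+        for (int j = 0; j < n; ++j)
+          s += a[i * n + j];
+      return s;
+    }
+  \end{cxxcode}
+\end{frame}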
\subsection{Pareto principle} \label{sec:pareto}
\begin{frame} \frametitle{Pareto principle} \framesubtitle{The 80/20 rule} \begin{itemize} \item General principle that states that 80\% of the effect comes from 20\% of the causes \item Applies in many domains and especially in optimization \item 80\% of the time is spent in 20\% of your code \item Concentrate on those 20\% and don't optimize arbitrarily \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Optimization} \framesubtitle{Algorithmic optimizations} \begin{itemize} \item Example of matrix/matrix multiplication. The graph shows the complexity ($\mathcal{O}(n^{\omega})$) of different algorithms \end{itemize} \onslide<2>\addimage[width=7cm]{\FIGREP/matmul}{4.5cm}{0.5cm} \end{frame}
\begin{frame}[fragile,t] \frametitle{Parallelization} \framesubtitle{When to parallelize} \begin{itemize} \item Only when your code has \textit{no bugs} and is \textit{optimized} \item Are you ready to parallelize? \begin{enumerate} \item Is it worth parallelizing my code? Does my algorithm scale? \item Performance prediction? \item Profiling? \item Bottlenecks? \item Which parallel paradigm should I use? What is the target architecture (SMP, cluster, GPU, hybrid, etc.)? \end{enumerate} \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Parallelization} \framesubtitle{When to parallelize} In 1991, David H. Bailey published a famous paper: \href{https://www.davidhbailey.com/dhbpapers/twelve-ways.pdf}{Twelve ways to fool the masses when giving performance results on parallel computers} \vspace{1cm} \textit{6: Compare your results against scalar, unoptimized code on Crays.} \addimage[width=7cm]{\FIGREP/dhb}{4.5cm}{0.5cm} \end{frame}
%%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: