diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 51b599a..59911a1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,8 +1,8 @@ project(phys-743-examples) cmake_minimum_required(VERSION 3.0.0) add_compile_options(-Wall -Wextra -pedantic) +add_subdirectory(optimization) add_subdirectory(openmp) add_subdirectory(mpi) -#add_subdirectory(optimization) diff --git a/examples/mpi/CMakeLists.txt b/examples/mpi/CMakeLists.txt index c0de803..29e5b0d 100644 --- a/examples/mpi/CMakeLists.txt +++ b/examples/mpi/CMakeLists.txt @@ -1,8 +1,8 @@ find_package(MPI REQUIRED COMPONENTS MPICXX) -foreach(tgt hello) +foreach(tgt hello_mpi) add_executable(${tgt} ${tgt}.cc) target_link_libraries(${tgt} PRIVATE ${MPI_CXX_LIBRARIES}) target_include_directories(${tgt} PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_compile_options(${tgt} PRIVATE ${MPI_CXX_COMPILE_OPTIONS}) endforeach() diff --git a/examples/mpi/hello.c b/examples/mpi/hello_mpi.cc similarity index 100% rename from examples/mpi/hello.c rename to examples/mpi/hello_mpi.cc diff --git a/examples/openmp/Makefile b/examples/openmp/Makefile deleted file mode 100644 index e7b570c..0000000 --- a/examples/openmp/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -CC=gcc -CFLAGS=-Wall -Werror -g -fopenmp -LDFLAGS=-lm - -EXECUTABLES=hello reduction scheduler - -all: $(EXECUTABLES) - -hello: hello.c - $(CC) $< $(CFLAGS) -o $@ - -reduction: reduction.c - $(CC) $< $(CFLAGS) -o $@ - -scheduler: scheduler.c timing.o - $(CC) $< timing.o $(CFLAGS) -o $@ $(LDFLAGS) - -clean: - rm -f $(EXECUTABLES) timing.o diff --git a/examples/openmp/fibonacci.c b/examples/openmp/fibonacci.c deleted file mode 100644 index 4021c50..0000000 --- a/examples/openmp/fibonacci.c +++ /dev/null @@ -1,39 +0,0 @@ -#include "timing.h" -#include -#include -#include - -long int fibonacci(int n) { - long int x, y; - if (n < 2) { - return n; - } else { -#pragma omp task shared(x) - x = fibonacci(n-1); -#pragma omp task shared(y) - y = fibonacci(n-2); 
-#pragma omp taskwait - return (x+y); - } -} - -int main() { - int n = 42; - double t1, t2; - long int fib = 0; - - t1 = second(); -#pragma omp parallel - { -#pragma omp single nowait - { - fib = fibonacci(n); - } - } - - t2 = second(); - - printf("fib(%d) = %ld (in %g [s])\n", n, fib, (t2-t1)); - - return 0; -} diff --git a/examples/openmp/fibonacci_cutoff.c b/examples/openmp/fibonacci_cutoff.c deleted file mode 100644 index 9e17e0a..0000000 --- a/examples/openmp/fibonacci_cutoff.c +++ /dev/null @@ -1,56 +0,0 @@ -#include "timing.h" -#include -#include -#include - -long int fibonacci_seq(int n) { - long int x, y; - if (n < 2) { - return n; - } else { - x = fibonacci_seq(n-1); - y = fibonacci_seq(n-2); - return (x+y); - } -} - -long int fibonacci(int n, int level, int cutoff) { - long int x, y; - if (n < 2) { - return n; - } else if (level < cutoff) { -#pragma omp task shared(x) - x = fibonacci(n-1, level+1, cutoff); -#pragma omp task shared(y) - y = fibonacci(n-2, level+1, cutoff); -#pragma omp taskwait - return (x+y); - } else { - x = fibonacci_seq(n - 1); - y = fibonacci_seq(n - 2); - return (x+y); - } -} - - -int main() { - int n = 42; - int cutoff = 10; - double t1, t2; - long int fib = 0; - - t1 = second(); -#pragma omp parallel - { -#pragma omp single nowait - { - fib = fibonacci(n, 0, cutoff); - } - } - - t2 = second(); - - printf("Fib(%d) = %ld (in %g [s])\n", n, fib, (t2-t1)); - - return 0; -} diff --git a/examples/openmp/hello.c b/examples/openmp/hello.c deleted file mode 100644 index 843bd81..0000000 --- a/examples/openmp/hello.c +++ /dev/null @@ -1,22 +0,0 @@ -#include -#if defined (_OPENMP) -#include -#endif - -int main(int argc, char *argv[]) { - int myrank=0; - int mysize=1; - -#if defined (_OPENMP) -#pragma omp parallel default(shared) private(myrank, mysize) - { - mysize = omp_get_num_threads(); - myrank = omp_get_thread_num(); -#endif - printf("Hello from thread %d out of %d\n", myrank, - mysize); -#if defined (_OPENMP) - } -#endif - return 0; -} 
diff --git a/examples/openmp/private.cc b/examples/openmp/private.cc index 38c02c5..bb0285e 100644 --- a/examples/openmp/private.cc +++ b/examples/openmp/private.cc @@ -1,18 +1,22 @@ #include #include int main() { - int a = 1, b = 2, c = 3; + int a = 1, b = 2; + double c = 3.; - std::printf("Variable a, b, c: %i, %i, %i (before)\n", a, b, c); + std::printf("Thread %i sees, a, b, c: %i, %i, %g (before)\n", + omp_get_thread_num(), a, b, c); #pragma omp parallel num_threads(3), private(a), firstprivate(b) { - std::printf("Variable a, b, c: %i, %i, %i (inside)\n", a, b, c); - c = 10; + std::printf("Thread %i sees, a, b, c: %i, %i, %g (inside)\n", + omp_get_thread_num(), a, b, c); + c = -1e-3; } - std::printf("Variable a, b, c: %i, %i, %i (after)\n", a, b, c); + std::printf("Thread %i sees, a, b, c: %i, %i, %g (after)\n", + omp_get_thread_num(), a, b, c); return 0; } diff --git a/examples/openmp/reduction.c b/examples/openmp/reduction.c deleted file mode 100644 index b231f44..0000000 --- a/examples/openmp/reduction.c +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include -#include - - -int main(int argc, char *argv[]) { - int * vec; - int global_sum, i; - int size_vec = 10; - - vec = (int*) malloc (size_vec*sizeof(int)); - global_sum = 0; - - for (i = 0; i < size_vec; i++) { - vec[i] = i; - } - -#pragma omp parallel for reduction(+:global_sum) - for (i = 0; i < size_vec; i++) { - global_sum += vec[i]; - } - - printf("sum = %i\n", global_sum); - - return 0; -} diff --git a/examples/openmp/scheduler.c b/examples/openmp/scheduler.c deleted file mode 100644 index 45d1b8e..0000000 --- a/examples/openmp/scheduler.c +++ /dev/null @@ -1,96 +0,0 @@ -#include "timing.h" -#include -#include -#include -#include - -double verification(double ** matrix, int N){ - double ret = 0.; - int i,j; - - for (i=0;i -#include - -double second() { - struct timeval tp; - gettimeofday(&tp, NULL); - return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6); -} diff --git a/examples/openmp/timing.h 
b/examples/openmp/timing.h deleted file mode 100644 index 2d64f7e..0000000 --- a/examples/openmp/timing.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef TIMING_H -#define TIMING_H - -double second(); - -#endif /* TIMING_H */ diff --git a/examples/optimization/CMakeLists.txt b/examples/optimization/CMakeLists.txt new file mode 100644 index 0000000..88677d5 --- /dev/null +++ b/examples/optimization/CMakeLists.txt @@ -0,0 +1,3 @@ +foreach(tgt daxpy) + add_executable(${tgt} ${tgt}.cc) +endforeach() diff --git a/examples/optimization/Makefile b/examples/optimization/Makefile deleted file mode 100644 index 637f571..0000000 --- a/examples/optimization/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -GSL_LIB=-L${GSL_LIBRARY} -lgsl -lgslcblas -GSL_INC=-I${GSL_INCLUDE} -GCC_OPT=-O3 -ftree-vectorize - -MKL_FLAGS=-DMKLINUSE -DMKL_ILP64 -I${MKLROOT}/include -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lpthread -ICC_OPT=-O3 -xHost - -dgemm_gcc: dgemm.c - gcc $(GCC_OPT) dgemm.c $(GSL_LIB) $(GSL_INC) -lm -o dgemm_gcc - -dgemm_icc: dgemm.c - icc $(ICC_OPT) dgemm.c $(MKL_FLAGS) -lm -o dgemm_icc diff --git a/examples/optimization/daxpy.cpp b/examples/optimization/daxpy.cc similarity index 78% rename from examples/optimization/daxpy.cpp rename to examples/optimization/daxpy.cc index 4fbb642..26185d7 100644 --- a/examples/optimization/daxpy.cpp +++ b/examples/optimization/daxpy.cc @@ -1,34 +1,34 @@ #include #include #include #include -using clock = std::chrono::high_resolution_clock; +using clk = std::chrono::high_resolution_clock; using millisecond = std::chrono::duration; int main() { const int N = 1e8; std::vector a(N), b(N), c(N); for (int i = 0; i < N; ++i) { a[i] = 1.0 * i; b[i] = 10.0 * i; } - std::mt19937 gen(); + std::mt19937 gen(0); std::uniform_real_distribution<> rand(0, 1000); - auto alpha = rand(); + auto alpha = rand(gen); - auto t1 = clock::now(); + auto t1 = clk::now(); for (int i = 0; i < N; ++i) { c[i] = a[i] + alpha * b[i]; } - auto t2 = clock::now(); + 
auto t2 = clk::now(); millisecond ms_double = t2 - t1; std::cout << "For loop ran in " << ms_double.count() << "ms" << std::endl; return 0; } diff --git a/examples/optimization/dgemm.c b/examples/optimization/dgemm.c deleted file mode 100644 index 1711f31..0000000 --- a/examples/optimization/dgemm.c +++ /dev/null @@ -1,255 +0,0 @@ -// -------------------------------------------------------------- -// File: ex10.c -// Description: Example with the sections directive -// Software stack: Exercises suite for V. Keller lecture -// Version: 1.0 -// License: BSD -// Author: Vincent Keller (Vincent.Keller@epfl.ch), CADMOS, 2012 -// -// COMPILATION AND LINKING -// -// WITHOUT MKL : gcc -O3 -ftree-vectorize dgemm.c -lgsl -lgslcblas -lm -o dgemm -// WITH MKL : export MKLROOT=/software/intel/mkl; icc -DMKLINUSE -DMKL_ILP64 -openmp -I${MKLROOT}/include -O3 -xHost dgemm.c -L${MKLROOT}/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_intel_thread -lpthread -lm -o dgemm -// -------------------------------------------------------------- - -#include -#include -#include - -#if defined(MKLINUSE) -#include "mkl.h" -#else -#include -#endif - - -double verification(double ** array, int N){ - double ret; - int i,j; - for (i=0;i % Requirement \RequirePackage{tikz} \RequirePackage{etoolbox} % Settings \useinnertheme{scitas} \useoutertheme{scitas} \usecolortheme{scitas} \newif\if@gridMode \@gridModefalse \DeclareOption{grid}{\@gridModetrue} \ProcessOptions \setbeamertemplate{navigation symbols}{} \setbeamertemplate{blocks}[rounded][shadow=true] % % Uncomment the following if you want the sections close to each other in the TOC % \makeatletter % \patchcmd{\beamer@sectionintoc} % {\vfill} % {\vskip\itemsep} % {} % {} % \makeatother \setbeamersize{text margin left=2.2cm,text margin right=1em} \if@gridMode \setbeamertemplate{background canvas}{% \begin{tikzpicture} \draw[step=0.5,white!90!black,thin] (0,0) grid (\paperwidth,\paperheight); \draw[step=2,white!60!black,thin] (0,0) grid 
(\paperwidth,\paperheight); \end{tikzpicture}} \fi -\newcommand{\addimage}[4]{% +\newcommand{\addimage}[4][]{% \begin{tikzpicture}[overlay,remember picture] \begin{scope}[shift={(current page.south west)}] - \node[anchor=south west, inner sep=0pt] at (#3, #4) {\includegraphics[width=#2]{#1}}; + \node[anchor=south west, inner sep=0pt] at (#3, #4) {\includegraphics[#1]{#2}}; % \node[anchor=north west, font=\scriptsize, text width=#2, align=left] at (#3, #4){#5}; \end{scope} \end{tikzpicture} } \newcommand{\intersec}[1]{% { \begin{frame}[plain] \begin{tikzpicture}[overlay,remember picture] \begin{scope}[shift={(current page.south west)}] \node[anchor=south west, inner sep=-0.005cm] at (4cm, 0cm) {\includegraphics[height=9cm]{scitas_theme/figures/#1}}; \fill[color=greyEPFL, anchor=south west] (4,3.25) rectangle (9.50,4.75); \fill[color=redEPFL, anchor=south west] (2,1.55) rectangle (4.00,3.25); \fill[color=greyEPFL, anchor=south west] (4,0.00) rectangle (6.25,1.55); \node[anchor=west, white,font=\large, align=left] at (4,4.0){\textbf{\secname}}; \end{scope} \end{tikzpicture} \end{frame} } } \mode diff --git a/src/basic_concepts/basic_concepts.tex b/src/basic_concepts/basic_concepts.tex index 9a3c6c4..bb2b3f8 100644 --- a/src/basic_concepts/basic_concepts.tex +++ b/src/basic_concepts/basic_concepts.tex @@ -1,230 +1,230 @@ \renewcommand{\FIGREP}{src/basic_concepts/figures} \section{Basic concepts} \label{sec:basic_concepts} \intersec{fidis} \begin{frame} \frametitle{Goal of this section} \framesubtitle{} \begin{itemize} \item Cluster access using \cmd{ssh} \item Data transfers using \cmd{scp} and/or \cmd{rsync} \item Code compilation \item Software modules on the cluster \item Debugging \end{itemize} \end{frame} \subsection{Connection to the cluster} \label{sec:cluster_connection} \begin{frame}[fragile] \frametitle{Connection to the cluster} \framesubtitle{} \begin{itemize} \item Secure shell or better known as SSH \begin{bashcode} $> man ssh ssh (SSH client) is a 
program for logging into a remote machine and for executing commands on a remote machine. It is intended to provide secure encrypted communications [...]. X11 connections [...] can also be forwarded over the secure channel. \end{bashcode}%$ \item Basic usage \begin{bashcode} $> ssh @ \end{bashcode}%$ \cmd{} is your GASPAR username, \cmd{} is any of \cmd{\{helvetios,fidis,izar\}.epfl.ch} \item Example \begin{bashcode} $> ssh jdoe@helvetios.epfl.ch password: ***** \end{bashcode}%$ \end{itemize} \end{frame} \note{ \begin{itemize} \item Read the manual/documentation \item Pause to test connection \end{itemize} } \begin{frame}[fragile] \frametitle{Connection to the cluster} \framesubtitle{Optional step} \begin{itemize} \item The alternative to password is to use a cryptographic key pair \begin{bashcode} $> ssh-keygen -b 4096 -t rsa [Follow the instructions] $> ssh-copy-id jdoe@helvetios.epfl.ch \end{bashcode} \item \cmd{ssh-keygen} generates a public/private key pair \item By default, they are found in \cmd{\~{}/.ssh} \begin{itemize} \item \cmd{id\_rsa.pub} is the public key and can be shared with anyone (\cmd{ssh-copy-id} copies it to the remote) \item \cmd{id\_rsa} is the private key and it is \textbf{SECRET}! \end{itemize} \end{itemize} \end{frame} \note{ \begin{itemize} \item Careful with \cmd{ssh-keygen} to not overwrite existing keys \item \cmd{ssh-copy-id} copies all the keys \item Pause to test \end{itemize} } \subsection{Data transfer} \label{sec:data_transfer} \begin{frame}[fragile] \frametitle{Data transfer} \framesubtitle{} \begin{itemize} \item We are working remotely and need to get your data back locally \item There are two main commands: \begin{bashcode} $> man scp scp copies files between hosts on a network. It uses ssh for data transfer, and uses the same authentication and provides the same security as ssh. \end{bashcode} \begin{bashcode} $> man rsync Rsync is a fast and extraordinarily versatile file copying tool. 
It can copy locally, to/from another host over any remote shell, or to/from a remote rsync daemon. [...] It is famous for its delta-transfer algorithm, which reduces the amount of data sent over the network by sending only the differences between the source files and the existing files in the destination. Rsync is widely used for backups and mirroring and as an improved copy command for everyday use. \end{bashcode} \item Similar usage pattern. The path on a remote host is written \cmd{hostname:/path/to/file}. For example \begin{bashcode} $> scp jdoe@helvetios.epfl.ch:src/myCode/file.c src/ \end{bashcode}%$ \end{itemize} \end{frame} \subsection{Compilation} \label{sec:compilation} \begin{frame} \frametitle{Compilation} \framesubtitle{0100101110101001010...} \begin{itemize} \item A computer only understands ON and OFF states (1 and 0) \item It would be very inconvenient for us to code in binary \item We therefore use different levels of abstraction (languages), e.g. C, C++, Fortran \item We need a translator! \end{itemize} \end{frame} \begin{frame} \frametitle{Compilation} \framesubtitle{The four compilation steps} \begin{itemize} \item Translation is made by a compiler in 4 steps \begin{description} \item[Preprocessing] Format source code to make it ready for compilation (remove comments, execute preprocessing directives such as \cmd{\#include}, etc.) 
\item[Compiling] Translate the source code (C, C++, Fortran, etc) into assembly, a very basic CPU-dependent language \item[Assembly] Translate the assembly into machine code and store it in object files \item[Linking] Link all the object files into one executable \end{description} \item In practice, the first three steps are combined together and simply called ``compiling'' \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Compilation} \framesubtitle{The four compilation steps (visually)} \hspace{6cm} \begin{minipage}{0.5\textwidth} \begin{itemize} \item<5> Note that in reality, everything is done transparently \begin{bashcode} $> gcc -c file_1.c $> gcc -c file_2.c $> gcc file_1.o file_2.o -lexample -o exec \end{bashcode}%$ \end{itemize} \end{minipage} - \onslide<1>\addimage{\FIGREP/compilation_steps_0.pdf}{12cm}{2cm}{1cm} - \onslide<2>\addimage{\FIGREP/compilation_steps_1.pdf}{12cm}{2cm}{1cm} - \onslide<3>\addimage{\FIGREP/compilation_steps_2.pdf}{12cm}{2cm}{1cm} - \onslide<4>\addimage{\FIGREP/compilation_steps_3.pdf}{12cm}{2cm}{1cm} - \onslide<5>\addimage{\FIGREP/compilation_steps_4.pdf}{12cm}{2cm}{1cm} + \onslide<1>\addimage[width=12cm]{\FIGREP/compilation_steps_0.pdf}{2cm}{1cm} + \onslide<2>\addimage[width=12cm]{\FIGREP/compilation_steps_1.pdf}{2cm}{1cm} + \onslide<3>\addimage[width=12cm]{\FIGREP/compilation_steps_2.pdf}{2cm}{1cm} + \onslide<4>\addimage[width=12cm]{\FIGREP/compilation_steps_3.pdf}{2cm}{1cm} + \onslide<5>\addimage[width=12cm]{\FIGREP/compilation_steps_4.pdf}{2cm}{1cm} \end{frame} \subsection{Debugging} \label{sec:debugging} \begin{frame} \frametitle{Debugging} \framesubtitle{A few advices} \begin{itemize} \item Why bother debugging? 
\begin{itemize} \item Studies show $\sim$ 20 bugs/kloc in industry codes \item You don't want to find a bug when on a deadline \end{itemize} \item Only optimize a correct code \end{itemize} \vfill \begin{itemize} \item There are different types of bugs: \begin{description} \item[Syntax error] A code keyword is misspelled, e.g. \cmd{dobule} instead of \cmd{double}. The code doesn't compile and the compiler tells you where is the error. \item[Runtime error] Division by 0 (fpe), out of bound access (seg. fault), etc. The code compiles fine, but will (most likely) crash at runtime. \item[Logical errors] Mistake that leads to an incorrect or unexpected behavior. You want to compute a distance from a velocity and a time, but you use an acceleration instead. \end{description} \item Logical errors are clearly the most dangerous! The compiler doesn't complain and your code runs. You need to test it! \end{itemize} \end{frame} \note{ \begin{itemize} \item This is a parallel programming course. Why bother with debugging? \item After all, it is boring and time consuming. You could use this time to make your code faster instead! \item Small test cases \item Typical ticket: works on my machine but not clusters -> bug on the clusters \end{itemize} } \begin{frame} \frametitle{Debugging} \framesubtitle{A few advices} \begin{itemize} \item Write tests (unit tests, application tests)! \item Write tests! \item Ask the compiler to complain (\cmd{-g -Wall -Wextra}) \item Use debuggers (gdb, TotalView, Alinea DDT) \item Use memory checkers (Valgrind, TotalView, Intel Inspector, \cmd{-fsanitize=address}) \item Don't use print statements (Heisenbug) \end{itemize} \end{frame} \note{ \begin{itemize} \item Test! \item Compiler produces warnings that indicate possible source of bugs (uninitialized value, missleading indentation, loss of precision, ...) 
\item Heisenbug \end{itemize} } %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: diff --git a/src/cluster_architecture/cluster_architecture.tex b/src/cluster_architecture/cluster_architecture.tex index 718c096..88ac573 100644 --- a/src/cluster_architecture/cluster_architecture.tex +++ b/src/cluster_architecture/cluster_architecture.tex @@ -1,169 +1,169 @@ \renewcommand{\FIGREP}{src/cluster_architecture/figures} \section{Cluster Architecture} \label{sec:cluster_architecture} \intersec{helvetios} \begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{} \begin{itemize} \item The goal of this section is to understand what's under the cluster's hood \item In order to take full advantage of your computer, you have to understand how it works, what are the limits, etc. \item We'll go from the cluster level down to the core level \end{itemize} - \addimage{\FIGREP/summit}{7cm}{5.5cm}{0.5cm} + \addimage[width=7cm]{\FIGREP/summit}{5.5cm}{0.5cm} \end{frame} \subsection{Cluster as a whole} \label{sec:cluster} \begin{frame} \frametitle{Cluster Architecture} \framesubtitle{General presentation} \begin{minipage}{0.5\linewidth} \begin{itemize} \item An HPC cluster is composed of \begin{itemize} \item Login node(s) \item Compute nodes \item Storage system \item High performance interconnect \end{itemize} \item Using a scheduler, the simulations are sent to the compute nodes which perform the computations \item The simulation data is written on the storage systems. At SCITAS: \begin{itemize} \item \code{/home}: store source files, input data, small files \item \code{/work}: collaboration space for a group \item \code{/scratch}: temporary huge result files \end{itemize} Please, note that only \cmd{/home} and \cmd{/work} have backups! \cmd{/scratch} data can be erased at any moment! 
\end{itemize} \end{minipage} - \addimage{\FIGREP/abstract_architecture}{6cm}{9.5cm}{1.5cm} + \addimage[width=6cm]{\FIGREP/abstract_architecture}{9.5cm}{1.5cm} \end{frame} \note{ \begin{itemize} \item The users connect to the login node \item Backups on \code{/work} are paying \end{itemize} } \begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{A few numbers} \textbf{Let's go back to Summit} \begin{itemize} \item Second most powerful HPC cluster in the world according to the \href{https://www.top500.org/lists/top500/list/2021/06/}{Top500 June 2021 list} \item It is composed of \SI{4608}{} compute nodes \item Power consumption of \SI{10096.00}{\kilo\watt} % Consommation annuelle par habitant en Suisse ~6956kWh \item Equivalent consumption as a city with $\sim$\SI{13000}{} inhabitants % Prix du kWh a Lausanne ~20 centimes \item In Lausanne, running Summit would cost $\sim$\SI{50000}{\chf\per\day} only for electricity! \end{itemize} - \addimage{\FIGREP/summit}{5cm}{5.5cm}{1.0cm} + \addimage[width=5cm]{\FIGREP/summit}{5.5cm}{1.0cm} \end{frame} \subsection{The compute node} \label{sec:node} \begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{Let's dive into a compute node!} \begin{itemize} \item The compute node is the basic building bloc of a cluster \item It is composed of one or more CPU with RAM (memory) and eventually one or more accelerator, e.g. 
GPUs \item All the nodes are connected together with an interconnect \end{itemize} - \addimage{\FIGREP/node_architecture}{4.5cm}{5.75cm}{1.0cm} + \addimage[width=4.5cm]{\FIGREP/node_architecture}{5.75cm}{1.0cm} \end{frame} \note{ \begin{itemize} \item A compute node is like a personal computer on steroid \end{itemize} } \subsection{The CPU} \label{sec:cpu} \begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{Central processing unit} \begin{itemize} \item The CPU is the ``brain'' of the node \item CPUs work in clock cycles; they are the ``heart beat'' of the CPU \item It is composed of cores and different levels of memories called caches \item There are usually three levels of cache called L1, L2, and L3 \end{itemize} \vspace{0.8cm} \begin{table} \hspace{5cm} \scriptsize \begin{tabular}{@{}llll@{}} \toprule \textbf{Event} & \textbf{Latency} & \textbf{Scaled} & \textbf{Capacity} \\ \midrule 1 CPU cycle & 0.1\,ns & 1 s & -- \\ L1 cache access & 1\,ns & 10\,s & kB \\ L2 cache access & 1\,ns & 10\,s & MB \\ L3 cache access & 10\,ns & 1\,min & MB \\ RAM access & 100\,ns & 10\,min & GB \\ Solid-state disk access & 100\,$\mu$s & 10 days & TB \\ Hard-disk drive access& 1--10\,ms & 1--12 months & TB \\ \bottomrule \end{tabular} \end{table} - \addimage{\FIGREP/cpu_architecture}{4cm}{3cm}{1.5cm} + \addimage[width=4cm]{\FIGREP/cpu_architecture}{3cm}{1.5cm} \end{frame} \note{ \begin{itemize} \item Caches are extremely fast memories that are used to hide the latency of other memories (RAM, hard drive, etc.) \item However, they are usually quite small compared to e.g. RAM, hard drive \item L1 is the closest to the core, followed by L2 and L3 \item Some cache levels are private to a core, e.g. 
here L1 and L2 \end{itemize} } \begin{frame}[t] \frametitle{Cluster Architecture} \framesubtitle{Summary of SCITAS' clusters} \textcolor{red}{I would like to put a summary of SCITAS' clusters, but I would need to introduce also the concept of \si{\flops}...} \vfill \begin{minipage}{0.32\linewidth} \begin{center} \textbf{Fidis} \end{center} \end{minipage} \hfill \begin{minipage}{0.32\linewidth} \begin{center} \textbf{Helvetios} \end{center} \end{minipage} \hfill \begin{minipage}{0.32\linewidth} \begin{center} \textbf{Izar} \end{center} \end{minipage} \end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: diff --git a/src/openmp/figures/skylake_octadeca_core.png b/src/openmp/figures/skylake_octadeca_core.png new file mode 100644 index 0000000..683ce4c Binary files /dev/null and b/src/openmp/figures/skylake_octadeca_core.png differ diff --git a/src/openmp/figures/skylake_sp_2-way_3_upi.pdf b/src/openmp/figures/skylake_sp_2-way_3_upi.pdf new file mode 100644 index 0000000..1c0e181 Binary files /dev/null and b/src/openmp/figures/skylake_sp_2-way_3_upi.pdf differ diff --git a/src/openmp/figures/skylake_sp_2-way_3_upi.svg b/src/openmp/figures/skylake_sp_2-way_3_upi.svg new file mode 100644 index 0000000..7947f5a --- /dev/null +++ b/src/openmp/figures/skylake_sp_2-way_3_upi.svg @@ -0,0 +1,1016 @@ + + + + + + + + image/svg+xml + + + + + + SKLSP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SKLSP + + + UPI + + + + UPI + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + UPI + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/openmp/figures/skylake_sp_core.png b/src/openmp/figures/skylake_sp_core.png new file mode 100644 index 0000000..14af473 Binary files /dev/null and 
b/src/openmp/figures/skylake_sp_core.png differ diff --git a/src/openmp/openmp.tex b/src/openmp/openmp.tex index 295e31b..c849493 100644 --- a/src/openmp/openmp.tex +++ b/src/openmp/openmp.tex @@ -1,1613 +1,806 @@ \renewcommand{\FIGREP}{src/openmp/figures} \section{OpenMP} \label{sec:openmp} \intersec{fidis} \subsection{Task parallelism} \label{sec:task_parallelism} \begin{frame} \frametitle{Goal of this section} \framesubtitle{} \begin{itemize} \item Understand the context of shared memory \item Understand more in detail the architecture of a node \item Get familiar with the OpenMP execution and memory model - \item Getting some speedup with Task parallelism + \item Getting some speedup with Task Level Parallelism \end{itemize} \end{frame} \subsection{Introduction} \label{sec:openmp:introduction} \begin{frame} \frametitle{Releases history, present and future} \framesubtitle{} \begin{itemize} \item{October 1997: Fortran version 1.0 } \item{Late 1998: C/C++ version 1.0 } \item{June 2000: Fortran version 2.0 } \item{April 2002: C/C++ version 2.0 } \item{June 2005: Combined C/C++ and Fortran version 2.5} \item{May 2008: Combined C/C++ and Fortran version 3.0} \item{\textbf{July 2011: Combined C/C++ and Fortran version 3.1}} \item{July 2013: Combined C/C++ and Fortran version 4.0} - \item{November 2015: Combined C/C++ and Fo version 4.5} + \item{November 2015: Combined C/C++ and Fortran version 4.5} \item{November 2018: Combined C/C++ and Fortran version 5.0} \item{November 2020: Combined C/C++ and Fortran version 5.1} \end{itemize} - \addimage{\FIGREP/logo_OpenMP.png}{5cm}{10cm}{5cm} + \addimage[width=5cm]{\FIGREP/logo_OpenMP.png}{10cm}{5cm} \end{frame} \begin{frame}[fragile] \frametitle{Terminology} \framesubtitle{Selected extract of the specification} \begin{itemize} \item Specification: \begin{itemize} \item \href{https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-1.pdf}{Full specification} \item 
\href{https://www.openmp.org/wp-content/uploads/OpenMPRefCard-5.1-web.pdf}{RefCard} \end{itemize} \item Terms: \begin{description} \item[thread] an execution entity with a stack \\ and a static memory (\textit{threadprivate memory}) - \addimage{\FIGREP/thread}{1cm}{11cm}{4.4cm} + \vspace{1ex} + \addimage[width=1cm]{\FIGREP/thread}{11cm}{4.5cm} \item[OpenMP thread] a \textit{thread} managed by the OpenMP runtime\\ - \vspace{1em} - \addimage{\FIGREP/omp-thread}{1cm}{11cm}{3.6cm} + \vspace{2em} + \addimage[width=1cm]{\FIGREP/omp-thread}{11cm}{3.5cm} \item[processor] an hardware unit on which one or more \textit{OpenMP thread} can execute \item[directive] a base language mechanism to specify OpenMP program behavior \item[construct] an OpenMP executable directive and the associated statement, loop nest or structured block, if any, not including the code in any called routines. That is, the lexical extent of an executable directive. \end{description} \end{itemize} \end{frame} \begin{frame} \frametitle{Memory Model} \framesubtitle{Shared memory} - \onslide<1>\addimage{\FIGREP/detailed_node_architecture}{7.5cm}{4.75cm}{1.2cm} - \onslide<2>\addimage{\FIGREP/detailed_node_architecture_wo_gpus}{7.5cm}{4.75cm}{1.2cm} - \onslide<3>\addimage{\FIGREP/detailed_node_architecture_w_cores}{7.5cm}{4.75cm}{1.2cm} - \onslide<4>\addimage{\FIGREP/detailed_node_architecture_essential}{7.5cm}{4.75cm}{1.2cm} + \onslide<1>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture}{4.75cm}{1.2cm} + \onslide<2>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_wo_gpus}{4.75cm}{1.2cm} + \onslide<3>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_w_cores}{4.75cm}{1.2cm} + \onslide<4>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_essential}{4.75cm}{1.2cm} \end{frame} \begin{frame} \frametitle{Execution Model} \framesubtitle{Fork/join} - \onslide<1>\addimage{\FIGREP/fork-join}{6.5cm}{4.75cm}{1.2cm} + \addimage{\FIGREP/fork-join}{4.75cm}{1.2cm} \end{frame} 
\begin{frame}[fragile] \frametitle{Compiling an OpenMP code} \framesubtitle{pragmas and compilation} \begin{itemize} \item OpenMP directives are written as pragmas: \cxxinline{#pragma omp} \item Use the conditional compilation flag \cxxinline{#if defined _OPENMP} for the preprocessor \end{itemize} \hspace{2cm} \pause \begin{itemize} \item Compilation using the GNU compiler: \begin{bashcode} $> g++ -fopenmp ex1.c -o ex1 \end{bashcode}%$ \item Compilation using the Intel compiler: \begin{bashcode} - $> i++ -qopenmp ex1.c -o ex1 + $> icpc -qopenmp ex1.c -o ex1 \end{bashcode}%$ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Hello World in C++} \framesubtitle{Simple version} \centering \cxxfile[title={openmp/hello.cc}]{examples/openmp/hello.cc} \pause \begin{consoleoutput} - $ OMP_NUM_THREADS=4 ../build/openmp/hello + $ OMP_NUM_THREADS=4 ./openmp/hello Hello from thread 2 out of 4 Hello from thread 1 out of 4 Hello from thread 0 out of 4 Hello from thread 3 out of 4 \end{consoleoutput} %$ \end{frame} \begin{frame}[fragile] \frametitle{Hello World in C++} \framesubtitle{With condition compilation} \centering \cxxfile[% title={openmp/hello\_cond.cc}, minted options app={ firstline=6, }]{examples/openmp/hello_cond.cc} \end{frame} \begin{frame}[fragile] \frametitle{Number of concurrent threads} \begin{itemize} \item Default implementation dependent (usually max hardware thread) \item At runtime in the code \begin{cxxcode}{} omp_set_num_threads(nthreads); \end{cxxcode} \item With en environment variable \begin{bashcode} $> export OMP_NUM_THREADS=4 \end{bashcode}%$ \end{itemize} \end{frame} \subsection{The \texttt{parallel} construct} \begin{frame}[fragile] \frametitle{The \texttt{parallel} construct} This is the mother of all constructs in OpenMP. It starts a parallel execution. \begin{cxxcode}{Syntax} #pragma omp parallel [clause[[,] clause]...] 
{ |\emph{structured-block}| } \end{cxxcode} where \textit{clause} is one of the following: \begin{itemize} \item \code{if} or \code{num\_threads} : conditional clause - \item \code{default(private | firstprivate | shared | none)} : default data scoping + \item \code{default(private $\vert$ firstprivate $\vert$ shared $\vert$ none)} : default data scoping \item \code{private(\textit{list})}, \code{firstprivate(\textit{list})}, \code{shared(\textit{list})} or \code{copyin(\textit{list})} : data scoping \item \code{reduction(\textit{operator}$\,$:$\,$\textit{list})} \end{itemize} \end{frame} \begin{frame}[fragile,exercise] \frametitle{Hello $\pi$} \framesubtitle{} \begin{itemize} \item In the \texttt{pi.cc} add a function call to get the number of threads. \item Compile using the porper options for OpenMP \item Test that it works by varying the number of threads \code{export OMP\_NUM\_THREADS} \item To vary the number of threads in a \texttt{\bf sbatch} job you can set the number of threads to the number of cpus per task. 
\begin{bashcode} #!/bin/bash #SBATCH -c export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \end{bashcode} %$ \end{itemize} \end{frame} \subsection{worksharing constructs ("subsubsections", "single", "workshare")} \begin{frame}[fragile] \frametitle{Worksharing constructs} Work-sharing constructs are possible in three ``flavours'' : \begin{itemize} \item \code{sections} construct \item \code{single} construct \item \code{workshare} construct (only in Fortran) \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Worksharing constructs} \framesubtitle{The \code{sections} construct} \begin{cxxcode}{Syntax} #pragma omp [parallel] sections [clause] { #pragma omp section { code_block } } \end{cxxcode} where \textit{clause} is one of the following: \begin{itemize} \item \code{private(\textit{list})}, \code{firstprivate(\textit{list})}, \code{lastprivate(\textit{list})} \item \code{reduction(\textit{operator} : \textit{list})} \item Each \code{section} within a \code{sections} construct is assigned to one and only one thread \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{A \texttt{sections} construct} \framesubtitle{Example} \centering \cxxfile[% title={openmp/sections.cc}, minted options app={ firstline=6, lastline=14, }]{examples/openmp/sections.cc} \pause \begin{consoleoutput} - $ ../build/openmp/sections + $ ./openmp/sections Thread 0 handling section 1 Thread 1 handling section 2 Thread 2 handling section 3 \end{consoleoutput} %$ \end{frame} \begin{frame}[fragile] \frametitle{Worksharing constructs} \framesubtitle{The \code{single} construct} Only one thread (usualy the first entering thread) executes the \code{single} region. \begin{cxxcode}{Syntax} #pragma omp single [clause[[,] clause] ...] 
{ |\emph{structured-block}| } \end{cxxcode} where \textit{clause} is one of the following: \begin{itemize} \item \code{private(\textit{list})}, \code{firstprivate(\textit{list})} \item \code{nowait} \end{itemize} \end{frame} \begin{frame}[fragile] - \frametitle{The \texttt{master} construct} - \framesubtitle{Deprecated} + \frametitle{The \code{master} directive} + \framesubtitle{Deprecated for \code{masked}} Only the master thread execute the section. It can be used in any OpenMP construct \begin{cxxcode}{Syntax} #pragma omp master { |\emph{structured-block}| } \end{cxxcode} \end{frame} \subsection[Loops]{The Worksharing-loop construct} \begin{frame}[fragile] \frametitle{The Worksharing-loop construct} \framesubtitle{The \code{for} construct} Parallelization of the following loop \begin{cxxcode}{Syntax} #pragma omp for [clause[[,] clause] ... ] { |\emph{for-loop}| } \end{cxxcode} where \textit{clause} is one of the following: \begin{itemize} \item \code{schedule(\textit{kind[, chunk\_size]})} \item \code{collapse(\textit{n})} \item \code{ordered} \item \code{private(\textit{list})}, \code{firstprivate(\textit{list})}, \code{lastprivate(\textit{list})} % \item \code{reduction(\textit{operator}\,:\,\textit{list})} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Example of \code{for} construct} \centering \cxxfile[title={openmp/for.cc}, minted options app={ firstline=6, lastline=13, }]{examples/openmp/for.cc} \pause \begin{consoleoutput} - $ ../build/openmp/for + $ ./openmp/for Thread 0 handling i=0 Thread 0 handling i=1 Thread 0 handling i=2 Thread 1 handling i=3 Thread 1 handling i=4 Thread 1 handling i=5 \end{consoleoutput} %$ \end{frame} \begin{frame}[fragile,exercise] \frametitle{Compute $\pi$ in parallel} \framesubtitle{First try at parallel code} \begin{itemize} \item Add a \code{parallel for} work sharing construct around the integral computation \item Run the code \item Run the code \item Run the code \item What can you observe on the value of 
$\pi$ ? \end{itemize} \end{frame} \subsection{The Synchronization constructs} \begin{frame}[fragile] \frametitle{The Synchronization constructs} \framesubtitle{The \code{critical} construct} Restricts execution of the associated structured block to a single thread at a time. \begin{cxxcode}{Syntax} #pragma omp critical [(name) [[,] |\textbf{hint}|(hint-expression)]] { |\emph{structured-block}| } \end{cxxcode} \begin{itemize} \item \code{name} optional to identify the construct \item \code{hint-expression} information on the expected execution \begin{itemize} \item \cxxinline{omp_sync_hint_none} \item \cxxinline{omp_sync_hint_uncontended} \item \cxxinline{omp_sync_hint_contended} \item \cxxinline{omp_sync_hint_nonspeculative} \item \cxxinline{omp_sync_hint_speculative} \end{itemize} % \item \code{reduction(\textit{operator}\,:\,\textit{list})} \end{itemize} \end{frame} \begin{frame}[fragile,exercise] \frametitle{Compute $\pi$ in parallel} \framesubtitle{Naive reduction} \begin{itemize} \item To solve the raise condition from the previous exercise we can protect the computation of the sum. \item Add a \code{critical} directive to protect the sum \item Run the code \item What can you observe on the execution time while varying the number of threads \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{The Synchronization constructs} \framesubtitle{The \code{barrier} construct} Specifies an explicit barrier. \begin{cxxcode}{Syntax} #pragma omp barrier \end{cxxcode} \end{frame} \begin{frame}[fragile] \frametitle{The Synchronization constructs} \framesubtitle{The \code{atomic} construct} Ensures a specific storage location is accessed atomically. \begin{cxxcode}{Syntax} #pragma omp atomic [clause[[,] clause] ... 
] |\emph{statement}| \end{cxxcode} where \textit{clause} is one of the following: \begin{itemize} \item \emph{atomic-clauses} \code{read}, \code{write}, \code{update} \item \emph{memory-order-clauses} \code{seq\_cst}, \code{acq\_rel}, \code{releases}, \code{acquire}, \code{relaxed} \item or one of \code{capture}, \code{compare}, - \code{hint(hint-expression)}, \code{fail(seq\_cst | acquire | + \code{hint(hint-expression)}, \code{fail(seq\_cst $\vert$ acquire $\vert$ relaxed)}, or \code{weak} \end{itemize} \end{frame} \subsection{Data sharing clauses} \begin{frame}[fragile] \frametitle{Data sharing clauses} \framesubtitle{What are the variables values} \begin{itemize} - \item most common source of errors - \item determine which variables are \code{private} to a thread, which are - \code{shared} among all the threads - \item in case of a \code{private} variable + \item Most common source of errors + \item Determine which variables are \code{private} to a thread, which are + \code{shared} among all the threads + \item In case of a \code{private} variable the variable values can be defined using: \begin{itemize} \item \code{firstprivate} defines the value when entering the region \item \code{lastprivate} defines the value when exiting the region (OpenMP - 5.1 in C/C++) + 5.1 in C/C++) \end{itemize} - \item \code{default(none)} means each variables should appear in a shared or - private list + \item \code{default(private $\vert$ firstprivate $\vert$ shared $\vert$ none)} can be specified\\ + \code{default(none)} means each variables should appear in a + \code{shared} or \code{private} list \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Data sharing clauses} \framesubtitle{\code{shared} and \code{private}} These attributes determines the scope (visibility) of a single or list of variables \begin{cxxcode}{Syntax} - shared(list1) private(list2) + shared(list1), |\texttt{private}|(list2) \end{cxxcode} \begin{itemize} - \item The \code{private} attribute : the 
data is private to each thread and - non-initiatilized. Each thread has its own copy. \cxxinline{#pragma + \item The \code{private} clause: the data is private to each thread and + non-initialized. Each thread has its own copy. \cxxinline{#pragma omp parallel private(i)} - \item The \code{shared} attribute : the data is shared among all the + \item The \code{shared} clause: the data is shared among all the threads. It is accessible (and non-protected) by all the threads simultaneously. \cxxinline{#pragma omp parallel shared(array)} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Data sharing clauses} \framesubtitle{\code{firstprivate} and \code{lastprivate}} These clauses determines the attributes of the variables within a parallel region: \begin{cxxcode}{Syntax} - firstprivate(list1) lastprivate(list2) + firstprivate(list1), lastprivate(list2) \end{cxxcode} \begin{itemize} - \item The \code{firstprivate} like \code{private} but initialized to the value before the parallel region - \item The \code{lastprivate} like \code{private} but the value is updated after the parallel region + \item The \code{firstprivate} super-set of \code{private}, variable is + initialized to a copie of variable before the region + \item The \code{lastprivate} super-set of \code{private} the value of the + last thread exiting the region is copied \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Data sharing clauses} \framesubtitle{Example} \centering \cxxfile[% title={openmp/private.cc}, minted options app={ - firstline=5, - lastline=15, + firstline=8, + lastline=19, }]{examples/openmp/private.cc} \pause \begin{consoleoutput} - $ ../build/openmp/private - Variable a, b, c: 1, 2, 3 (before) - Variable a, b, c: 0, 2, 3 (inside) - Variable a, b, c: 32527, 2, 3 (inside) - Variable a, b, c: 0, 2, 10 (inside) - Variable a, b, c: 1, 2, 10 (after) + $ ./openmp/private + Thread 0 sees, a, b, c: 1, 2, 3 (before) + Thread 0 sees, a, b, c: 1839769744, 2, 3 (inside) + Thread 1 
sees, a, b, c: 12789424, 2, 3 (inside) + Thread 2 sees, a, b, c: 12801392, 2, -0.001 (inside) + Thread 0 sees, a, b, c: 1, 2, -0.001 (after) \end{consoleoutput} %$ \end{frame} \begin{frame}[fragile,exercise] \frametitle{Compute $\pi$ in parallel} \framesubtitle{Naive reduction improved} \begin{itemize} \item Create a local variable per thread \item Make each thread compute it's own sum \item After the computation of the integral us a \code{critical} directive to sum the local sum to a \code{shared} sum \end{itemize} \end{frame} +\subsection{Loop clauses} + \begin{frame}[fragile] \frametitle{Loop clauses} \framesubtitle{\code{reduction} clause} \begin{cxxcode}{Syntax} reduction(|\emph{reduction-identifier : list}|) \end{cxxcode} \begin{itemize} \item \emph{reduction-identifier}: one of the operation \textbf{+}, \textbf{--}, \textbf{*}, \textbf{\&}, \textbf{\textbar}, \textbf{\^}, \textbf{\&\&}, \textbf{\textbar \textbar} - \item \emph{list} item on which the reduction applies + \item \emph{list} item on which the reduction applies + \item example: \cxxinline{#pragma omp for reduction(+: sum)} \end{itemize} \end{frame} \begin{frame}[fragile,exercise] \frametitle{Compute $\pi$ in parallel} \framesubtitle{Naive reduction improved} \begin{itemize} \item Use the \code{reduction} clause \item Compare the timings to the previous versions \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Loop clauses} \framesubtitle{\code{schedule} clause} \begin{cxxcode}{Syntax} schedule([modifier [, modifier] : ] kind [, chunk_size]) \end{cxxcode} \begin{itemize} \item \emph{kind} \begin{itemize} \item \code{static} iterations divided in chunks sized \code{chunk\_size} assigned to threads in a round-robin fashion \item \code{dynamic} iterations divided in chunks sized \code{chunk\_size} assigned to threads when they request them until no chunk remains to be distributed \item \code{guided} iterations divided in chunks sized \code{chunk\_size} assigned to threads when they 
request them. Size of chunks is proportional to the remaining unassigned chunks. \item \code{auto} The decisions is delegated to the compiler and/or the runtime system \item \code{runtime} The decisions is delegated to the runtime system based on ICV \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Loop clauses} \framesubtitle{\code{collapse} clause} \begin{cxxcode}{Syntax} collapse(|\emph{n}|) \end{cxxcode} Specifies how many loop are combine into a logical space \end{frame} \begin{frame}[fragile] \frametitle{Example \code{dgemm}} \framesubtitle{collapse(1)} \centering \cxxfile[title={openmp/dgemm.cc}, minted options app={ firstline=34,lastline=38 }]{examples/openmp/dgemm.cc} \pause \begin{consoleoutput} $ OMP_NUM_THREADS=1 ../build/openmp/dgemm DGEMM with 1 threads, collapse(1): 21.1209 GFLOP/s (verif 2) $ OMP_NUM_THREADS=2 ../build/openmp/dgemm DGEMM with 2 threads, collapse(1): 40.2308 GFLOP/s (verif 2) $ OMP_NUM_THREADS=4 ../build/openmp/dgemm DGEMM with 4 threads, collapse(1): 72.7659 GFLOP/s (verif 2) $ OMP_NUM_THREADS=1 ../build/openmp/dgemm \end{consoleoutput} %$ \end{frame} \begin{frame}[fragile] \frametitle{Example \code{dgemm}} \framesubtitle{collapse(2)} \centering \begin{consoleoutput} DGEMM with 1 threads, collapse(2): 20.358 GFLOP/s (verif 2) $ OMP_NUM_THREADS=2 ../build/openmp/dgemm DGEMM with 2 threads, collapse(2): 40.0818 GFLOP/s (verif 2) $ OMP_NUM_THREADS=4 ../build/openmp/dgemm DGEMM with 4 threads, collapse(2): 72.4462 GFLOP/s (verif 2) \end{consoleoutput} %$ \end{frame} +\subsection{Advanced topics} + +\begin{frame}[fragile] + \frametitle{Advanced topics} + \framesubtitle{Idealized model vs reality (NUMA, Sub-NUMA Clusters, Cluster-on-Die)} + + \onslide<1>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_essential}{4.75cm}{1.2cm} + \onslide<2>\addimage[width=12cm]{\FIGREP/skylake_sp_2-way_3_upi}{2.7cm}{3cm} + \onslide<3>\addimage[width=12cm]{\FIGREP/skylake_(server)_half_rings}{2cm}{1cm} + + 
\onslide<4>\begin{tikzpicture}[overlay,remember picture] + \begin{scope}[shift={(current page.south west)}] + \draw[gray, thick] (9.1cm, 4.6cm) -- (11.2cm, 4.45cm); + \draw[gray, thick] (9.1cm, 5.5cm) -- (11.2cm, 6cm); + \end{scope} + \end{tikzpicture} + \onslide<4>\addimage[width=5.5cm]{\FIGREP/skylake_octadeca_core}{3.95cm}{1cm} + \onslide<4>\addimage[width=2cm]{\FIGREP/skylake_sp_core}{11.2cm}{4.4cm} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Implications of memory layout} + \framesubtitle{} + + + \addimage[width=6cm]{\FIGREP/skylake_sp_2-way_3_upi}{6cm}{6cm} + + \begin{itemize} + \item One thread can only saturate 1 channel + \item On memory bound code bandwidth saturate when \# of threads \~{} \# of channels + \item If memory allocated on the other processor memory, data go through CPU + interconnect (UPI $3 \times \SI{10.4}{\giga\transfer\per\second}$) + + \pause + \item How to mitigate this effects ? + \begin{itemize} + \item Loop schedule + \item Memory first touch + \item Thread placements + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Thread Affinity} + \framesubtitle{Thread Affinity Control} + + \begin{itemize} + \item The variable \cmd{OMP_PLACES} describes these places in terms of the available hardware. 
+ \item The variable \cmd{OMP_PROC_BIND} describes how threads are bound to OpenMP places + \item The variable \cmd{OMP_DISPLAY_AFFINITY} helps to debug the affinity + \end{itemize} + + \centering + \begin{consoleoutput} + $ OMP_NUM_THREADS=4 OMP_DISPLAY_AFFINITY=true ./openmp/hello + OMP: pid 2115280 tid 2115280 thread 0 bound to OS proc set {0-31} + OMP: pid 2115280 tid 2115285 thread 3 bound to OS proc set {0-31} + OMP: pid 2115280 tid 2115284 thread 2 bound to OS proc set {0-31} + Hello from thread 0 out of 4 + Hello from thread 3 out of 4 + OMP: pid 2115280 tid 2115283 thread 1 bound to OS proc set {0-31} + Hello from thread 1 out of 4 + Hello from thread 2 out of 4 + \end{consoleoutput} %$ +\end{frame} + +\begin{frame}[fragile] + \frametitle{Thread Affinity} + \framesubtitle{\cmd{OMP_PLACES}} + + Possible values for \cmd{OMP_PLACES} where each place corresponds to: + \begin{description} + \item [threads] a single hardware thread on the + device. + \item [cores] a single core (having one or more + hardware threads) on the device. + \item [ll\_caches] a set of cores that share the last + level cache on the device. + \item [numa\_domains] a set of cores for which their closest + memory on the device is: + \begin{itemize} + \item the same memory; and + \item at a similar distance from the cores. + \end{itemize} + \item [sockets] a single socket (consisting of one or + more cores) on the device. 
+ \end{description} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Thread Affinity} + \framesubtitle{\cmd{OMP_PROC_BIND}} + + Possible values for \cmd{OMP_PROC_BIND}: + \begin{description} + \item [false] threads not bonded + \item [true] threads are bonded (implementation dependant) + \item [primary] collocate threads with the primary thread + \item [close] place threads close to the master in the places list + \item [spread] spread out threads as much as possible + \end{description} +\end{frame} + +\begin{frame}[fragile] + \frametitle{First touch} + \framesubtitle{} + + \begin{itemize} + \item Memory is organized in pages + \item When allocating data ``nothing'' happens + \item Pages are allocated on the memory associated to the first thread initializing it + \end{itemize} + + \pause + + \begin{itemize} + \item To mitigate the problem, initialize the arrays in same order they are accessed + \end{itemize} +\end{frame} +\begin{frame}[fragile] + \frametitle{Data race, false-sharing} + \framesubtitle{} -% \subsubsection{tasking constucts ("task", "taskyield")} - -% \begin{frame}[containsverbatim] -% \frametitle{The \texttt{task} directive} -% \begin{center} -% {\input{day1/images/Fork_join.tex}} -% \end{center} -% Source : wikipedia.org -% \end{frame} - - -% \begin{frame}[containsverbatim] -% \frametitle{The \texttt{task} directive} - -% \begin{exampleblock}{What is an OpenMP task ?} -% \begin{itemize} -% \item{Offers a solution to parallelize irregular problems (unbounded loops, recursives, master/slave schemes, etc..)} -% \item{OpenMP tasks are composed of -% \begin{itemize} -% \item{\textbf{code} : what will be executed} -% \item{\textbf{data} : initialized at task creation time } -% \item{\textbf{ICV's} : Internal Control Variables } -% \end{itemize} -% } -% \end{itemize} -% \end{exampleblock} - -% \begin{exampleblock}{Synchronization} -% \begin{itemize} -% \item{All tasks created by a thread of a team are garanteed to be completed at -% thread exit (end of 
block)} -% \item{Within a task group, it is possible to synchronize through \texttt{\#pragma -% omp taskwait}} -% \end{itemize} -% \end{exampleblock} -% \end{frame} - - -% \begin{frame}[containsverbatim] -% \frametitle{The \texttt{task} directive} - -% \begin{exampleblock}{Syntax} -% \begin{lstlisting}[language=C,frame=lines] -% #pragma omp task [clause[[,] clause] ...] -% { -% structured-block -% } -% \end{lstlisting} -% where \textit{clause} is one of the following: -% \begin{itemize} -% \item{\code{if(\textit{scalar-logical-expression})}} -% \item{\code{final(\textit{scalar-logical-expression})}} -% \item{\code{untied}} -% \item{\code{default (private | firstprivate | shared | none)}} -% \item{\code{mergeable}} -% \item{\code{private(\textit{list})}, \code{firstprivate(\textit{list})}, \code{shared(\textit{list})}} -% \end{itemize} -% \end{exampleblock} -% \end{frame} - - - -% \begin{frame}[containsverbatim] -% \frametitle{The \texttt{task} directive} - -% \begin{exampleblock}{Execution model} -% \begin{itemize} -% \item{A task \texttt{t} is executed by the thread \texttt{T} of the team that generated it. 
Immediately or not (depends on the implementation)} -% \item{A thread \texttt{T} can suspend/resume/restart a task \texttt{t}} -% \item{Tasks are \textbf{tied} by default: -% \begin{itemize} -% \item{tied tasks are executed by the same thread} -% \item{tied tasks have scheduling restrictions (deterministic creation, synchronization, destruction)} -% \end{itemize} -% } -% \item{It is possible to untie tasks with the directive \texttt{untied}} -% \end{itemize} -% \end{exampleblock} - - -% \end{frame} - - -% \begin{frame}[containsverbatim] -% \frametitle{A \texttt{task} directive (stupid) example} - -% You probably know this popular example to compute the \texttt{n} first Fibonacci numbers (Fib = [1,1,2,3,5,8,13,21,34,..]): - -% \begin{columns} -% % \begin{column}[l]{7cm} -% \begin{column}{7cm} - -% \begin{lstlisting}[language=C,frame=lines] -% int fibonacci(int n) -% int x,y; -% if (n < 2) { -% return n; -% } else { -% x = fibonacci(n-1); -% y = fibonacci(n-2); -% return (x+y); -% } -% } -% \end{lstlisting} - -% \end{column} -% \begin{column}[c]{3cm} -% {\includegraphics[height=4cm]{day1/images/Fibonacci.jpg}} -% \end{column} -% \end{columns} - -% \end{frame} - -% \begin{frame}[containsverbatim] -% \frametitle{A \texttt{task} directive (stupid) example (Exercise)} - -% \begin{block}{Idea} -% The idea is using the {\tt \#pragma omp tasks} construct by spawning a new task whenever the recursive function is called. -% \begin{verbatim} -% ! start the following call in a task -% x=fibonacci(n-1) -% ! start the following call in a task -% y=fibonacci(n-2) -% ! a synchronization must be done here ! -% fibonacci = (x+y) -% \end{verbatim} - -% \end{block} - -% \begin{alertblock}{Warning: number of tasks and number of physical cores} -% Pay attention to the number of tasks with respect to the number of cores you have on the node (here 48). 
-% \end{alertblock} -% \end{frame} - - - -% \begin{frame}[containsverbatim] -% \frametitle{A \texttt{task} directive (stupid) example} -% \begin{lstlisting}[language=C,frame=lines] -% int fibonacci(int n){ -% int x,y; -% if (n < 2) { -% return n; -% } else { -% #pragma omp task shared(x) -% x = fibonacci(n-1); -% #pragma omp task shared(y) -% y = fibonacci(n-2); -% #pragma omp taskwait -% return (x+y); -% } -% } -% \end{lstlisting} -% \end{frame} - -% \begin{frame}[containsverbatim] -% \frametitle{A \texttt{task} directive example [output]} -% \begin{verbatim} -% vkeller@mathicsepc13:~$ ./ex12 45 -% \end{verbatim} - -% \begin{alertblock}{Warning !} -% Waaaaaaaaaaaaaaaaaaaay too long : too many tasks are created. -% \end{alertblock} - -% \end{frame} - -% \begin{frame}[containsverbatim] -% \frametitle{A \texttt{task} directive (stupid) example with cutoff} -% \begin{lstlisting}[language=C,frame=lines] -% int fibonacci(int n, int level, int cutoff){ -% int x,y; -% if (n < 2) { -% return n; -% } else if (level < cutoff) { -% #pragma omp task shared(x) -% x = fibonacci(n-1, level+1,cutoff); -% #pragma omp task shared(x) -% y = fibonacci(n-2, level+1,cutoff); -% #pragma omp taskwait -% return (x+y); -% } else { -% x = fibonacci(n-1); -% y = fibonacci(n-2); -% return (x+y); -% } -% } -% \end{lstlisting} -% \end{frame} - - -% \begin{frame}[containsverbatim] -% \frametitle{A \texttt{task} directive (stupid) example [output]} - -% \begin{table} -% \begin{center} -% \begin{tabular}{|l|l|l|l|l|l|} -% \hline -% \textbf{OMP\_NUM\_THREADS} & \textbf{1} & \textbf{2} & \textbf{4} & \textbf{8} & \textbf{16}\\ -% \hline -% \hline -% sequential & 27.4 & 27.4&27.4 &27.4 &27.4 \\ -% \hline -% without cutoff & 27.4 & $>>$60 & $>>$60 & $>>$60 & $>>$60 \\ -% \hline -% with cutoff (level=10) & 27.4 & 14.5 & 7.4 & 4.1 & 3.9 \\ -% \hline -% for loop & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ \\ -% \hline - -% \end{tabular} -% \end{center} -% \caption{\texttt{N=45}, time is in 
seconds} -% \end{table} - -% \begin{block}{Remark} -% We get a beautiful speedup by cutting off the tree search. But another simplier algorithm (for loop) performs by 6 orders of magnitude. -% \end{block} -% \end{frame} - - -% \begin{frame}[containsverbatim] -% \frametitle{Tasks remarks} - -% \begin{itemize} -% \item{targeted for many-cores co-processors (like Intel Phi)} -% \item{Can be used to solve non symetrical problems and algorithms} -% \item{Avoid them on a multi-core CPU} -% \end{itemize} - -% \end{frame} - - - -% \subsubsection[Master and Synchronization constructs]{Master and Synchronization constructs ("master", "critical", "barrier", "taskwait", "atomic", "flush", "ordered")} - - -% \begin{frame} -% \frametitle{Synchronization} -% \begin{block}{Synchronization constructs} -% Those directives are sometimes mandatory: -% \begin{itemize} -% \item{\texttt{master} : region is executed by the master thread only } -% \item{\texttt{critical} : region is executed by only one thread at a time } -% \item{\texttt{barrier} : all threads must reach this directive to continue} -% \item{\texttt{taskwait} : all tasks and childs must reach this directive to continue} -% \item{\texttt{atomic (read | write | update | capture)} : the associated storage location is accessed by only one thread/task at a time} -% \item{\texttt{flush} : this operation makes the thread's temporary view of memory consistent with the shared memory} -% \item{\texttt{ordered} : a structured block is executed in the order of the loop iterations } -% \end{itemize} -% \end{block} - -% \end{frame} - - -% % \subsubsection{Data environment \texttt{threadprivate}} -% % -% % \begin{frame}[containsverbatim] -% % \frametitle{The \texttt{threadprivate} directive} -% % -% % \begin{exampleblock}{Syntax} -% % The \texttt{threadprivate} directive specifies that variables are replicated, with each thread having its own copy. 
-% % \begin{lstlisting}[language=C,frame=lines] -% % #pragma omp threadprivate(list) -% % \end{lstlisting} -% % where \textit{list} is a comma-separated list of named variables and named common blocks -% % \end{exampleblock} -% % \end{frame} -% % -% % \begin{frame}[containsverbatim] -% % \frametitle{The \texttt{threadprivate} directive} -% % -% % \begin{exampleblock}{Syntax} -% % \begin{itemize} -% % \item{The directive must appear after the declaration of listed variables/common blocks} -% % \item{The values of data in the threadprivate variables of non-initial threads are guaranteed to persist between two consecutive active \texttt{parallel} regions if: -% % \begin{itemize} -% % \item{No nested \texttt{parallel} regions} -% % \item{Number of threads for both \texttt{parallel} regions is the same} -% % \item{\texttt{dyn-var} ICV is false for both \texttt{parallel} regions} -% % \end{itemize} -% % } -% % \item{A \texttt{threadprivate} variable is affected by a \texttt{copyin} clause if it appears in the list} -% % \item{A \texttt{threadprivate} variable is \textbf{NOT} affected by a \texttt{copyin} clause if it as the \texttt{allocatable} (not initially allocated) or the \texttt{pointer} (no initial association) attributes} -% % \end{itemize} -% % \end{exampleblock} -% % -% % \end{frame} - - -% % \begin{frame}[containsverbatim] -% % \frametitle{A \texttt{copyin} clause} -% % -% % \begin{exampleblock}{Properties} -% % \begin{itemize} -% % \item{The \texttt{copyin} clause provides a mechanism to copy the value of the master thread's \texttt{threadprivate} variable to the \texttt{threadprivate} variable of each other member of the team executing the \texttt{parallel}region. } -% % \item{If the original list item has the \texttt{POINTER} attribute, each copy receives the same association status of the master thread's copy as if by pointer assignment. 
} -% % \item{If the original list item does not have the \texttt{POINTER} attribute, each copy becomes defined with the value of the master thread's copy as if by intrinsic assignment, unless it has the allocation status of not currently allocated, in which case each copy will have the same status. } -% % \end{itemize} -% % \end{exampleblock} -% % \end{frame} - - - - -% % \begin{frame}[containsverbatim] -% % \frametitle{A \texttt{copyprivate} clause} -% % -% % \begin{exampleblock}{Properties} -% % \begin{itemize} -% % \item{The \texttt{copyprivate} clause provides a mechanism to use a private variable to broadcast a value from the data environment of one implicit task to the data environments of the other implicit tasks belonging to the \texttt{parallel} region.} -% % \item{To avoid race conditions, concurrent reads or updates of the list item must be synchronized with the update of the list item that occurs as a result of the \texttt{copyprivate} clause.} -% % \end{itemize} -% % \end{exampleblock} -% % \end{frame} - - - -% \subsubsection{Nesting} - -% \begin{frame} -% \frametitle{Nesting regions} - -% \begin{exampleblock}{Nesting} -% It is possible to include parallel regions in a parallel region (i.e. nesting) under restrictions (cf. sec. 2.10, p.111, \textit{OpenMP: Specifications ver. 3.1}) -% \end{exampleblock} - - -% \end{frame} - - - -% \subsection{Runtime Library routines} - -% \begin{frame} -% \frametitle{Runtime Library routines} -% \begin{exampleblock}{Usage} -% \begin{itemize} -% \item{The functions/subroutines are defined in the lib \texttt{libomp.so / libgomp.so}. 
Don't -% forget to include \texttt{\#include }} -% \item{These functions can be called anywhere in your programs} -% \end{itemize} -% \end{exampleblock} -% \end{frame} - -% \subsubsection{General purpose routines - selection} - -% % \begin{frame} -% % \frametitle{Runtime Library routines} -% % \framesubtitle{General purpose routines} -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{routine} & \textbf{behavior} \\ -% % \hline -% % \hline -% % \texttt{omp\_set\_num\_threads} & sets/gets number of threads to be used \\ -% % \texttt{omp\_get\_num\_threads} & for subsequent parallel regions that \\ -% % & do not specify a num\_threads clause \\ -% % & by setting the value of the first \\ -% % & element of the nthreads-var ICV of \\ -% % & the current task \\ -% % \hline -% % \texttt{omp\_get\_max\_threads} & -% % returns an upper bound on the number of \\ -% % & threads that could be used to form a \\ -% % & new team if a parallel region without a \\ -% % & num\_threads clause were encountered \\ -% % & after execution returns from this routine \\ -% % \hline -% % -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} - -% % \begin{frame} -% % \frametitle{Runtime Library routines} -% % \framesubtitle{General purpose routines} -% % -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{routine} & \textbf{behavior} \\ -% % \hline -% % \hline -% % \texttt{omp\_get\_thread\_num} & -% % returns the thread number, within the \\ -% % & current team, of the calling thread. \\ -% % \hline -% % \texttt{omp\_get\_num\_procs} & -% % returns the number of processors \\ -% % & available to the program. 
\\ -% % \hline -% % \texttt{omp\_in\_parallel} & -% % returns true if the call to the routine \\ -% % & is enclosed by an active parallel region \\ -% % & ; otherwise, it returns false \\ -% % \hline -% % \texttt{omp\_set\_dynamic} & -% % gets/sets the dynamic adjustment \\ -% % \texttt{omp\_get\_dynamic} & -% % of the number of threads available \\ -% % & for the execution of subsequent \\ -% % & parallel regions by getting/setting \\ -% % & the value of the dyn-var ICV. \\ -% % \hline -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} -% % -% % \begin{frame} -% % \frametitle{Runtime Library routines} -% % \framesubtitle{General purpose routines} - -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{routine} & \textbf{behavior} \\ -% % \hline -% % \hline -% % \texttt{omp\_set\_nested} & -% % gets/sets nested parallelism, \\ -% % \texttt{omp\_get\_nested} & -% % by getting/setting the nest-var ICV. \\ -% % \hline -% % \texttt{omp\_set\_schedule} & -% % gets/sets the schedule that is applied \\ -% % \texttt{omp\_get\_schedule} & -% % when runtime is used as schedule kind, \\ -% % & by getting/setting the value of \\ -% % & the run-sched-var ICV. \\ -% % \hline -% % \texttt{omp\_get\_thread\_limit} & -% % returns the maximum number of \\ -% % & OpenMP threads available to the \\ -% % & program. \\ -% % \hline -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} -% % -% % \begin{frame} -% % \frametitle{Runtime Library routines} -% % \framesubtitle{General purpose routines} - -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{routine} & \textbf{behavior} \\ -% % \hline -% % \hline -% % \texttt{omp\_set\_max\_active\_levels} & -% % limits the number of \\ -% % \texttt{omp\_get\_max\_active\_levels} & -% % nested active parallel \\ -% % & regions, by getting/setting \\ -% % & the max-active-levels-var ICV. 
\\ -% % \hline -% % \texttt{omp\_get\_level} & -% % returns the number of nested \\ -% % & parallel regions enclosing the \\ -% % & task that contains the call. \\ -% % \hline -% % \texttt{omp\_get\_ancestor\_thread\_num} & -% % returns, for a given nested \\ -% % & level of the current thread,\\ -% % & the thread number of the \\ -% % & ancestor or the current \\ -% % & thread \\ -% % \hline -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} -% % -% % \begin{frame} -% % \frametitle{Runtime Library routines} -% % \framesubtitle{General purpose routines} - -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{routine} & \textbf{behavior} \\ -% % \hline -% % \hline -% % \texttt{omp\_get\_team\_size} & -% % returns, for a given nested level \\ -% % & of the current thread, the size of \\ -% % & the thread team to which the ancestor \\ -% % & or the current thread belongs \\ -% % \hline -% % \texttt{omp\_get\_active\_level} & -% % returns the number of nested, active \\ -% % & parallel regions enclosing the task \\ -% % & that contains the call. \\ -% % \hline -% % \texttt{omp\_in\_final} & -% % returns true if the routine is executed \\ -% % & in a final task region; otherwise, \\ -% % & it returns false. \\ -% % \hline -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} - -% % \subsubsection{Lock routines} -% % -% % \begin{frame} -% % \frametitle{Runtime Library routines} -% % \framesubtitle{Lock routines} -% % -% % \begin{block}{Remark on lock routines} -% % The following routines are rarely used. 
They are mentioned for the sake of exhaustiveness
\hline -% \texttt{omp\_get\_wtime} & -% returns elapsed wall clock time in seconds. \\ -% \hline -% \texttt{omp\_get\_wtick} & -% returns the precision of the timer used by \\ -% & \texttt{omp\_get\_wtime} \\ -% \hline -% \end{tabular} -% \end{center} - -% \end{frame} - - - - - -% \subsection{Environment variables} - -% \begin{frame} -% \frametitle{Environment variables} -% \begin{exampleblock}{Usage} -% \begin{itemize} -% \item{Environment variables are used to set the ICVs variables} -% \item{under \texttt{csh} : \texttt{setenv OMP\_VARIABLE "its-value"}} -% \item{under \texttt{bash} : \texttt{export OMP\_VARIABLE="its-value"}} -% \end{itemize} -% \end{exampleblock} -% \end{frame} - -% \begin{frame} -% \frametitle{Environment variables} - -% \begin{center} -% \begin{tabular}{|l|l|} -% \hline -% \textbf{variable} & \textbf{what for ?} \\ -% \hline -% \hline -% \texttt{OMP\_SCHEDULE} -% & sets the run-sched-var ICV that specifies \\ -% & the runtime schedule type and chunk size. \\ -% & It can be set to any of the valid OpenMP \\ -% & schedule types. \\ -% \hline - -% \texttt{OMP\_NUM\_THREADS} -% & sets the nthreads-var ICV that specifies \\ -% & the number of threads to use in parallel \\ -% & regions \\ -% \hline - -% % \texttt{OMP\_DYNAMIC} -% % & sets the dyn-var ICV that specifies the \\ -% % & dynamic adjustment of threads to use for \\ -% % & parallel regions. 
\\ - -% \hline -% \end{tabular} -% \end{center} - -% \end{frame} - -% % \begin{frame} -% % \frametitle{Environment variables} -% % -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{variable} & \textbf{what for ?} \\ -% % \hline -% % \hline -% % -% % \texttt{OMP\_PROC\_BIND} -% % & sets the bind-var ICV that controls whether \\ -% % & threads are bound to processors \\ -% % -% % \hline -% % \texttt{OMP\_NESTED} -% % & sets the nest-var ICV that enables or disables \\ -% % & nested parallelism \\ -% % -% % \hline -% % \texttt{OMP\_STACKSIZE} -% % & sets the stacksize-var ICV that specifies \\ -% % & the size of the stack for threads created by \\ -% % & the OpenMP implementation. \\ -% % -% % \hline -% % \texttt{OMP\_WAIT\_POLICY} -% % & sets the wait-policy-var ICV that controls \\ -% % & the desired behavior of waiting threads. \\ -% % -% % \hline -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} - -% % \begin{frame} -% % \frametitle{Environment variables} -% % -% % \begin{center} -% % \begin{tabular}{|l|l|} -% % \hline -% % \textbf{variable} & \textbf{what for ?} \\ -% % \hline -% % \hline -% % \texttt{OMP\_MAX\_ACTIVE\_LEVELS} -% % & sets the max-active-levels-var ICV \\ -% % & that controls the maximum number of \\ -% % & nested active parallel regions. \\ -% % -% % \hline -% % \texttt{OMP\_THREAD\_LIMIT} -% % & sets the thread-limit-var ICV that \\ -% % & controls the maximum number of \\ -% % & threads participating in the OpenMP \\ -% % & program. 
\\ -% % -% % \hline -% % \end{tabular} -% % \end{center} -% % -% % \end{frame} - - -% \subsubsection{The apparent ``easiness'' of OpenMP} - -% \begin{frame} -% \frametitle{The apparent ``easiness'' of OpenMP} - -% \begin{block}{} -% \textit{``Compared to MPI, OpenMP is much easier''} -% \end{block} - -% \begin{exampleblock}{In the reality} -% \begin{itemize} -% \item{Parallelization of a non-appropriate algorithm} -% \item{Parallelization of an unoptimized code} -% \item{Race conditions in shared memory environment} -% \item{Memory coherence} -% \item{Compiler implementation of the OpenMP API} -% \item{(Much) more threads/tasks than your machine can support} -% \end{itemize} -% \end{exampleblock} - -% \end{frame} - - -% \subsection{About affinity} - - -% \begin{frame}[fragile] -% \frametitle{OpenMP Thread affinity} - -% \textbf{Affinity = on which core does my thread run ?} - -% \begin{block}{Show and set affinity with Intel executable} -% By setting the \verb+export KMP_AFFINITY=verbose,SCHEDULING+ you are able to see where the OS pin each thread -% \end{block} -% \begin{block}{Show and set affinity with GNU executable} -% By setting the \verb+export GOMP_CPU_AFFINITY=verbose,SCHEDULING+ you are able to see where the OS pin each thread -% \end{block} -% \end{frame} - - - -% \begin{frame}[containsverbatim] -% \frametitle{OpenMP Thread affinity with compact} -% \begingroup -% \fontsize{6pt}{12pt}\linespread{0.5}\selectfont -% \begin{verbatim} -% vkeller@mathicsepc13:~$ export KMP_AFFINITY=verbose,compact -% vkeller@mathicsepc13:~$ ./ex10 -% OMP: Info #204: KMP_AFFINITY: decoding x2APIC ids. 
-% OMP: Info #202: KMP_AFFINITY: Affinity capable, using global cpuid leaf 11 info -% OMP: Info #154: KMP_AFFINITY: Initial OS proc set respected: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15} -% OMP: Info #156: KMP_AFFINITY: 16 available OS procs -% OMP: Info #157: KMP_AFFINITY: Uniform topology -% OMP: Info #179: KMP_AFFINITY: 2 packages x 4 cores/pkg x 2 threads/core (8 total cores) -% OMP: Info #206: KMP_AFFINITY: OS proc to physical thread map: -% OMP: Info #171: KMP_AFFINITY: OS proc 0 maps to package 0 core 0 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 8 maps to package 0 core 0 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 1 maps to package 0 core 1 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 9 maps to package 0 core 1 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 2 maps to package 0 core 9 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 10 maps to package 0 core 9 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 3 maps to package 0 core 10 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 11 maps to package 0 core 10 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 4 maps to package 1 core 0 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 12 maps to package 1 core 0 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 5 maps to package 1 core 1 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 13 maps to package 1 core 1 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 6 maps to package 1 core 9 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 14 maps to package 1 core 9 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 7 maps to package 1 core 10 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 15 maps to package 1 core 10 thread 1 -% OMP: Info #144: KMP_AFFINITY: Threads may migrate across 1 innermost levels of machine -% OMP: Info #147: KMP_AFFINITY: Internal thread 0 bound to OS proc set {0,8} -% OMP: Info #147: KMP_AFFINITY: Internal thread 1 bound to OS proc set {0,8} -% OMP: Info #147: KMP_AFFINITY: Internal thread 2 bound to OS 
proc set {1,9} -% OMP: Info #147: KMP_AFFINITY: Internal thread 3 bound to OS proc set {1,9} -% [DGEMM] Compute time [s] : 0.344645023345947 -% [DGEMM] Performance [GF/s]: 0.580307233391397 -% [DGEMM] Verification : 2000000000.00000 -% \end{verbatim} -% \endgroup -% \end{frame} - - - -% \begin{frame}[containsverbatim] -% \frametitle{OpenMP Thread affinity with scatter} -% \begingroup -% \fontsize{6pt}{12pt}\linespread{0.5}\selectfont -% \begin{verbatim} -% vkeller@mathicsepc13:~$ export KMP_AFFINITY=verbose,scatter -% vkeller@mathicsepc13:~$ ./ex10 -% OMP: Info #204: KMP_AFFINITY: decoding x2APIC ids. -% OMP: Info #202: KMP_AFFINITY: Affinity capable, using global cpuid leaf 11 info -% OMP: Info #154: KMP_AFFINITY: Initial OS proc set respected: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15} -% OMP: Info #156: KMP_AFFINITY: 16 available OS procs -% OMP: Info #157: KMP_AFFINITY: Uniform topology -% OMP: Info #179: KMP_AFFINITY: 2 packages x 4 cores/pkg x 2 threads/core (8 total cores) -% OMP: Info #206: KMP_AFFINITY: OS proc to physical thread map: -% OMP: Info #171: KMP_AFFINITY: OS proc 0 maps to package 0 core 0 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 8 maps to package 0 core 0 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 1 maps to package 0 core 1 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 9 maps to package 0 core 1 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 2 maps to package 0 core 9 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 10 maps to package 0 core 9 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 3 maps to package 0 core 10 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 11 maps to package 0 core 10 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 4 maps to package 1 core 0 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 12 maps to package 1 core 0 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 5 maps to package 1 core 1 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 13 maps to package 1 core 1 thread 1 -% OMP: Info 
#171: KMP_AFFINITY: OS proc 6 maps to package 1 core 9 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 14 maps to package 1 core 9 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 7 maps to package 1 core 10 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 15 maps to package 1 core 10 thread 1 -% OMP: Info #144: KMP_AFFINITY: Threads may migrate across 1 innermost levels of machine -% OMP: Info #147: KMP_AFFINITY: Internal thread 0 bound to OS proc set {0,8} -% OMP: Info #147: KMP_AFFINITY: Internal thread 1 bound to OS proc set {4,12} -% OMP: Info #147: KMP_AFFINITY: Internal thread 2 bound to OS proc set {1,9} -% OMP: Info #147: KMP_AFFINITY: Internal thread 3 bound to OS proc set {5,13} -% [DGEMM] Compute time [s] : 0.204235076904297 -% [DGEMM] Performance [GF/s]: 0.979263714301724 -% [DGEMM] Verification : 2000000000.00000 -% \end{verbatim} -% \endgroup -% \end{frame} - - -% \begin{frame}[containsverbatim] -% \frametitle{OpenMP Thread affinity with explicit (a kind of pining)} -% \begingroup -% \fontsize{6pt}{12pt}\linespread{0.5}\selectfont -% \begin{verbatim} -% vkeller@mathicsepc13:~$ export KMP_AFFINITY='proclist=[0,2,4,6],explicit',verbose -% vkeller@mathicsepc13:~$ ./ex10 -% OMP: Info #204: KMP_AFFINITY: decoding x2APIC ids. 
-% OMP: Info #202: KMP_AFFINITY: Affinity capable, using global cpuid leaf 11 info -% OMP: Info #154: KMP_AFFINITY: Initial OS proc set respected: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15} -% OMP: Info #156: KMP_AFFINITY: 16 available OS procs -% OMP: Info #157: KMP_AFFINITY: Uniform topology -% OMP: Info #179: KMP_AFFINITY: 2 packages x 4 cores/pkg x 2 threads/core (8 total cores) -% OMP: Info #206: KMP_AFFINITY: OS proc to physical thread map: -% OMP: Info #171: KMP_AFFINITY: OS proc 0 maps to package 0 core 0 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 8 maps to package 0 core 0 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 1 maps to package 0 core 1 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 9 maps to package 0 core 1 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 2 maps to package 0 core 9 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 10 maps to package 0 core 9 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 3 maps to package 0 core 10 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 11 maps to package 0 core 10 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 4 maps to package 1 core 0 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 12 maps to package 1 core 0 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 5 maps to package 1 core 1 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 13 maps to package 1 core 1 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 6 maps to package 1 core 9 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 14 maps to package 1 core 9 thread 1 -% OMP: Info #171: KMP_AFFINITY: OS proc 7 maps to package 1 core 10 thread 0 -% OMP: Info #171: KMP_AFFINITY: OS proc 15 maps to package 1 core 10 thread 1 -% OMP: Info #144: KMP_AFFINITY: Threads may migrate across 1 innermost levels of machine -% OMP: Info #147: KMP_AFFINITY: Internal thread 0 bound to OS proc set {0,8} -% OMP: Info #147: KMP_AFFINITY: Internal thread 3 bound to OS proc set {6,14} -% OMP: Info #147: KMP_AFFINITY: Internal thread 1 bound to OS 
proc set {2,10} -% OMP: Info #147: KMP_AFFINITY: Internal thread 2 bound to OS proc set {4,12} -% [DGEMM] Compute time [s] : 0.248908042907715 -% [DGEMM] Performance [GF/s]: 0.803509591990774 -% [DGEMM] Verification : 2000000000.00000 -% \end{verbatim} -% \endgroup -% \end{frame} - - - - - - -% \subsubsection{``OpenMP-ization'' strategy} - - -% \begin{frame} -% \frametitle{``OpenMP-ization'' strategy} - -% \begin{itemize} -% \item {\textbf{STEP 1} : Optimize the sequential version: -% \begin{itemize} -% \item {Choose the best algorithm} -% \item {``Help the (right) compiler''} -% \item {Use the existing optimized scientific libraries} -% \end{itemize} -% } -% \item {\textbf{STEP 2} : Parallelize it: -% \begin{itemize} -% \item {Identify the bottlenecks (heavy loops)} -% \item {``auto-parallelization'' is rarely the best !} -% \end{itemize} -% } -% \end{itemize} - -% \begin{alertblock}{Goal} -% Debugging - Profiling - Optimization cycle. Then parallelization ! -% \end{alertblock} - - -% \end{frame} - -% \begin{frame} -% \frametitle{Tricks and tips} - -% \begin{block}{} -% \begin{itemize} -% \item{\textbf{Algorithm}: choose the ``best'' one} -% % \item{\textbf{Implementation}: choose the best one (remember Fibonacci)} -% \item{\textbf{cc-NUMA}: no (real) support from OpenMP side (but OS). A multi-CPU machine is not a real shared memory architecture} -% \item{\textbf{False-sharing}: multiple threads write in the same cache line} -% \item{\textbf{Avoid barrier}. This is trivial. Bus sometimes you can't} -% \item{\textbf{Small number of tasks}. Try to reduce the number of forked tasks} -% \item{\textbf{Asymetrical problem}. 
OpenMP is well suited for symmetrical problems, even if tasks can help
\end{itemize} +\end{frame} + + +\subsection{Runtime routines} + +\begin{frame}[fragile] + \frametitle{Runtime routines} + \framesubtitle{} + + Sub set of the routines in OpenMP + \begin{itemize} + \item \cxxinline{omp_get_num_threads()}: number of threads in the current region + \item \cxxinline{omp_get_thread_num()}: id of the current thread + \item \cxxinline{omp_get_max_threads()}: upper bound to the number of + threads that could be used + \item \cxxinline{omp_get_wtime()}: wall clock time in seconds + \item \cxxinline{omp_get_wtick()}: seconds between successive clock ticks + \end{itemize} +\end{frame} + +\begin{frame}[fragile,exercise] + \frametitle{Parallelize the Poisson code using OpenMP} + + \begin{itemize} + \item Now you can apply what you learn to the \texttt{poisson} code. + \item Remember that 90\% of the time is spend in the dumpers. So make sure you + dump only once at the end of the simulation to get a validation image. + \end{itemize} +\end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: % LocalWords: OpenMP, Fortran, preprocessor diff --git a/src/optimization/optimization.tex b/src/optimization/optimization.tex index f3e4fad..cecb032 100644 --- a/src/optimization/optimization.tex +++ b/src/optimization/optimization.tex @@ -1,279 +1,279 @@ \renewcommand{\FIGREP}{src/optimization/figures} \section{Single-core optimization} \label{sec:optimization} \intersec{deneb} \begin{frame} \frametitle{Single-core optimization} \framesubtitle{Goal of this section} \begin{itemize} \item Better grasp how programming can influence performance \item We first review some basic optimization principles to keep in mind \item Deeper understanding of the working principles of the CPU \begin{itemize} \item How data transfers are handled \item Concept of vectorization \end{itemize} \end{itemize} \end{frame} \subsection{Basic optimization concepts} \label{sec:basic_optimization} \begin{frame}[t,fragile] 
\frametitle{Single-core optimization} \framesubtitle{Basic optimization techniques} \begin{itemize} \item Often, very simple changes to the code lead to significant performance improvements \item The following may seem trivial, but you would be surprised how often they could be used in scientific codes \item The main problem is that we often make a one-to-one mapping between the equations and the algorithm \end{itemize} \vfill \textbf{Do less work}\\ \begin{minipage}[t]{0.475\linewidth} \begin{cxxcode}{} for (int i = 0; i < N; ++i) { a[i] = (alpha + sin(x)) * b[i]; } \end{cxxcode} \end{minipage} \hfill \begin{minipage}[t]{0.475\linewidth} \begin{cxxcode}{} double tmp = alpha + sin(x); for (int i = 0; i < N; ++i) { a[i] = tmp * b[i]; } \end{cxxcode} \end{minipage} \begin{itemize} \item Constant term is re-computed at every iteration of the loop \item Can be taken out of the loop and computed once \end{itemize} \end{frame} \note{ \begin{itemize} \item In very simple cases like here, the compiler is smart enough to do it for you \item The main point is that the compiler will do most of the optimization job. Our goal is to write code that expresses our intention in a clear way so that the compiler can optimize it. 
\end{itemize} } \begin{frame}[t,fragile] \frametitle{Single-core optimization} \framesubtitle{Basic optimization techniques} \textbf{Avoid branches}\\ \begin{minipage}[t]{0.475\linewidth} \begin{cxxcode}{} for (i = 0; i < N; ++i) { for (j = 0; j < N; ++j) { if (j >= i) { sign = 1.0; } else { sign = -1.0; } b[j] = sign * a[i][j]; } } \end{cxxcode} \end{minipage} \hfill \begin{minipage}[t]{0.475\linewidth} \begin{cxxcode}{} for (i = 0; i < N; ++i) { for (j = i; j < N; ++j) { b[j] = a[i][j]; } for (j = 0; j < i; ++j) { b[j] = -a[i][j]; } } \end{cxxcode} \end{minipage} \begin{itemize} \item Avoid conditional branches in loops \item They can often be written differently or taken out of the loop \end{itemize} \end{frame} \subsection{Memory hierarchy} \label{sec:memory_hierarchy} \begin{frame} \frametitle{Single-core optimization} \framesubtitle{Tale of a smart librarian} \begin{itemize} \item To better understand the concepts behind caching, let's take the example of a librarian \item The first customer enters and asks for a book. The librarian goes into the huge storeroom and returns with the book when he finds it \item After some time, the client returns the book and the librarian puts it back into the storeroom \item A second customer enters and asks for the same book... \item This workflow can take a lot of time depending on how much customers want to read the same book \end{itemize} \end{frame} \begin{frame} \frametitle{Single-core optimization} \framesubtitle{Tale of a smart librarian} \begin{itemize} \item Our librarian is a bit lazy, but clever. Since a lot of customers ask for the same book, he decides to put a small shelf behind his desk to temporarily store the books he retrieves. \item This way he can quickly grab the book instead of going to the storeroom. \item When a customer asks for a book, he will first look on his shelf. If he finds the book, it's a \textit{cache hit} and he returns it to the customer. 
If not, it's a \textit{cache miss} and he must go back in the storeroom. \item This is a very clever system, especially if there is \textit{temporal locality}, i.e. if the customers often ask for the same books. \item Can he do better ? \end{itemize} \end{frame} \begin{frame} \frametitle{Single-core optimization} \framesubtitle{Tale of a smart librarian} \begin{itemize} \item Oftentimes, our librarian see that people taking one book will go back and ask for the sequels of the book \item He decides to change a bit his workflow. Now, when he goes into the storeroom to retrieve a book, he comes back with a few of them, all on the same shelf \item This way, when the customer brings back a book and asks for the sequel, it is already present on the librarian shelf \item This workflow works well when there is \textit{spatial locality}, i.e. when you ask for a book there is a significant chance that you will read the sequel \end{itemize} \end{frame} \begin{frame} \frametitle{Single-core optimization} \framesubtitle{Data loading} \begin{itemize} \item Now, what is the link between our librarian and the CPU? They work in a similar fashion! \item When a load instruction is issued the L1 cache logic checks if data is already present. If yes, this is a \textit{cache hit} and data can be retrieved very quickly. If no, this is a \textit{cache miss} and the next memory levels are checked. 
\item If the data is nowhere to be found, then it is loaded from the main memory \item As for our librarian, not only the required data is loaded for each cache miss, but a whole \textit{cache line} \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Single-core optimization} \framesubtitle{Example: vector multiplication with a scalar} \begin{itemize} \item Simple vector/scalar multiplication \item Focus on data loading (\code{b[i]}) \item Assume only one level of cache with a cache line of two doubles (16 bytes) \end{itemize} \begin{cxxcode}{} for (int i = 0; i < N; ++i) { a[i] = alpha * b[i]; } \end{cxxcode} - \onslide<1>\addimage{\FIGREP/data_loading_1}{7cm}{5cm}{0.5cm} - \onslide<2>\addimage{\FIGREP/data_loading_2}{7cm}{5cm}{0.5cm} - \onslide<3>\addimage{\FIGREP/data_loading_3}{7cm}{5cm}{0.5cm} - \onslide<4>\addimage{\FIGREP/data_loading_4}{7cm}{5cm}{0.5cm} - \onslide<5>\addimage{\FIGREP/data_loading_5}{7cm}{5cm}{0.5cm} - \onslide<6>\addimage{\FIGREP/data_loading_6}{7cm}{5cm}{0.5cm} + \onslide<1>\addimage[width=7cm]{\FIGREP/data_loading_1}{5cm}{0.5cm} + \onslide<2>\addimage[width=7cm]{\FIGREP/data_loading_2}{5cm}{0.5cm} + \onslide<3>\addimage[width=7cm]{\FIGREP/data_loading_3}{5cm}{0.5cm} + \onslide<4>\addimage[width=7cm]{\FIGREP/data_loading_4}{5cm}{0.5cm} + \onslide<5>\addimage[width=7cm]{\FIGREP/data_loading_5}{5cm}{0.5cm} + \onslide<6>\addimage[width=7cm]{\FIGREP/data_loading_6}{5cm}{0.5cm} \end{frame} \begin{frame}[t] \frametitle{Single-core optimization} \framesubtitle{Memory layout and data access} \begin{itemize} \item How do we store ND arrays into memory? \item Memory is a linear storage. Arrays are stored contiguously, one element after the other. \item We have to choose a convention. Row major (C/C++) or column major (Fortran). \item Row major means that elements are stored contiguously according to the last index of the array. In column-major order, they are stored according to the first index. 
\end{itemize} - \onslide<1>\addimage{\FIGREP/row_major_1}{6cm}{5cm}{0.5cm} - \onslide<2>\addimage{\FIGREP/row_major_2}{6cm}{5cm}{0.5cm} + \onslide<1>\addimage[width=6cm]{\FIGREP/row_major_1}{5cm}{0.5cm} + \onslide<2>\addimage[width=6cm]{\FIGREP/row_major_2}{5cm}{0.5cm} \end{frame} \begin{frame}[t,fragile] \frametitle{Single-core optimization} \framesubtitle{Example: matrix/vector multiplication} \begin{itemize} \item Focus on data loading (\code{a[i][j]}) \item Assume only one level of cache with a cache line of two doubles (16 bytes) \end{itemize} \begin{cxxcode}{} for (int j = 0; j < N; ++j) { for (int i = 0; i < N; ++i) { c[i] += a[i][j] * b[j]; } } \end{cxxcode} - \onslide<1>\addimage{\FIGREP/matrix_vector_1}{7cm}{5cm}{0.5cm} - \onslide<2>\addimage{\FIGREP/matrix_vector_2}{7cm}{5cm}{0.5cm} - \onslide<3>\addimage{\FIGREP/matrix_vector_3}{7cm}{5cm}{0.5cm} - \onslide<4>\addimage{\FIGREP/matrix_vector_4}{7cm}{5cm}{0.5cm} - \onslide<5>\addimage{\FIGREP/matrix_vector_5}{7cm}{5cm}{0.5cm} - \onslide<6>\addimage{\FIGREP/matrix_vector_6}{7cm}{5cm}{0.5cm} - \onslide<7>\addimage{\FIGREP/matrix_vector_7}{7cm}{5cm}{0.5cm} + \onslide<1>\addimage[width=7cm]{\FIGREP/matrix_vector_1}{5cm}{0.5cm} + \onslide<2>\addimage[width=7cm]{\FIGREP/matrix_vector_2}{5cm}{0.5cm} + \onslide<3>\addimage[width=7cm]{\FIGREP/matrix_vector_3}{5cm}{0.5cm} + \onslide<4>\addimage[width=7cm]{\FIGREP/matrix_vector_4}{5cm}{0.5cm} + \onslide<5>\addimage[width=7cm]{\FIGREP/matrix_vector_5}{5cm}{0.5cm} + \onslide<6>\addimage[width=7cm]{\FIGREP/matrix_vector_6}{5cm}{0.5cm} + \onslide<7>\addimage[width=7cm]{\FIGREP/matrix_vector_7}{5cm}{0.5cm} \vspace{5cm} \begin{itemize} \item<4> Non contiguous data accesses are detrimental for performance! 
\end{itemize} \end{frame} \begin{frame}[t] \frametitle{Single-core optimization} \framesubtitle{Early conclusions} \begin{itemize} \item Caches are small, but very fast memories \item Their purpose is to alleviate long latency and limited bandwidth of the RAM \item Data is fetched by group, called cache line, and stored into the different levels of cache \item In order to fully exploit caches, data in caches must be re-used as much as possible \end{itemize} \vfill \begin{itemize} \item Avoid random memory accesses that case many cache misses and prefer contiguous access \item Be careful of the data types you use and how they are mapped onto memory \end{itemize} \end{frame} \subsection{Single Instruction Multiple Data} \label{sec:simd} \begin{frame}[t] \frametitle{Single-core optimization} \framesubtitle{Single Instruction Multiple Data} \begin{itemize} \item Modern CPUs can apply the same operation to multiple data \item Special registers \cmd{xmm}, \cmd{ymm} and \cmd{zmm} holding 2, 4 or 8 doubles \end{itemize} - \onslide<1>\addimage{\FIGREP/vectorization_1}{7cm}{5cm}{1.2cm} - \onslide<2>\addimage{\FIGREP/vectorization_2}{7cm}{5cm}{1.2cm} + \onslide<1>\addimage[width=7cm]{\FIGREP/vectorization_1}{5cm}{1.2cm} + \onslide<2>\addimage[width=7cm]{\FIGREP/vectorization_2}{5cm}{1.2cm} \end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: diff --git a/src/performance_measurement/performance_measurement.tex b/src/performance_measurement/performance_measurement.tex index 7fbdeb3..83501b1 100644 --- a/src/performance_measurement/performance_measurement.tex +++ b/src/performance_measurement/performance_measurement.tex @@ -1,597 +1,597 @@ \renewcommand{\FIGREP}{src/performance_measurement/figures} \section{Performance measurement} \label{sec:performance_measurement} \intersec{helvetios} \begin{frame} \frametitle{Goal of this section} \framesubtitle{} \begin{itemize} \item Key concepts to quantify performance 
\begin{itemize} \item Metrics \item Scalings, speedup, efficiency \end{itemize} \item Roofline model \item Using a profiler \end{itemize} \end{frame} \subsection{Performance metrics} \label{sec:metrics} \begin{frame} \frametitle{Performance metrics} \framesubtitle{} \begin{itemize} \item How can we quantify performance? \item We need to define a means to measure it \item We will focus on the most interesting metrics for HPC \end{itemize} \vfill \pause \begin{itemize} \item The first that comes in mind is \textit{time}, e.g. time-to-solution \item Derived metrics: speedup and efficiency \end{itemize} \vfill \pause \begin{itemize} \item Scientific codes do computations on floating point numbers \item A second metric is the number of \textit{floating-point operations per second} (\si{\flops}) \end{itemize} \vfill \pause \begin{itemize} \item Finally, the \textit{memory bandwidth} indicates how much data does your code transfers per unit of time \end{itemize} \end{frame} \note{ \begin{itemize} \item My code is super fast, it runs in $2.5\si{\ns}$! \item It seems fast, but is it? How fast can your hardware go? 
\item To really understand how much your code exploits the hardware, we use the \si{\flops} and memory BW \item Your hardware has theoretical maximum values for those \item You can compare the values from your code to the max to see how well you use the hardware \end{itemize} } \subsection{Scalings, speedup and efficiency} \label{sec:scalings} \begin{frame} \frametitle{Speedup and efficiency} \framesubtitle{} \begin{itemize} \item Two important metrics are derived from timings \item Compare timings with $n$ processes, $T_{n}$, against the reference timing, $T_\text{ref}$ \end{itemize} \vfill \begin{minipage}{0.3\linewidth} \begin{center} \textbf{Speedup} \end{center} \begin{equation*} S(n) = \frac{T_\text{ref}}{T_{n}} \end{equation*} \end{minipage} \hspace{0.5cm} \begin{minipage}{0.3\linewidth} \begin{center} \textbf{Efficiency} \end{center} \begin{equation*} E(n) = \frac{S(n)}{n} \end{equation*} \end{minipage} \vfill \begin{itemize} \item We want $S(n)$ as close to $n$ and $E(n)$ as close to 1 (100\%) as possible \end{itemize} \end{frame} \begin{frame}[t] \frametitle{Strong scaling} \framesubtitle{} \begin{itemize} \item Scalings are a way to assess how well a program performs when adding computational resources \item Strong scaling: add resources, keep total amount of work constant \begin{equation*} S(n) = \frac{T_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{nT_{n}} \end{equation*} \item Strong scaling is an indication of how profitable it is to add resources to solve your problem \end{itemize} - \addimage{\FIGREP/strong_scaling}{6cm}{5cm}{1cm} + \addimage[width=6cm]{\FIGREP/strong_scaling}{5cm}{1cm} \end{frame} \begin{frame}[t] \frametitle{Weak scaling} \framesubtitle{} \begin{itemize} \item Weak scaling: add resources and maintain amount of work per resource constant \begin{equation*} S(n) = \frac{nT_{1}}{T_{n}}, \qquad E(n) = \frac{S(n)}{n} = \frac{T_{1}}{T_{n}} \end{equation*} \item Weak scalings are an indication of how well your code will
perform on a bigger machine (and with a bigger problem) \item These scalings are always required for a proposal \begin{itemize} \item For strong scalings the metric is speedup (how do I improve performance) \item For weak scalings the metric is efficiency (how well performance is kept) \end{itemize} \end{itemize} - \addimage{\FIGREP/weak_scaling}{6cm}{5cm}{1cm} + \addimage[width=6cm]{\FIGREP/weak_scaling}{5cm}{1cm} \end{frame} \subsection{Amdahl's law} \label{sec:amdahl} \begin{frame}[t] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Amdahl's law gives you an upper bound to the achievable speedup for a fixed problem size \item By definition it is a strong scaling analysis \vfill \pause \item Assume a fraction $p$ of your code is (perfectly) parallel and timing with 1 process is $T_{1}$ \item Timing with $n$ processes is \begin{equation*} T_{n} = (1-p) T_{1} + \frac{p}{n}T_{1} = \left[ (1-p) + \frac{p}{n}\right] T_{1} \end{equation*} \pause \item Speedup becomes \begin{equation*} S(n) = \frac{T_{1}}{T_{n}} = \frac{1}{(1-p) + \frac{p}{n}} \end{equation*} \vfill \pause \item In the limit of infinite resources \begin{equation*} \lim_{n\rightarrow\infty}S(n) = \frac{1}{1-p} \end{equation*} \end{itemize} - \onslide<2->\addimage{\FIGREP/amdahl_illustration}{3cm}{12.5cm}{1.0cm} + \onslide<2->\addimage[width=3cm]{\FIGREP/amdahl_illustration}{12.5cm}{1.0cm} \end{frame} \begin{frame}[b] \frametitle{Amdahl's law} \framesubtitle{} \begin{itemize} \item Limited by the serial part (very sensitive)! \item Does this mean we cannot exploit large HPC machines? 
\pause \item No, in general with more resources, we simulate larger systems $\Rightarrow$ weak scaling (see \href{https://en.wikipedia.org/wiki/Gustafson\%27s_law}{Gustafson's law}) \end{itemize} - \onslide<1->\addimage{\FIGREP/amdahl_speedup}{8.cm}{4cm}{2cm} + \onslide<1->\addimage[width=8.cm]{\FIGREP/amdahl_speedup}{4cm}{2cm} \end{frame} \begin{frame} \frametitle{\si{\flops} and memory bandwidth} \framesubtitle{} \begin{itemize} \item How to measure \si{\flops}? \begin{itemize} \item By hand, dividing the number of operations by the running time \item Using tools such as PAPI, Tau, likwid, Intel Amplxe, etc. \end{itemize} \end{itemize} \vfill \begin{itemize} \item Memory bandwidth measures the amount of data transferred per unit of time [\si{\byte\per\second}, \si{\kibi\byte\per\second}, \si{\mebi\byte\per\second}, \si{\gibi\byte\per\second}, ...] \item How to measure it? \begin{itemize} \item By hand dividing the amount of data transferred by the running time \item Using tools such as STREAM, PAPI, Tau, Intel Amplxe, etc \end{itemize} \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Performance measurement} \framesubtitle{A simple DAXPY example} \begin{itemize} \item Assume Intel Xeon Gold 6132 (Gacrux) \end{itemize} \cxxfile[% - title={optimization/daxpy.cpp}, + title={optimization/daxpy.cc}, minted options app={ % highlightlines={2, 7}, firstline=25, lastline=27, firstnumber=1, - }]{examples/optimization/daxpy.cpp} + }]{examples/optimization/daxpy.cc} \begin{itemize} \item My code runs in \SI{174.25}{\ms}. It is amazingly fast! \end{itemize} \pause \vfill \begin{itemize} \item Each iteration has 2 FLOP (1 add and 1 mul) and there are \cmd{N = 1e8} iterations \item Our hardware can achieve a theoretical peak performance of $\SI{1.16}{\tera\flops}$ \item Our code achieves $\SI{2d8}{\flop} / \SI{174.25d-3}{\second} = \SI{0.001}{\tera\flops}$...
\end{itemize} \end{frame} \subsection{Roofline model} \label{sec:roofline} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} \item How well am I exploiting the hardware resources? \item The roofline model is a performance model that provides an estimate of the answer to this question \end{itemize} \vspace{1cm} \pause \begin{itemize} \item Key concept: the arithmetic intensity, $AI$, of an algorithm is \# \si{\flop\per\byte} of data transferred \item It measures data reuse \end{itemize} - \addimage{\FIGREP/ai}{8.cm}{4cm}{0.5cm} + \addimage[width=8.cm]{\FIGREP/ai}{4cm}{0.5cm} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} \item Roofline model is plotted on \textbf{log-log scale} \begin{itemize} \item x-axis is the $AI$ \item y-axis is \si{\flops} \end{itemize} \pause \item The hardware limits are defined by \begin{equation*} P = \min(P_{\text{max}}, b_{s} \cdot AI) \end{equation*} \begin{itemize} \item $P_{\text{max}}$ is the CPU peak \si{\flops} \item $AI$ is the intensity \item $b_{s}$ is the memory BW \end{itemize} \end{itemize} - \onslide<1>\addimage{\FIGREP/roofline_1}{5cm}{5.5cm}{0.5cm} - \onslide<2>\addimage{\FIGREP/roofline_2}{5cm}{5.5cm}{0.5cm} - \onslide<3>\addimage{\FIGREP/roofline_3}{5cm}{5.5cm}{0.5cm} + \onslide<1>\addimage[width=5cm]{\FIGREP/roofline_1}{5.5cm}{0.5cm} + \onslide<2>\addimage[width=5cm]{\FIGREP/roofline_2}{5.5cm}{0.5cm} + \onslide<3>\addimage[width=5cm]{\FIGREP/roofline_3}{5.5cm}{0.5cm} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{} \begin{itemize} \item Refinements can be made to the Roofline model \item Adding a memory hierarchy with caches \item Adding different levels of DLP (Data-Level parallelism) \item They give you hints on what to optimize for \end{itemize} - \addimage{\FIGREP/roofline_extended}{7cm}{4.5cm}{0.5cm} + \addimage[width=7cm]{\FIGREP/roofline_extended}{4.5cm}{0.5cm} \end{frame} \begin{frame}[fragile,t] \frametitle{Roofline model}
\framesubtitle{How to find the peak performance} \begin{itemize} \item Theoretical peak performance\\ \vspace{-2ex} \begin{minipage}{.4\linewidth} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} \text{Number of FP ports (ILP)} \\ & \times \text{flops} / \text{cycles (e.g. 2 for FMA)} \\ & \times \text{vector size (DLP)} \\ & \times \text{frequency (in GHz)} \\ & \times \text{number of cores (TLP)} \end{align*} \end{minipage} \vspace{3ex} \item Example: \href{https://en.wikichip.org/wiki/intel/microarchitectures/skylake_(server)}{Intel Xeon Gold 6132}\\ \vspace{-2ex} \begin{minipage}{.4\linewidth} \begin{align*} P_{\text{max}} = & \textcolor{white}{\times} 2 \text{ (ports)} \\ & \times \SI{2}{\flop\per\cycle} \text{ (2 for FMA)} \\ & \times \frac{\SI{512}{\bit} \text{ (AVX512)} }{\SI{64}{\bit}\text{ (double)}} \\ & \times \SI{2.3}{\giga\hertz} \\ & \times 14 \text{ (cores)} \\ = & \SI{1.16}{\tera\flops} \end{align*} \end{minipage} - \addimage{\FIGREP/skylake_server_block_diagram}{6cm}{9cm}{0.8cm} + \addimage[width=6cm]{\FIGREP/skylake_server_block_diagram}{9cm}{0.8cm} \pause \vspace{3ex} \item Or use a software that estimates it \end{itemize} \end{frame} \begin{frame}[t] \frametitle{Roofline model} \framesubtitle{How to find the memory bandwidth} \begin{itemize} \item Theoretical memory bandwidth of the memory \begin{align*} \text{BW}_{\text{max}} = &\textcolor{white}{\times} \text{Number of transfers per second} \\ & \times \text{Bus width} \\ & \times \text{Number of interfaces} \end{align*} \item In general, we suppose that RAM matches CPU bandwidth (found on the CPU spec. 
list) \item Example: \href{https://en.wikichip.org/wiki/intel/xeon_gold/6132}{Intel Xeon Gold 6132} \begin{align*} \text{BW}_{\text{max}} = &\textcolor{white}{\times} \SI{2666}{\mega\transfer\per\second} \text{ (DDR4 2666)} \\ & \times \SI{8}{\byte\per\transfer} \text{ (64bit bus)}\\ & \times 6 \end{align*} \begin{itemize} \item $\SI{20.83}{\gibi\byte\per\second}$ for 1 channel \item Maximum of $\SI{124.97}{\gibi\byte\per\second}$ \end{itemize} \pause \item Or use a software that estimates it \end{itemize} \begin{itemize} \item A corollary from ``theoretical'' is that it is not achievable in practice! \end{itemize} \end{frame} \begin{frame}[t,fragile] \frametitle{Roofline model} \framesubtitle{How to find arithmetic intensity} \begin{itemize} \item For very simple algorithms, you can compute the AI \item Let's take back the DAXPY example \cxxfile[% - title={optimization/daxpy.cpp}, + title={optimization/daxpy.cc}, minted options app={ % highlightlines={2, 7}, firstline=25, lastline=27, firstnumber=1, - }]{examples/optimization/daxpy.cpp} + }]{examples/optimization/daxpy.cc} \item There are 2 operations (1 add and 1 mul) \item Three 8-byte memory operations (2 loads and 1 store) \item The AI is then $2/24 = 1/12$ \pause \item For more complex algorithms, use a tool, e.g. Intel Advisor \end{itemize} \end{frame} \subsection{Profiling} \label{sec:profiling} \begin{frame} \frametitle{Profiling} \framesubtitle{A precious ally for optimization} \begin{itemize} \item Where is my application spending most of its time? \begin{itemize} \item (bad) measure time ``by hand'' using timings and prints \item (good) use a tool made for this, e.g. Intel Amplifier, Score-P, gprof \end{itemize} \end{itemize} \vfill \begin{itemize} \item In addition to timings, profilers give you a lot more information on \begin{itemize} \item Memory usage \item Hardware counters \item CPU activity \item MPI communications \item etc. 
\end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile,exercise] \frametitle{Profiling} \framesubtitle{Interactive demonstration} \begin{itemize} \item For the purpose of this exercise, we will use MiniFE \begin{itemize} \item 3D implicit finite-elements on an unstructured mesh \item C++ mini application \item \url{https://github.com/Mantevo/miniFE} \item You don't need to understand what the code does! \end{itemize} \item We will use Intel VTune, part of the \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html\#base-kit}{OneAPI Base toolkit (free)} \end{itemize} \vfill \begin{itemize} \item Download miniFE \item Compile the basic version found in \cmd{ref/src} \item Profile the code using the hotspot analysis \item Open Intel VTune and select your timings \item Play around and find the 5 most time-consuming functions \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Compile MiniFE} \begin{itemize} \item Download miniFE \begin{bashcode} $> git clone https://github.com/Mantevo/miniFE.git $> cd miniFE \end{bashcode} \item Compile the basic version found in \code{ref/src} \begin{itemize} \item You will need to load a compiler and an MPI library \begin{bashcode} $> module load intel intel-mpi intel-vtune \end{bashcode}%$ \item Change the \cmd{Makefile} to set \cmd{CXX=mpiicpc} and \cmd{CC=mpiicc} and compile \begin{bashcode} $> make \end{bashcode}%$ \item Make sure to compile your code with \cmd{-g -O3} \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Profile MiniFE} \begin{itemize} \item Profile the code using \begin{bashcode} $> srun -n 1 amplxe-cl -collect hotspots -r prof_results -- ./miniFE.x -nx 128 -ny 128 -nz 128 \end{bashcode}%$ \item This will profile for the ``hotspots'' and store the timings in \cmd{prof\_results} \item You can have more info on the types of analysis with \begin{bashcode} $> amplxe-cl -h collect \end{bashcode}%$ 
\item Open Intel VTune and select your timings \begin{bashcode} $> amplxe-gui prof_results/prof_results.amplxe \end{bashcode}%$ \item Play around and find the 5 most time-consuming functions \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{What do we learn?} \begin{itemize} \item 50.0\% of the time spent in matrix/vector multiplications \item 12.5\% of time spent imposing boundary conditions \item etc. \item Does the problem size influence the timings? \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Smaller problem} \begin{itemize} \item This time, we profile a problem of size $(16, 16, 16)$ \item 13.6\% of the time is spent opening libraries \item 13.6\% of the time is spent initializing MPI \item etc. \item Depending on the problem size, different parts of the code will dominate \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Profiling} \framesubtitle{Some tips and tricks} \begin{itemize} \item Profile a code without bugs! \item Choose the right problem size (representative of your simulations) \item Focus on the functions taking the most time first \item If the profile is not explicit, try refactoring into smaller functions \begin{itemize} \item Some profilers, e.g. ScoreP, let you define custom regions \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Profiling} \framesubtitle{Optimization} \begin{itemize} \item We now have a pretty good idea of which part of the code to optimize \item Different options are possible (by order of complexity) \begin{enumerate} \item Compiler and linker flags \item Optimized external libraries \item Handmade optimization (loop reordering, better data access, etc.) \item Algorithmic changes \end{enumerate} \pause \item Example of matrix/vector multiplication. 
Graph shows complexity ($\mathcal{O}(n^{\omega})$) for different algorithms \end{itemize} - \onslide<2>\addimage{\FIGREP/matmul}{7cm}{4.5cm}{0.5cm} + \onslide<2>\addimage[width=7cm]{\FIGREP/matmul}{4.5cm}{0.5cm} \end{frame} \begin{frame}[fragile,t] \frametitle{Parallelization} \framesubtitle{When to parallelize} \begin{itemize} \item Only when your code has \textit{no bugs} and is \textit{optimized} \item Are you ready to parallelize? \begin{enumerate} \item Is it worth parallelizing my code? Does my algorithm scale? \item Performance prediction? \item Profiling? \item Bottlenecks? \item Which parallel paradigm should I use? What is the target architecture (SMP, cluster, GPU, hybrid, etc)? \end{enumerate} \end{itemize} \end{frame} \begin{frame}[fragile,t] \frametitle{Parallelization} \framesubtitle{When to parallelize} In 1991, David H. Bailey published a famous paper: \href{https://www.davidhbailey.com/dhbpapers/twelve-ways.pdf}{Twelve ways to fool the masses when giving performance results on parallel computers} \vspace{1cm} \textit{6: Compare your results against scalar, unoptimized code on Crays.} - \addimage{\FIGREP/dhb}{7cm}{4.5cm}{0.5cm} + \addimage[width=7cm]{\FIGREP/dhb}{4.5cm}{0.5cm} \end{frame} \subsection{Pareto principle} \label{sec:pareto} \begin{frame} \frametitle{Pareto principle} \framesubtitle{The 80/20 rule} \begin{itemize} \item General principle that states that 80\% of the effect comes from 20\% of causes \item Applies in many domains and especially in optimization \item 80\% of the time is spent in 20\% of your code \item Concentrate on those 20\% and don't arbitrarily optimize \end{itemize} \end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: