% (web-page scraping residue removed; file starts here)

\renewcommand{\FIGREP}{src/openmp/figures}
\section{OpenMP}
\label{sec:openmp}
\intersec{fidis}
\subsection{Task parallelism}
\label{sec:task_parallelism}
\begin{frame}
\frametitle{Goal of this section}
\framesubtitle{}
\begin{itemize}
\item Understand the context of shared memory
\item Understand more in detail the architecture of a node
\item Get familiar with the OpenMP execution and memory model
\item Get some speedup with Task Level Parallelism
\end{itemize}
\end{frame}
\subsection{Introduction}
\label{sec:openmp:introduction}
\begin{frame}
\frametitle{Releases history, present and future}
\framesubtitle{}
\begin{itemize}
\item{October 1997: Fortran version 1.0 }
\item{Late 1998: C/C++ version 1.0 }
\item{June 2000: Fortran version 2.0 }
\item{April 2002: C/C++ version 2.0 }
\item{June 2005: Combined C/C++ and Fortran version 2.5}
\item{May 2008: Combined C/C++ and Fortran version 3.0}
\item{\textbf{July 2011: Combined C/C++ and Fortran version 3.1}}
\item{July 2013: Combined C/C++ and Fortran version 4.0}
\item{November 2015: Combined C/C++ and Fortran version 4.5}
\item{November 2018: Combined C/C++ and Fortran version 5.0}
\item{November 2020: Combined C/C++ and Fortran version 5.1}
\end{itemize}
\addimage[width=5cm]{\FIGREP/logo_OpenMP.png}{10cm}{5cm}
\end{frame}
\begin{frame}[fragile]
\frametitle{Terminology}
\framesubtitle{Selected extract of the specification}
\begin{itemize}
\item Specification:
\begin{itemize}
\item \href{https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-1.pdf}{Full specification}
\item \href{https://www.openmp.org/wp-content/uploads/OpenMPRefCard-5.1-web.pdf}{RefCard}
\end{itemize}
\item Terms:
\begin{description}
\item[thread] an execution entity with a stack \\
and a static memory (\textit{threadprivate memory})
\vspace{1ex}
\addimage[width=1cm]{\FIGREP/thread}{11cm}{4.5cm}
\item[OpenMP thread] a \textit{thread} managed by the OpenMP runtime\\
\vspace{2em}
\addimage[width=1cm]{\FIGREP/omp-thread}{11cm}{3.5cm}
\item[processor] a hardware unit on which one or more \textit{OpenMP threads} can execute
\item[directive] a base language mechanism to specify OpenMP program behavior
\item[construct] an OpenMP executable directive and the associated
statement, loop nest or structured block, if any, not including the
code in any called routines. That is, the lexical extent of an
executable directive.
\end{description}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Memory Model}
\framesubtitle{Shared memory}
\onslide<1>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture}{4.75cm}{1.2cm}
\onslide<2>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_wo_gpus}{4.75cm}{1.2cm}
\onslide<3>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_w_cores}{4.75cm}{1.2cm}
\onslide<4>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_essential}{4.75cm}{1.2cm}
\end{frame}
\begin{frame}
\frametitle{Execution Model}
\framesubtitle{Fork/join}
\addimage{\FIGREP/fork-join}{4.75cm}{1.2cm}
\end{frame}
\begin{frame}[fragile]
\frametitle{Compiling an OpenMP code}
\framesubtitle{pragmas and compilation}
\begin{itemize}
\item OpenMP directives are written as pragmas: \cxxinline{#pragma omp}
\item Use the conditional compilation flag \cxxinline{#if defined _OPENMP} for the preprocessor
\end{itemize}
\hspace{2cm}
\pause
\begin{itemize}
\item Compilation using the GNU compiler:
\begin{bashcode}
$> g++ -fopenmp ex1.c -o ex1
\end{bashcode}%$
\item Compilation using the Intel compiler:
\begin{bashcode}
$> icpc -qopenmp ex1.c -o ex1
\end{bashcode}%$
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Hello World in C++}
\framesubtitle{Simple version}
\centering
\cxxfile[title={openmp/hello.cc}]{examples/openmp/hello.cc}
\pause
\begin{consoleoutput}
$ OMP_NUM_THREADS=4 ./openmp/hello
Hello from thread 2 out of 4
Hello from thread 1 out of 4
Hello from thread 0 out of 4
Hello from thread 3 out of 4
\end{consoleoutput} %$
\end{frame}
\begin{frame}[fragile]
\frametitle{Hello World in C++}
\framesubtitle{With condition compilation}
\centering
\cxxfile[%
title={openmp/hello\_cond.cc},
minted options app={
firstline=6,
}]{examples/openmp/hello_cond.cc}
\end{frame}
\begin{frame}[fragile]
\frametitle{Number of concurrent threads}
\begin{itemize}
\item Default is implementation dependent (usually the max number of hardware threads)
\item At runtime in the code
\begin{cxxcode}{}
omp_set_num_threads(nthreads);
\end{cxxcode}
\item With an environment variable
\begin{bashcode}
$> export OMP_NUM_THREADS=4
\end{bashcode}%$
\end{itemize}
\end{frame}
\subsection{The \texttt{parallel} construct}
\begin{frame}[fragile]
\frametitle{The \texttt{parallel} construct}
This is the mother of all constructs in OpenMP. It starts a parallel execution.
\begin{cxxcode}{Syntax}
#pragma omp parallel [clause[[,] clause]...]
{
|\emph{structured-block}|
}
\end{cxxcode}
where \textit{clause} is one of the following:
\begin{itemize}
\item \code{if} or \code{num\_threads} : conditional clause
\item \code{default(private $\vert$ firstprivate $\vert$ shared $\vert$ none)} : default data scoping
\item \code{private(\textit{list})}, \code{firstprivate(\textit{list})},
\code{shared(\textit{list})} or \code{copyin(\textit{list})} : data
scoping
\item \code{reduction(\textit{operator}$\,$:$\,$\textit{list})}
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Hello $\pi$}
\framesubtitle{}
\begin{itemize}
\item In the \texttt{pi.cc} add a function call to get the number of
threads.
\item Compile using the proper options for OpenMP
\item Test that it works by varying the number of threads \code{export
OMP\_NUM\_THREADS}
\item To vary the number of threads in a \texttt{\bf sbatch} job you can set
the number of threads to the number of cpus per task.
\begin{bashcode}
#!/bin/bash
#SBATCH -c <nthreads>
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
<my_openmp_executable>
\end{bashcode} %$
\end{itemize}
\end{frame}
\subsection{Worksharing constructs (\texttt{sections}, \texttt{single}, \texttt{workshare})}
\begin{frame}[fragile]
\frametitle{Worksharing constructs}
Work-sharing constructs are possible in three ``flavours'':
\begin{itemize}
\item \code{sections} construct
\item \code{single} construct
\item \code{workshare} construct (only in Fortran)
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Worksharing constructs}
\framesubtitle{The \code{sections} construct}
\begin{cxxcode}{Syntax}
#pragma omp [parallel] sections [clause]
{
#pragma omp section
{
code_block
}
}
\end{cxxcode}
where \textit{clause} is one of the following:
\begin{itemize}
\item \code{private(\textit{list})}, \code{firstprivate(\textit{list})},
\code{lastprivate(\textit{list})}
\item \code{reduction(\textit{operator} : \textit{list})}
\item Each \code{section} within a \code{sections} construct is assigned to
one and only one thread
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A \texttt{sections} construct}
\framesubtitle{Example}
\centering
\cxxfile[%
title={openmp/sections.cc},
minted options app={
firstline=6,
lastline=14,
}]{examples/openmp/sections.cc}
\pause
\begin{consoleoutput}
$ ./openmp/sections
Thread 0 handling section 1
Thread 1 handling section 2
Thread 2 handling section 3
\end{consoleoutput} %$
\end{frame}
\begin{frame}[fragile]
\frametitle{Worksharing constructs}
\framesubtitle{The \code{single} construct}
Only one thread (usually the first entering thread) executes the
\code{single} region.
\begin{cxxcode}{Syntax}
#pragma omp single [clause[[,] clause] ...]
{
|\emph{structured-block}|
}
\end{cxxcode}
where \textit{clause} is one of the following:
\begin{itemize}
\item \code{private(\textit{list})}, \code{firstprivate(\textit{list})}
\item \code{nowait}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The \code{master} directive}
\framesubtitle{Deprecated for \code{masked}}
Only the master thread executes the section. It can be used in any OpenMP construct
\begin{cxxcode}{Syntax}
#pragma omp master
{
|\emph{structured-block}|
}
\end{cxxcode}
\end{frame}
\subsection[Loops]{The Worksharing-loop construct}
\begin{frame}[fragile]
\frametitle{The Worksharing-loop construct}
\framesubtitle{The \code{for} construct}
Parallelization of the following loop
\begin{cxxcode}{Syntax}
#pragma omp for [clause[[,] clause] ... ]
{
|\emph{for-loop}|
}
\end{cxxcode}
where \textit{clause} is one of the following:
\begin{itemize}
\item \code{schedule(\textit{kind[, chunk\_size]})}
\item \code{collapse(\textit{n})}
\item \code{ordered}
\item \code{private(\textit{list})}, \code{firstprivate(\textit{list})},
\code{lastprivate(\textit{list})}
% \item \code{reduction(\textit{operator}\,:\,\textit{list})}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Example of \code{for} construct}
\centering
\cxxfile[title={openmp/for.cc}, minted options app={
firstline=6,
lastline=13,
}]{examples/openmp/for.cc}
\pause
\begin{consoleoutput}
$ ./openmp/for
Thread 0 handling i=0
Thread 0 handling i=1
Thread 0 handling i=2
Thread 1 handling i=3
Thread 1 handling i=4
Thread 1 handling i=5
\end{consoleoutput} %$
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Compute $\pi$ in parallel}
\framesubtitle{First try at parallel code}
\begin{itemize}
\item Add a \code{parallel for} work sharing construct around the integral
computation
\item Run the code
\item Run the code
\item Run the code
\item What can you observe on the value of $\pi$ ?
\end{itemize}
\end{frame}
\subsection{The Synchronization constructs}
\begin{frame}[fragile]
\frametitle{The Synchronization constructs}
\framesubtitle{The \code{critical} construct}
Restricts execution of the associated structured block to a single thread at a time.
\begin{cxxcode}{Syntax}
#pragma omp critical [(name) [[,] |\textbf{hint}|(hint-expression)]]
{
|\emph{structured-block}|
}
\end{cxxcode}
\begin{itemize}
\item \code{name} optional to identify the construct
\item \code{hint-expression} information on the expected execution
\begin{itemize}
\item \cxxinline{omp_sync_hint_none}
\item \cxxinline{omp_sync_hint_uncontended}
\item \cxxinline{omp_sync_hint_contended}
\item \cxxinline{omp_sync_hint_nonspeculative}
\item \cxxinline{omp_sync_hint_speculative}
\end{itemize}
% \item \code{reduction(\textit{operator}\,:\,\textit{list})}
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Compute $\pi$ in parallel}
\framesubtitle{Naive reduction}
\begin{itemize}
\item To solve the race condition from the previous exercise we can protect
the computation of the sum.
\item Add a \code{critical} directive to protect the sum
\item Run the code
\item What can you observe on the execution time while varying the number of
threads
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The Synchronization constructs}
\framesubtitle{The \code{barrier} construct}
Specifies an explicit barrier.
\begin{cxxcode}{Syntax}
#pragma omp barrier
\end{cxxcode}
\end{frame}
\begin{frame}[fragile]
\frametitle{The Synchronization constructs}
\framesubtitle{The \code{atomic} construct}
Ensures a specific storage location is accessed atomically.
\begin{cxxcode}{Syntax}
#pragma omp atomic [clause[[,] clause] ... ]
|\emph{statement}|
\end{cxxcode}
where \textit{clause} is one of the following:
\begin{itemize}
\item \emph{atomic-clauses} \code{read}, \code{write}, \code{update}
\item \emph{memory-order-clauses} \code{seq\_cst}, \code{acq\_rel},
\code{releases}, \code{acquire}, \code{relaxed}
\item or one of \code{capture}, \code{compare},
\code{hint(hint-expression)}, \code{fail(seq\_cst $\vert$ acquire $\vert$
relaxed)}, or \code{weak}
\end{itemize}
\end{frame}
\subsection{Data sharing clauses}
\begin{frame}[fragile]
\frametitle{Data sharing clauses}
\framesubtitle{What are the variables values}
\begin{itemize}
\item Most common source of errors
\item Determine which variables are \code{private} to a thread, which are
\code{shared} among all the threads
\item In case of a \code{private} variable the variable values can be defined using:
\begin{itemize}
\item \code{firstprivate} defines the value when entering the region
\item \code{lastprivate} defines the value when exiting the region (OpenMP
5.1 in C/C++)
\end{itemize}
\item \code{default(private $\vert$ firstprivate $\vert$ shared $\vert$ none)} can be specified\\
\code{default(none)} means each variables should appear in a
\code{shared} or \code{private} list
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Data sharing clauses}
\framesubtitle{\code{shared} and \code{private}}
These attributes determine the scope (visibility) of a single variable or a list of variables
\begin{cxxcode}{Syntax}
shared(list1), |\texttt{private}|(list2)
\end{cxxcode}
\begin{itemize}
\item The \code{private} clause: the data is private to each thread and
non-initialized. Each thread has its own copy. \cxxinline{#pragma
omp parallel private(i)}
\item The \code{shared} clause: the data is shared among all the
threads. It is accessible (and non-protected) by all the threads
simultaneously. \cxxinline{#pragma omp parallel shared(array)}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Data sharing clauses}
\framesubtitle{\code{firstprivate} and \code{lastprivate}}
These clauses determine the attributes of the variables within a parallel region:
\begin{cxxcode}{Syntax}
firstprivate(list1), lastprivate(list2)
\end{cxxcode}
\begin{itemize}
\item The \code{firstprivate} clause: a super-set of \code{private}; the
variable is initialized to a copy of its value before the region
\item The \code{lastprivate} clause: a super-set of \code{private}; the value
from the last thread exiting the region is copied out
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Data sharing clauses}
\framesubtitle{Example}
\centering
\cxxfile[%
title={openmp/private.cc},
minted options app={
firstline=8,
lastline=19,
}]{examples/openmp/private.cc}
\pause
\begin{consoleoutput}
$ ./openmp/private
Thread 0 sees, a, b, c: 1, 2, 3 (before)
Thread 0 sees, a, b, c: 1839769744, 2, 3 (inside)
Thread 1 sees, a, b, c: 12789424, 2, 3 (inside)
Thread 2 sees, a, b, c: 12801392, 2, -0.001 (inside)
Thread 0 sees, a, b, c: 1, 2, -0.001 (after)
\end{consoleoutput} %$
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Compute $\pi$ in parallel}
\framesubtitle{Naive reduction improved}
\begin{itemize}
\item Create a local variable per thread
\item Make each thread compute its own sum
\item After the computation of the integral, use a \code{critical} directive
to add the local sum to a \code{shared} sum
\end{itemize}
\end{frame}
\subsection{Loop clauses}
\begin{frame}[fragile]
\frametitle{Loop clauses}
\framesubtitle{\code{reduction} clause}
\begin{cxxcode}{Syntax}
reduction(|\emph{reduction-identifier : list}|)
\end{cxxcode}
\begin{itemize}
\item \emph{reduction-identifier}: one of the operations \textbf{+},
\textbf{-}, \textbf{*}, \textbf{\&}, \textbf{\textbar}, \textbf{\textasciicircum},
\textbf{\&\&}, \textbf{\textbar\textbar}
\item \emph{list} item on which the reduction applies
\item example: \cxxinline{#pragma omp for reduction(+: sum)}
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Compute $\pi$ in parallel}
\framesubtitle{Naive reduction improved}
\begin{itemize}
\item Use the \code{reduction} clause
\item Compare the timings to the previous versions
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Loop clauses}
\framesubtitle{\code{schedule} clause}
\begin{cxxcode}{Syntax}
schedule([modifier [, modifier] : ] kind [, chunk_size])
\end{cxxcode}
\begin{itemize}
\item \emph{kind}
\begin{itemize}
\item \code{static} iterations divided in chunks sized \code{chunk\_size}
assigned to threads in a round-robin fashion
\item \code{dynamic} iterations divided in chunks sized \code{chunk\_size}
assigned to threads when they request them until no chunk remains to
be distributed
\item \code{guided} iterations divided in chunks sized \code{chunk\_size}
assigned to threads when they request them. Size of chunks is
proportional to the remaining unassigned chunks.
\item \code{auto} The decision is delegated to the compiler and/or the
runtime system
\item \code{runtime} The decision is delegated to the runtime system
based on ICVs
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Loop clauses}
\framesubtitle{\code{collapse} clause}
\begin{cxxcode}{Syntax}
collapse(|\emph{n}|)
\end{cxxcode}
Specifies how many loops are combined into a single logical iteration space
\end{frame}
\begin{frame}[fragile]
\frametitle{Example \code{dgemm}}
\framesubtitle{collapse(1)}
\centering
\cxxfile[title={openmp/dgemm.cc}, minted options app={
firstline=34,lastline=38
}]{examples/openmp/dgemm.cc}
\pause
\begin{consoleoutput}
$ OMP_NUM_THREADS=1 ../build/openmp/dgemm
DGEMM with 1 threads, collapse(1): 21.1209 GFLOP/s (verif 2)
$ OMP_NUM_THREADS=2 ../build/openmp/dgemm
DGEMM with 2 threads, collapse(1): 40.2308 GFLOP/s (verif 2)
$ OMP_NUM_THREADS=4 ../build/openmp/dgemm
DGEMM with 4 threads, collapse(1): 72.7659 GFLOP/s (verif 2)
$ OMP_NUM_THREADS=1 ../build/openmp/dgemm
\end{consoleoutput} %$
\end{frame}
\begin{frame}[fragile]
\frametitle{Example \code{dgemm}}
\framesubtitle{collapse(2)}
\centering
\begin{consoleoutput}
DGEMM with 1 threads, collapse(2): 20.358 GFLOP/s (verif 2)
$ OMP_NUM_THREADS=2 ../build/openmp/dgemm
DGEMM with 2 threads, collapse(2): 40.0818 GFLOP/s (verif 2)
$ OMP_NUM_THREADS=4 ../build/openmp/dgemm
DGEMM with 4 threads, collapse(2): 72.4462 GFLOP/s (verif 2)
\end{consoleoutput} %$
\end{frame}
\subsection{Advanced topics}
\begin{frame}[fragile]
\frametitle{Advanced topics}
\framesubtitle{Idealized model vs reality (NUMA, Sub-NUMA Clusters, Cluster-on-Die)}
\onslide<1>\addimage[width=7.5cm]{\FIGREP/detailed_node_architecture_essential}{4.75cm}{1.2cm}
\onslide<2>\addimage[width=12cm]{\FIGREP/skylake_sp_2-way_3_upi}{2.7cm}{3cm}
\onslide<3>\addimage[width=12cm]{\FIGREP/skylake_(server)_half_rings}{2cm}{1cm}
\onslide<4>\begin{tikzpicture}[overlay,remember picture]
\begin{scope}[shift={(current page.south west)}]
\draw[gray, thick] (9.1cm, 4.6cm) -- (11.2cm, 4.45cm);
\draw[gray, thick] (9.1cm, 5.5cm) -- (11.2cm, 6cm);
\end{scope}
\end{tikzpicture}
\onslide<4>\addimage[width=5.5cm]{\FIGREP/skylake_octadeca_core}{3.95cm}{1cm}
\onslide<4>\addimage[width=2cm]{\FIGREP/skylake_sp_core}{11.2cm}{4.4cm}
\end{frame}
\begin{frame}[fragile]
\frametitle{Implications of memory layout}
\framesubtitle{}
\addimage[width=6cm]{\FIGREP/skylake_sp_2-way_3_upi}{6cm}{6cm}
\begin{itemize}
\item One thread can only saturate 1 channel
\item On memory-bound codes, bandwidth saturates when \# of threads \~{} \# of channels
\item If memory allocated on the other processor memory, data go through CPU
interconnect (UPI $3 \times \SI{10.4}{\giga\transfer\per\second}$)
\pause
\item How to mitigate these effects?
\begin{itemize}
\item Loop schedule
\item Memory first touch
\item Thread placements
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Thread Affinity}
\framesubtitle{Thread Affinity Control}
\begin{itemize}
\item The variable \cmd{OMP_PLACES} describes these places in terms of the available hardware.
\item The variable \cmd{OMP_PROC_BIND} describes how threads are bound to OpenMP places
\item The variable \cmd{OMP_DISPLAY_AFFINITY} helps to debug the affinity
\end{itemize}
\centering
\begin{consoleoutput}
$ OMP_NUM_THREADS=4 OMP_DISPLAY_AFFINITY=true ./openmp/hello
OMP: pid 2115280 tid 2115280 thread 0 bound to OS proc set {0-31}
OMP: pid 2115280 tid 2115285 thread 3 bound to OS proc set {0-31}
OMP: pid 2115280 tid 2115284 thread 2 bound to OS proc set {0-31}
Hello from thread 0 out of 4
Hello from thread 3 out of 4
OMP: pid 2115280 tid 2115283 thread 1 bound to OS proc set {0-31}
Hello from thread 1 out of 4
Hello from thread 2 out of 4
\end{consoleoutput} %$
\end{frame}
\begin{frame}[fragile]
\frametitle{Thread Affinity}
\framesubtitle{\cmd{OMP_PLACES}}
Possible values for \cmd{OMP_PLACES} where each place corresponds to:
\begin{description}
\item [threads] a single hardware thread on the
device.
\item [cores] a single core (having one or more
hardware threads) on the device.
\item [ll\_caches] a set of cores that share the last
level cache on the device.
\item [numa\_domains] a set of cores for which their closest
memory on the device is:
\begin{itemize}
\item the same memory; and
\item at a similar distance from the cores.
\end{itemize}
\item [sockets] a single socket (consisting of one or
more cores) on the device.
\end{description}
\end{frame}
\begin{frame}[fragile]
\frametitle{Thread Affinity}
\framesubtitle{\cmd{OMP_PROC_BIND}}
Possible values for \cmd{OMP_PROC_BIND}:
\begin{description}
\item [false] threads are not bound
\item [true] threads are bound (implementation dependent)
\item [primary] collocate threads with the primary thread
\item [close] place threads close to the master in the places list
\item [spread] spread out threads as much as possible
\end{description}
\end{frame}
\begin{frame}[fragile]
\frametitle{First touch}
\framesubtitle{}
\begin{itemize}
\item Memory is organized in pages
\item When allocating data ``nothing'' happens
\item Pages are allocated on the memory associated to the first thread initializing it
\end{itemize}
\pause
\begin{itemize}
\item To mitigate the problem, initialize the arrays in same order they are accessed
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Data race, false-sharing}
\framesubtitle{}
\begin{itemize}
\item Data race:
\begin{itemize}
\item Data accessed by multiple threads without protection
\item Lead to undetermined results
\end{itemize}
\end{itemize}
\pause
\begin{itemize}
\item False sharing
\begin{itemize}
\item Data smaller than cache-line size
\item Multiple threads accessing data in the same cache line will poison
each other caches
\end{itemize}
\end{itemize}
\end{frame}
\subsection{Runtime routines}
\begin{frame}[fragile]
\frametitle{Runtime routines}
\framesubtitle{}
Subset of the routines available in OpenMP
\begin{itemize}
\item \cxxinline{omp_get_num_threads()}: number of threads in the current region
\item \cxxinline{omp_get_thread_num()}: id of the current thread
\item \cxxinline{omp_get_max_threads()}: upper bound to the number of
threads that could be used
\item \cxxinline{omp_get_wtime()}: wall clock time in seconds
\item \cxxinline{omp_get_wtick()}: seconds between successive clock ticks
\end{itemize}
\end{frame}
\begin{frame}[fragile,exercise]
\frametitle{Parallelize the Poisson code using OpenMP}
\begin{itemize}
\item Now you can apply what you learned to the \texttt{poisson} code.
\item Remember that 90\% of the time is spent in the dumpers. So make sure you
dump only once at the end of the simulation to get a validation image.
\end{itemize}
\end{frame}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End:
% LocalWords: OpenMP, Fortran, preprocessor
% (web-page scraping residue removed; file ends here)