diff --git a/src/hybrid/hybrid.tex b/src/hybrid/hybrid.tex
index 2a9b77b..fda0976 100644
--- a/src/hybrid/hybrid.tex
+++ b/src/hybrid/hybrid.tex
@@ -1,369 +1,341 @@
 \renewcommand{\FIGREP}{src/hybrid/figures}
 \section{Hybrid Programming}
+\subsection{What will we learn today?}
 \begin{frame}
-  \frametitle{What will we learn today ?}
+  \frametitle{What will we learn today?}
   \begin{itemize}
-  \item {Hybrid programming models comparison
-      \begin{itemize}
-      \item {Pure MPI}
-      \item {MPI+OpenMP}
-      \item {(MPI + MPI one-sided (MPI-3) )}
-      \end{itemize}
-    }
-  \item {How to write a (production) project proposal}
+  \item Hybrid MPI + OpenMP programming
+  \item Partitioned point-to-point communications
+  \item Matching probe/receive
   \end{itemize}
 \end{frame}
-
-\subsection{Hybrid programming models}
+\subsection{Hybrid programming model}
 \begin{frame}
   \frametitle{Situation}
   \onslide<1>\addimage[width=14cm]{\FIGREP/situation_1}{2cm}{2cm}
   \onslide<2>\addimage[width=14cm]{\FIGREP/situation_2}{2cm}{2cm}
 \end{frame}
 \begin{frame}
-  \frametitle{Situation : problems}
-  \begin{itemize}
-  \item {Thread safety ?}
-  \item {Which thread/process can/will call the MPI library ?}
-  \item {MPI process placement in the case of multi-CPU processors ?}
-  \item {Data visibility ? OpenMP private ? }
-  \item {Does my problem fits with the targeted machine ?}
-  \item {Levels of parallelism within my problem ?}
-  \end{itemize}
-\end{frame}
-
-
+  \frametitle{Situation}
+  \framesubtitle{Problems}
-\begin{frame}
-  \frametitle{Hybrid vs. Pure MPI}
-  \textbf{Pure MPI}
   \begin{itemize}
-  \item[\textcolor{green!40!black}{\textbf{+}}] no code modification
-  \item[\textcolor{green!40!black}{\textbf{+}}] most of the libraries support multi-thread
-  \item[\textcolor{red}{\textbf{--}}] does application topology fits system topology ?
-  \item[\textcolor{red}{\textbf{--}}] useless communications
-  \end{itemize}
-  \textbf{Hybrid}
-  \begin{itemize}
-  \item[\textcolor{green!40!black}{\textbf{+}}] no message within an NUMA region
-  \item[\textcolor{green!40!black}{\textbf{+}}] less (no) topology problems
-  \item[\textcolor{red}{\textbf{--}}] all threads sleep when master communicates
-  \item[\textcolor{red}{\textbf{--}}] MPI-libs must support (at least) thread safety
+  \item Thread safety?
+  \item Which thread/process can/will call the MPI library?
+  \item MPI process placement in the case of multi-CPU processors?
+  \item Data visibility? OpenMP private?
+  \item Does my problem fit the targeted machine?
+  \item Levels of parallelism within my problem?
   \end{itemize}
 \end{frame}
-
+\subsection{A simple hello world example}
 \begin{frame}[fragile]
   \frametitle{Hybrid MPI/OpenMP hello world}
   \cxxfile[%
    title={hybrid/hello\_world.cc},
    minted options app={
      firstline=1,
    }]{examples/hybrid/hello_world.cc}
-
-  % Need to overlap communications (from master) with computation between the others. If possible !
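 \end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Hybrid MPI/OpenMP hello world}
+  \framesubtitle{A minimal sketch (the shipped \texttt{hello\_world.cc} may differ)}
+  A hybrid hello world only needs \code{MPI\_Init\_thread} plus an OpenMP
+  parallel region; the listing below is an illustrative sketch, not the
+  verbatim example file:
+  \begin{cxxcode}{}
+    #include <cstdio>
+    #include <mpi.h>
+    #include <omp.h>
+
+    int main(int argc, char *argv[]) {
+      // Ask for FUNNELED support: only the master thread calls MPI
+      int provided;
+      MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+
+      int rank, size;
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+      MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+      #pragma omp parallel
+      {
+        // Only printf here, no MPI calls inside the parallel region
+        printf("Hello from thread %d out of %d from process %d out of %d\n",
+               omp_get_thread_num(), omp_get_num_threads(), rank, size);
+      }
+
+      MPI_Finalize();
+      return 0;
+    }
+  \end{cxxcode}
+\end{frame}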
 \begin{frame}[fragile]
   \frametitle{Hybrid MPI/OpenMP hello world}
   Compilation using the GNU g++ compiler:
   \begin{bashcode}
     $> mpicxx -fopenmp hello_world.cc -o hello_world
   \end{bashcode}%$
   Compilation using the Intel C++ compiler:
   \begin{bashcode}
     $> mpiicpc -qopenmp hello_world.cc -o hello_world
   \end{bashcode}%$
 \end{frame}
-
 \begin{frame}[fragile]
   \frametitle{Submission script for the clusters}
   \begin{bashcode}
     #!/bin/bash
     #SBATCH --nodes 1
     #SBATCH --ntasks 2
     #SBATCH --cpus-per-task 3
     export OMP_NUM_THREADS=3
     srun -n 2 ./hello_world
   \end{bashcode}
   \vfill
   It will start 2 MPI processes, each spawning 3 threads
   \vfill
   \begin{consoleoutput}
     Hello from thread 0 out of 3 from process 0 out of 2
     Hello from thread 1 out of 3 from process 0 out of 2
     Hello from thread 0 out of 3 from process 1 out of 2
     Hello from thread 1 out of 3 from process 1 out of 2
     Hello from thread 2 out of 3 from process 0 out of 2
     Hello from thread 2 out of 3 from process 1 out of 2
   \end{consoleoutput}
 \end{frame}
-
+\subsection{Prepare your code for hybrid execution}
 \begin{frame}[fragile]
   \frametitle{Changes to your code}
   \begin{itemize}
-  \item Change your MPI initialisation routine
+  \item Change your MPI initialization routine
     \begin{itemize}
     \item \code{MPI\_Init} is replaced by \code{MPI\_Init\_thread}
     \item \code{MPI\_Init\_thread} has two additional parameters for the level of thread support required, and for the level of thread support provided by the library implementation
     \end{itemize}
     \begin{cxxcode}{}
       int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
     \end{cxxcode}
   \item Make sure that the \textit{provided} support matches the \textit{required} one
     \begin{cxxcode}{}
       if (provided < required)
-       MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
     \end{cxxcode}
-  \item Add OpenMP directives as long as you stick to the level of thread safety you specified in the call to \code{MPI\_Init\_thread}
+  \item Add OpenMP directives as long as you stick to the level of thread
+    safety you specified in the call to \code{MPI\_Init\_thread}
   \end{itemize}
 \end{frame}
-
 \begin{frame}[fragile]
-  \frametitle{The 4 Options for Thread Support}
-  % \framesubtitle{I: user informs the MPI library}
+  \frametitle{The 4 options for thread support}
   \begin{itemize}
   \item \code{MPI\_THREAD\_SINGLE}
     \begin{itemize}
-    \item Only one thread will execute
+    \item Only one thread will execute (no multi-threading)
     \item Standard MPI-only application
     \end{itemize}
   \item \code{MPI\_THREAD\_FUNNELED}
     \begin{itemize}
     \item Only the master thread will make calls to the MPI library
-      % \item $\rightarrow$ The thread that calls \code{MPI\_Init\_thread} is the master thread
     \item A thread can determine whether it is the master thread by a call to \cxxinline{MPI\_Is\_thread\_main}
     \end{itemize}
   \item \code{MPI\_THREAD\_SERIALIZED}
     \begin{itemize}
     \item Only one thread at a time will make calls to the MPI library, but all threads are eligible to make such calls
-
     \end{itemize}
-
-  % \begin{Verbatim}[formatcom=\color{blue}]
-  % int MPI_Is_main_thread(int * flag);
-  % \end{Verbatim}
-\end{frame}
-
-
-\begin{frame}[fragile]
-  \frametitle{The 4 Options for Thread Support}
-  % \framesubtitle{II: The MPI Library is responsible for Thread Safety}
-
-  \begin{itemize}
   \item \code{MPI\_THREAD\_MULTIPLE}
     \begin{itemize}
     \item Any thread may call the MPI library at any time
-    \item The MPI library is responsible for thread safety within that
-      library, and for any libraries that it in turn uses
-    \item Codes that rely on the level of \code{MPI\_THREAD\_MULTIPLE} may run
-      significantly slower than the case where one of the other options
-      has been chosen
-    \item You might need to link in a separate library in order to get this
-      level of support
     \end{itemize}
   \end{itemize}
-
-
+  \vfill
+  \pause
   In most cases \code{MPI\_THREAD\_FUNNELED} provides the best choice for hybrid programs
   \vfill
+  \pause
   \begin{cxxcode}{}
     int MPI_Query_thread(int * thread_level_provided);
   \end{cxxcode}
-
   Returns the level of thread support provided by the MPI library
-
-\end{frame}
-
-
-
-\begin{frame}
-  \frametitle{Topology problems}
-
-  % The problem: we have a domain with 80 subdomains as follows:
-  %
-  % \begin{center}
-  %   \begin{tikzpicture}%
-  %     \draw[step=4mm] (0,0) grid (6.4,2.0);
-  %   \end{tikzpicture}
-  % \end{center}
-
-  How to deal with :
-
-  \begin{itemize}
-  \item topology / mapping ? (Which physical core is assigned to which process/thread)
-  \item sub-domain decomposition ?
-  \item halos size ? halos shapes ?
-  \item unnecessary communications ?
-  \item \textbf{computation to communication ratio} ?
-  \end{itemize}
-
-  Pure MPI ? Hybrid ?
-  \\
-  \textbf{A good solution is : one MPI process per NUMA region}
-
-\end{frame}
-
-
-
-\begin{frame}
-  \frametitle{Halo regions}
-  % Neil, p. 54
-  \begin{itemize}
-  \item Halo regions are local copies of remote data that are needed for computations
-  \item Halo regions need to be copied fequently
-  \item Using threads reduces the size of halo region copies that need to be stored
-  \item Reducing halo region sizes also reduces communication requirements
-  \end{itemize}
-\end{frame}
-
-
-
-\begin{frame}
-  \frametitle{Take-home messages}
-  \begin{itemize}
-  \item Always take into account the problems related to the physical topology
-  \item A real application is not as easy as a hello world.
-  \item Some clusters have different connectivity topologies: match them to your problem. Examples of hardware topologies :
-    \begin{itemize}
-    \item all-to-all
-    \item 2D/3D torus
-    \item tree
-    \item ...
-    \end{itemize}
-  \item One MPI process per physical node
-  \end{itemize}
 \end{frame}
 \begin{frame}[fragile]
-  \frametitle{Main messages}
+  \frametitle{The 4 options for thread support}
+
   \begin{itemize}
-  \item Do not use hybrid if the pure MPI code scales ok
-  \item Be aware of intranode MPI behavior
-  \item Always observe the topology dependence of
+  \item Thread support values are monotonic, i.e.\\
+    {\footnotesize\code{MPI\_THREAD\_SINGLE} < \code{MPI\_THREAD\_FUNNELED} <
+      \code{MPI\_THREAD\_SERIALIZED} < \code{MPI\_THREAD\_MULTIPLE}}
+  \item Different processes in \code{MPI\_COMM\_WORLD} can have different
+    levels of thread support
+  \item The level(s) of thread support provided depend on the implementation
+  \item The provided level is determined according to the following order of preference:
    \begin{itemize}
-    \item Intranode MPI
-    \item Threads overheads
+    \item return provided = required
+    \item return the lowest supported level such that provided > required
+    \item return the highest supported level
    \end{itemize}
-  \item Finally: Always compare the best pure MPI code with the best hybrid code!
-  \end{itemize}
-
-\end{frame}
-
-\begin{frame}[fragile]
-  \frametitle{Examples that \textit{can} benefit of an hybrid approach}
-  \begin{itemize}
-  \item MPI codes with a lot of all-to-all communications
-  \item MPI codes with a very poor load balancing at the algorithmic level (less communications)
-  \item MPI codes with memory limitations
-  \item MPI codes that can be easily \textit{fine-grained parallelized} (at loop level)
   \end{itemize}
 \end{frame}
 \subsection{MPI partitioned communications}
 \begin{frame}[fragile]
   \frametitle{MPI partitioned communications}
   \begin{itemize}
   \item New feature from MPI 4.0 standard (June 2021!)
   \item We have already talked about persistent point-to-point communications
   \item Partitioned comms are just persistent comms where the message is
-    constructed in partitions
+        constructed in partitions
   \item Typical case: multi-threading with each thread building a portion of
-    the message
+        the message
   \end{itemize}
 \end{frame}
-\subsection{MPI partitioned communications}
 \begin{frame}[fragile]
   \frametitle{MPI partitioned communications}
   \begin{itemize}
   \item Remember the typical cycle for persistent point-to-point communications \\
     \begin{center}
       Init \qquad (Start \quad Test/Wait)* \qquad Free\\
     \end{center}
     where * means zero or more
   \item Partitioned comms are very similar \\
     \begin{center}
       PInit \qquad (Start \quad Pready \quad Test/Wait)* \qquad Free\\
     \end{center}
   \end{itemize}
   \begin{cxxcode}{}
     MPI_Psend_init(msg, parts, count, MPI_INT, dest, tag, MPI_COMM_WORLD, info, &request);
     MPI_Start(&request);
     #pragma omp parallel for shared(request)
     for (int i = 0; i < parts; ++i) {
       /* compute and fill partition #i, then mark ready: */
       MPI_Pready(i, request);
     }
     int flag = 0; /* poll for completion while doing other work */
     while(!flag) {
       /* Do useful work */
       MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
       /* Do useful work */
     }
     MPI_Request_free(&request);
   \end{cxxcode}
 \end{frame}
 \subsection{MPI matching probe}
 \begin{frame}[fragile]
   \frametitle{MPI matching probe}
   \begin{itemize}
   \item We have already talked about \code{MPI\_Probe} to obtain information about a message waiting to be received
   \item This is typically used when the size of the message is unknown (probe, allocate, receive)
     \pause
     \vfill
   \item Care must be taken because it is a stateful method: \\
     \textit{A subsequent receive [...] will receive the message that was matched by the probe, \textbf{if no other intervening receive occurs after the probe} [...]}
     \pause
     \vfill
   \item Problem with multi-threading!
   \item Imagine two threads $A$ and $B$ that must do a Probe, Allocation, and Receive
     \[ A_{P} \longrightarrow A_{A} \longrightarrow A_{R} \longrightarrow B_{P} \longrightarrow B_{A} \longrightarrow B_{R} \]
     but may also be
     \[ A_{P} \longrightarrow B_{P} \longrightarrow B_{A} \longrightarrow B_{R} \longrightarrow A_{A} \longrightarrow A_{R} \]
     Thread $B$ stole thread $A$'s message! (sketched on the next slide)
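   \end{itemize}
 \end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{MPI matching probe}
+  \framesubtitle{The race, sketched}
+  An illustrative sketch of the racy pattern (the tag and the buffer type are
+  placeholders, not taken from the actual exercises):
+  \begin{cxxcode}{}
+    #pragma omp parallel
+    {
+      MPI_Status status;
+      int count;
+      // Probe: another thread may match the same incoming message
+      MPI_Probe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &status);
+      MPI_Get_count(&status, MPI_INT, &count);
+      // Allocate according to the probed size
+      std::vector<int> buffer(count);
+      // Receive: nothing binds this receive to the message probed above,
+      // so it may get a different (or stolen) message
+      MPI_Recv(buffer.data(), count, MPI_INT, status.MPI_SOURCE,
+               status.MPI_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    }
+  \end{cxxcode}
+\end{frame}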
 \begin{frame}[fragile]
   \frametitle{MPI matching probe}
   \begin{itemize}
   \item The solution to this problem is the matching probe
   \item MPI provides two versions, \code{MPI\_Improbe} and \code{MPI\_Mprobe}
   \item It allows receiving only the message matched by a specific probe
   \end{itemize}
+  \vfill
+  \pause
+  \begin{itemize}
+  \item The counterpart operations are the matching receives \code{MPI\_Imrecv} and
+    \code{MPI\_Mrecv}
+  \item They are used to receive messages that have been previously matched by
+    a matching probe (a sketch is given on the last slide)
+  \end{itemize}
 \end{frame}
+
+\begin{frame}
+  \frametitle{Concluding remarks}
+
+  \begin{itemize}
+  \item Always keep in mind that you are mixing threads and processes
+  \item You will need to test your code's performance on every machine
+  \item There is no magic rule for the best configuration to use
+  \item Often 1 MPI task per NUMA region seems to give the best performance
+  \end{itemize}
+\end{frame}
+
+% \begin{frame}
+% \frametitle{Topology problems}
+% How to deal with :
+
+% \begin{itemize}
+% \item topology / mapping ? (Which physical core is assigned to which process/thread)
+% \item sub-domain decomposition ?
+% \item halos size ? halos shapes ?
+% \item unnecessary communications ?
+% \item \textbf{computation to communication ratio} ?
+% \end{itemize}
+
+% Pure MPI ? Hybrid ?
+% \\
+% \textbf{A good solution is : one MPI process per NUMA region}
+% \end{frame}
+
+% \begin{frame}
+% \frametitle{Halo regions}
+% % Neil, p. 54
+% \begin{itemize}
+% \item Halo regions are local copies of remote data that are needed for computations
+% \item Halo regions need to be copied frequently
+% \item Using threads reduces the size of halo region copies that need to be stored
+% \item Reducing halo region sizes also reduces communication requirements
+% \end{itemize}
+% \end{frame}
+
+% \begin{frame}
+% \frametitle{Take-home messages}
+% \begin{itemize}
+% \item Always take into account the problems related to the physical topology
+% \item A real application is not as easy as a hello world.
+% \item Some clusters have different connectivity topologies: match them to your problem. Examples of hardware topologies :
+% \begin{itemize}
+% \item all-to-all
+% \item 2D/3D torus
+% \item tree
+% \item ...
+% \end{itemize}
+% \item One MPI process per physical node
+% \end{itemize}
+% \end{frame}
+
+% \begin{frame}[fragile]
+% \frametitle{Main messages}
+% \begin{itemize}
+% \item Do not use hybrid if the pure MPI code scales ok
+% \item Be aware of intranode MPI behavior
+% \item Always observe the topology dependence of
+% \begin{itemize}
+% \item Intranode MPI
+% \item Threads overheads
+% \end{itemize}
+% \item Finally: Always compare the best pure MPI code with the best hybrid code!
+% \end{itemize}
+
+% \end{frame}
+
+% \begin{frame}[fragile]
+% \frametitle{Examples that \textit{can} benefit of an hybrid approach}
+% \begin{itemize}
+% \item MPI codes with a lot of all-to-all communications
+% \item MPI codes with a very poor load balancing at the algorithmic level (less communications)
+% \item MPI codes with memory limitations
+% \item MPI codes that can be easily \textit{fine-grained parallelized} (at loop level)
+% \end{itemize}
+% \end{frame}
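+
+\begin{frame}[fragile]
+  \frametitle{MPI matching probe}
+  \framesubtitle{Thread-safe version, sketched}
+  An illustrative sketch of the same pattern with the matching probe/receive
+  (the tag and the buffer type are placeholders):
+  \begin{cxxcode}{}
+    #pragma omp parallel
+    {
+      MPI_Message message;
+      MPI_Status status;
+      int count;
+      // The matched message is removed from the matching queue and can only
+      // be received through the returned message handle
+      MPI_Mprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &message, &status);
+      MPI_Get_count(&status, MPI_INT, &count);
+      std::vector<int> buffer(count);
+      // No other thread can steal this message
+      MPI_Mrecv(buffer.data(), count, MPI_INT, &message, MPI_STATUS_IGNORE);
+    }
+  \end{cxxcode}
+\end{frame}
+
 %%% Local Variables:
 %%% mode: latex
 %%% TeX-master: "../../phys_743_parallel_programming"
 %%% End: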