diff --git a/src/hybrid/hybrid.tex b/src/hybrid/hybrid.tex
index f1a4986..7a642f3 100644
--- a/src/hybrid/hybrid.tex
+++ b/src/hybrid/hybrid.tex
@@ -1,294 +1,294 @@
\renewcommand{\FIGREP}{src/hybrid/figures}

\section{Hybrid Programming}

\begin{frame}
  \frametitle{What will we learn today?}
  \begin{itemize}
  \item {Comparison of hybrid programming models
      \begin{itemize}
      \item {Pure MPI}
      \item {MPI + OpenMP}
      \item {(MPI + MPI one-sided (MPI-3))}
      \end{itemize}
    }
  \item {How to write a (production) project proposal}
  \end{itemize}
\end{frame}

\subsection{Hybrid programming models}

\begin{frame}
  \frametitle{Situation}
  \onslide<1>\addimage[width=14cm]{\FIGREP/situation_1}{2cm}{2cm}
  \onslide<2>\addimage[width=14cm]{\FIGREP/situation_2}{2cm}{2cm}
\end{frame}

\begin{frame}
  \frametitle{Situation: problems}
  \begin{itemize}
  \item {Thread safety?}
  \item {Which thread/process can/will call the MPI library?}
  \item {MPI process placement in the case of multi-CPU processors?}
  \item {Data visibility? OpenMP private?}
  \item {Does my problem fit the targeted machine?}
  \item {Levels of parallelism within my problem?}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Hybrid vs. Pure MPI}
  \textbf{Pure MPI}
  \begin{itemize}
  \item[\textcolor{green!40!black}{\textbf{+}}] no code modification
  \item[\textcolor{green!40!black}{\textbf{+}}] most libraries support multi-threading
  \item[\textcolor{red}{\textbf{--}}] does the application topology fit the system topology?
  \item[\textcolor{red}{\textbf{--}}] unnecessary communications
  \end{itemize}
  \textbf{Hybrid}
  \begin{itemize}
  \item[\textcolor{green!40!black}{\textbf{+}}] no messages within an SMP node
  \item[\textcolor{green!40!black}{\textbf{+}}] fewer (or no) topology problems
  \item[\textcolor{red}{\textbf{--}}] all threads sleep while the master communicates
  \item[\textcolor{red}{\textbf{--}}] the MPI library must support (at least) thread safety
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Hybrid MPI/OpenMP hello world}
  \cxxfile[%
  title={hybrid/hello\_world.cc},
  minted options app={
    firstline=1,
  }]{examples/hybrid/hello_world.cc}
  % Need to overlap communications (from the master) with computation on the other threads, if possible!
\end{frame}

\begin{frame}[fragile]
  \frametitle{Hybrid MPI/OpenMP hello world}
  Compilation using the GNU g++ compiler:
\begin{bashcode}
$> mpicxx -fopenmp hello_world.cc -o hello_world
\end{bashcode}%$

  Compilation using the Intel C++ compiler:
\begin{bashcode}
$> mpiicpc -qopenmp hello_world.cc -o hello_world
\end{bashcode}%$
  \vfill
  \textcolor{red}{\textbf{Warning}}\\
  When using Intel MPI with \code{mpiicc} or \code{mpiifort}, it is mandatory to link against the thread-safe library (\code{-mt\_mpi}), or at least to check that the executable has been linked against it (\code{ldd hello} should print \code{libmpi\_mt.so.12}).
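  \vfill
  A quick sanity check (illustrative output only; the library path and version on your system may differ):
\begin{bashcode}
$> ldd hello_world | grep libmpi
  libmpi_mt.so.12 => /path/to/intel/mpi/lib/libmpi_mt.so.12
\end{bashcode}%$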
\end{frame}

\begin{frame}[fragile]
  \frametitle{Submission script for the clusters}
\begin{bashcode}
#!/bin/bash
#SBATCH --nodes 1
#SBATCH --ntasks 2
#SBATCH --cpus-per-task 3

export OMP_NUM_THREADS=3

srun -n 2 ./hello_world
\end{bashcode}
  \vfill
  This starts 2 MPI processes, each of which spawns 3 threads:
  \vfill
\begin{consoleoutput}
Hello from thread 0 out of 3 from process 0 out of 2
Hello from thread 1 out of 3 from process 0 out of 2
Hello from thread 0 out of 3 from process 1 out of 2
Hello from thread 1 out of 3 from process 1 out of 2
Hello from thread 2 out of 3 from process 0 out of 2
Hello from thread 2 out of 3 from process 1 out of 2
\end{consoleoutput}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Changes to your code}
  \begin{itemize}
  \item Change your MPI initialisation routine
    \begin{itemize}
    \item \code{MPI\_Init} is replaced by \code{MPI\_Init\_thread}
    \item \code{MPI\_Init\_thread} has two additional parameters: the level of thread support required, and the level of thread support provided by the library implementation
    \end{itemize}
    \begin{cxxcode}{}
      int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
    \end{cxxcode}

  \item Make sure that the \textit{provided} support matches the \textit{required} one
  \item Add OpenMP directives, as long as you stick to the level of thread safety you specified in the call to \code{MPI\_Init\_thread}
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{The 4 Options for Thread Support}
  % \framesubtitle{I: user informs the MPI library}
  \begin{itemize}
  \item \code{MPI\_THREAD\_SINGLE}
    \begin{itemize}
    \item Only one thread will execute
    \item Standard MPI-only application
    \end{itemize}
  \item \code{MPI\_THREAD\_FUNNELED}
    \begin{itemize}
    \item Only the master thread will make calls to the MPI library (a minimal usage sketch is shown at the end of these slides)
      % \item $\rightarrow$ The thread that calls \code{MPI\_Init\_thread} is the master thread
    \item A thread can determine whether it is the master thread with a call to \cxxinline{MPI\_Is\_thread\_main}
    \end{itemize}
  \item \code{MPI\_THREAD\_SERIALIZED}
    \begin{itemize}
    \item Only one thread at a time will make calls to the MPI library, but all threads are eligible to make such calls
    \end{itemize}
  \end{itemize}
  % \begin{Verbatim}[formatcom=\color{blue}]
  %   int MPI_Is_main_thread(int * flag);
  % \end{Verbatim}
\end{frame}

\begin{frame}[fragile]
  \frametitle{The 4 Options for Thread Support}
  % \framesubtitle{II: The MPI Library is responsible for Thread Safety}
  \begin{itemize}
  \item \code{MPI\_THREAD\_MULTIPLE}
    \begin{itemize}
    \item Any thread may call the MPI library at any time
    \item The MPI library is responsible for thread safety within that library, and for any libraries that it in turn uses
    \item Codes that rely on \code{MPI\_THREAD\_MULTIPLE} may run significantly slower than with one of the other options
    \item You might need to link in a separate library in order to get this level of support
    \end{itemize}
  \end{itemize}
  In most cases, \code{MPI\_THREAD\_FUNNELED} is the best choice for hybrid programs
  \vfill
  \begin{cxxcode}{}
    int MPI_Query_thread(int *thread_level_provided);
  \end{cxxcode}
  Returns the level of thread support provided by the MPI library
\end{frame}

\begin{frame}
  \frametitle{Topology problems}
  % The problem: we have a domain with 80 subdomains as follows:
  %
  % \begin{center}
  %   \begin{tikzpicture}%
  %     \draw[step=4mm] (0,0) grid (6.4,2.0);
  %   \end{tikzpicture}
  % \end{center}
  How to deal with:
  \begin{itemize}
  \item topology / mapping?
    (which physical core is assigned to which process/thread)
  \item sub-domain decomposition?
  \item halo sizes? halo shapes?
  \item unnecessary communications?
  \item \textbf{computation to communication ratio}?
  \end{itemize}
  Pure MPI? Hybrid?\\
  \textbf{A good solution is: one MPI process per ``SMP'' node}
\end{frame}

\begin{frame}
  \frametitle{Halo regions}
  % Neil, p. 54
  \begin{itemize}
  \item Halo regions are local copies of remote data that are needed for computations
  \item Halo regions need to be copied frequently
  \item Using threads reduces the size of the halo region copies that need to be stored
  \item Reducing halo region sizes also reduces communication requirements
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Take-home messages}
  \begin{itemize}
  \item Always take into account the problems related to the physical topology
  \item A real application is not as easy as a hello world.
  \item Clusters have different connectivity topologies: match them to your problem. Examples of hardware topologies:
    \begin{itemize}
    \item all-to-all
    \item 2D/3D torus
    \item tree
    \item ...
    \end{itemize}
  \item One MPI process per physical node
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Main messages}
  \begin{itemize}
  \item Do not go hybrid if the pure MPI code already scales well
  \item Be aware of intranode MPI behavior
  \item Always observe the topology dependence of
    \begin{itemize}
    \item intranode MPI
    \item thread overheads
    \end{itemize}
  \item Finally: always compare the best pure MPI code with the best hybrid code!
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Examples that \textit{can} benefit from a hybrid approach}
  \begin{itemize}
  \item MPI codes with a lot of all-to-all communications
  \item MPI codes with very poor load balancing at the algorithmic level (fewer communications)
  \item MPI codes with memory limitations
  \item MPI codes that can easily be \textit{fine-grain} parallelized (at the loop level)
  \end{itemize}
\end{frame}
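\begin{frame}[fragile]
  \frametitle{MPI\_THREAD\_FUNNELED: a minimal sketch}
  A minimal sketch of the funneled pattern (not one of the course example files): every thread computes, but only the master thread calls MPI.
  \begin{cxxcode}{}
    #include <mpi.h>
    #include <omp.h>
    #include <cstdio>

    int main(int argc, char *argv[]) {
      int provided;
      MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
      if (provided < MPI_THREAD_FUNNELED) MPI_Abort(MPI_COMM_WORLD, 1);

      int rank;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      int data = rank;
    #pragma omp parallel
      {
        // ... all threads compute here ...
    #pragma omp master   // only the master thread talks to MPI
        MPI_Bcast(&data, 1, MPI_INT, 0, MPI_COMM_WORLD);
    #pragma omp barrier  // the other threads wait before using 'data'
        std::printf("thread %d of rank %d sees %d\n",
                    omp_get_thread_num(), rank, data);
      }

      MPI_Finalize();
      return 0;
    }
  \end{cxxcode}
\end{frame}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End: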