\renewcommand{\FIGREP}{src/hybrid/figures}
\section{Hybrid Programming}
\begin{frame}
\frametitle{What will we learn today?}
\begin{itemize}
\item {Comparison of hybrid programming models
\begin{itemize}
\item {Pure MPI}
\item {MPI+OpenMP}
\item {(MPI + MPI one-sided (MPI-3))}
\end{itemize}
}
\item {How to write a (production) project proposal}
\end{itemize}
\end{frame}
\subsection{Hybrid programming models}
\begin{frame}
\frametitle{Situation}
\onslide<1>\addimage[width=14cm]{\FIGREP/situation_1}{2cm}{2cm}
\onslide<2>\addimage[width=14cm]{\FIGREP/situation_2}{2cm}{2cm}
\end{frame}
\begin{frame}
\frametitle{Situation: problems}
\begin{itemize}
\item {Thread safety?}
\item {Which thread/process can/will call the MPI library?}
\item {MPI process placement in the case of multi-socket nodes?}
\item {Data visibility? OpenMP private?}
\item {Does my problem fit the targeted machine?}
\item {Levels of parallelism within my problem?}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Hybrid vs. Pure MPI}
\textbf{Pure MPI}
\begin{itemize}
\item[\textcolor{green!40!black}{\textbf{+}}] no code modification
\item[\textcolor{green!40!black}{\textbf{+}}] most libraries support multi-threading
\item[\textcolor{red}{\textbf{--}}] does the application topology fit the system topology?
\item[\textcolor{red}{\textbf{--}}] unnecessary communications
\end{itemize}
\textbf{Hybrid}
\begin{itemize}
\item[\textcolor{green!40!black}{\textbf{+}}] no messages within a NUMA region
\item[\textcolor{green!40!black}{\textbf{+}}] fewer (or no) topology problems
\item[\textcolor{red}{\textbf{--}}] all threads sleep while the master communicates
\item[\textcolor{red}{\textbf{--}}] the MPI library must support (at least) the required level of thread safety
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Hybrid MPI/OpenMP hello world}
\cxxfile[%
title={hybrid/hello\_world.cc},
minted options app={
firstline=1,
}]{examples/hybrid/hello_world.cc}
% Need to overlap communications (from the master) with computation on the other threads, if possible!
\end{frame}
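\begin{frame}[fragile]
\frametitle{Hybrid MPI/OpenMP hello world}
A minimal version of such a hello world (a sketch for reference; the actual \code{examples/hybrid/hello\_world.cc} may differ):
\begin{cxxcode}{}
#include <cstdio>
#include <mpi.h>
#include <omp.h>

int main(int argc, char *argv[]) {
  int provided, rank, size;
  // Request funneled support: only the master thread will call MPI
  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
#pragma omp parallel
  std::printf("Hello from thread %d out of %d from process %d out of %d\n",
              omp_get_thread_num(), omp_get_num_threads(), rank, size);
  MPI_Finalize();
  return 0;
}
\end{cxxcode}
\end{frame}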
\begin{frame}[fragile]
\frametitle{Hybrid MPI/OpenMP hello world}
Compilation using the GNU g++ compiler:
\begin{bashcode}
$> mpicxx -fopenmp hello_world.cc -o hello_world
\end{bashcode}%$
Compilation using the Intel C++ compiler:
\begin{bashcode}
$> mpiicpc -qopenmp hello_world.cc -o hello_world
\end{bashcode}%$
\end{frame}
\begin{frame}[fragile]
\frametitle{Submission script for the clusters}
\begin{bashcode}
#!/bin/bash
#SBATCH --nodes 1
#SBATCH --ntasks 2
#SBATCH --cpus-per-task 3
export OMP_NUM_THREADS=3
srun -n 2 ./hello_world
\end{bashcode}
\vfill
It will start 2 MPI processes, each spawning 3 threads:
\vfill
\begin{consoleoutput}
Hello from thread 0 out of 3 from process 0 out of 2
Hello from thread 1 out of 3 from process 0 out of 2
Hello from thread 0 out of 3 from process 1 out of 2
Hello from thread 1 out of 3 from process 1 out of 2
Hello from thread 2 out of 3 from process 0 out of 2
Hello from thread 2 out of 3 from process 1 out of 2
\end{consoleoutput}
\end{frame}
\begin{frame}[fragile]
\frametitle{Changes to your code}
\begin{itemize}
\item Change your MPI initialisation routine
\begin{itemize}
\item \code{MPI\_Init} is replaced by \code{MPI\_Init\_thread}
\item \code{MPI\_Init\_thread} has two additional parameters for the level of thread support
required, and for the level of thread support provided by the library
implementation
\end{itemize}
\begin{cxxcode}{}
int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
\end{cxxcode}
\item Make sure that the \textit{provided} support matches the
\textit{required} one
\begin{cxxcode}{}
if (provided < required)
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
\end{cxxcode}
\item Add OpenMP directives as long as you stick to the level of thread safety you specified in the call to \code{MPI\_Init\_thread}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{The 4 Options for Thread Support}
% \framesubtitle{I: user informs the MPI library}
\begin{itemize}
\item \code{MPI\_THREAD\_SINGLE}
\begin{itemize}
\item Only one thread will execute
\item Standard MPI-only application
\end{itemize}
\item \code{MPI\_THREAD\_FUNNELED}
\begin{itemize}
\item Only the Master Thread will make calls to the MPI library
% \item $\rightarrow$ The thread that calls \code{MPI\_Init\_thread} is the master thread
\item A thread can determine whether it is the master thread by a call to \code{MPI\_Is\_thread\_main}
\end{itemize}
\item \code{MPI\_THREAD\_SERIALIZED}
\begin{itemize}
\item Only one thread at a time will make calls to the MPI library, but all threads are eligible to make such calls
\end{itemize}
\end{itemize}
\begin{cxxcode}{}
int MPI_Is_thread_main(int *flag);
\end{cxxcode}
\end{frame}
\begin{frame}[fragile]
\frametitle{The 4 Options for Thread Support}
% \framesubtitle{II: The MPI Library is responsible for Thread Safety}
\begin{itemize}
\item \code{MPI\_THREAD\_MULTIPLE}
\begin{itemize}
\item Any thread may call the MPI library at any time
\item The MPI library is responsible for thread safety within that
library, and for any libraries that it in turn uses
\item Codes that rely on the level of \code{MPI\_THREAD\_MULTIPLE} may run
significantly slower than the case where one of the other options
has been chosen
\item You might need to link in a separate library in order to get this
level of support
\end{itemize}
\end{itemize}
In most cases, \code{MPI\_THREAD\_FUNNELED} is the best choice for hybrid
programs (see the sketch on the next slide)
\vfill
\begin{cxxcode}{}
int MPI_Query_thread(int * thread_level_provided);
\end{cxxcode}
Returns the level of thread support provided by the MPI library
\end{frame}
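\begin{frame}[fragile]
\frametitle{The funneled pattern in practice}
A minimal sketch of the \code{MPI\_THREAD\_FUNNELED} pattern (illustrative; \code{buffer} and \code{n} are assumed to be a shared array of \code{n} doubles): only the master thread communicates, the others wait at barriers.
\begin{cxxcode}{}
#pragma omp parallel
{
  // ... threaded computation filling buffer ...
#pragma omp barrier
#pragma omp master
  {
    // Only the master thread calls MPI (funneled level)
    MPI_Allreduce(MPI_IN_PLACE, buffer, n, MPI_DOUBLE, MPI_SUM,
                  MPI_COMM_WORLD);
  }
#pragma omp barrier // the other threads wait for the communication
  // ... continue computing with the reduced data ...
}
\end{cxxcode}
This also illustrates why \textit{all threads sleep while the master communicates}: overlap is only possible if some threads skip the barrier and keep computing.
\end{frame}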
\begin{frame}
\frametitle{Topology problems}
% The problem: we have a domain with 80 subdomains as follows:
%
% \begin{center}
% \begin{tikzpicture}%
% \draw[step=4mm] (0,0) grid (6.4,2.0);
% \end{tikzpicture}
% \end{center}
How to deal with:
\begin{itemize}
\item topology/mapping? (which physical core is assigned to which process/thread)
\item sub-domain decomposition?
\item halo sizes? halo shapes?
\item unnecessary communications?
\item \textbf{computation to communication ratio}?
\end{itemize}
Pure MPI? Hybrid?
\\
\textbf{A good solution is: one MPI process per NUMA region}
\end{frame}
\begin{frame}
\frametitle{Halo regions}
% Neil, p. 54
\begin{itemize}
\item Halo regions are local copies of remote data that are needed for computations
\item Halo regions need to be copied frequently
\item Using threads (fewer, larger subdomains per node) reduces the amount of halo data that needs to be stored (see the estimate below)
\item Reducing halo region sizes also reduces communication requirements
\end{itemize}
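\vfill
As a rough estimate (a sketch, assuming a cubic subdomain of side $n$ with a one-cell halo): computation scales as $n^3$ while halo data scales as $\approx 6 n^2$, so the computation-to-communication ratio grows as $n/6$; fewer but larger subdomains (threads instead of processes within a node) therefore improve it.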
\end{frame}
\begin{frame}
\frametitle{Take-home messages}
\begin{itemize}
\item Always take into account the problems related to the physical topology
\item A real application is not as easy as a hello world.
\item Some clusters have different connectivity topologies: match them to your problem. Examples of hardware topologies:
\begin{itemize}
\item all-to-all
\item 2D/3D torus
\item tree
\item ...
\end{itemize}
\item One MPI process per physical node
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Main messages}
\begin{itemize}
\item Do not use hybrid if the pure MPI code scales well
\item Be aware of intranode MPI behavior
\item Always observe the topology dependence of
\begin{itemize}
\item Intranode MPI
\item Thread overheads
\end{itemize}
\item Finally: Always compare the best pure MPI code with the best hybrid code!
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Examples that \textit{can} benefit from a hybrid approach}
\begin{itemize}
\item MPI codes with a lot of all-to-all communications
\item MPI codes with very poor load balancing at the algorithmic level (less communication)
\item MPI codes with memory limitations
\item MPI codes that are easy to parallelize at a \textit{fine-grained} (loop) level
\end{itemize}
\end{frame}
\subsection{MPI partitioned communications}
\begin{frame}[fragile]
\frametitle{MPI partitioned communications}
\begin{itemize}
\item New feature of the MPI 4.0 standard (June 2021!)
\item We have already talked about persistent point-to-point communications
\item Partitioned communications are persistent communications where the message is
constructed in partitions
\item Typical case: multi-threading with each thread building a portion of
the message
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{MPI partitioned communications}
\begin{itemize}
\item Remember the typical cycle for persistent point-to-point
communications \\
\begin{center}
Init \qquad (Start \quad Test/Wait)* \qquad Free\\
\end{center}
where * means zero or more
\item Partitioned communications follow a very similar cycle \\
\begin{center}
Psend\_init \qquad (Start \quad Pready\ldots \quad Test/Wait)* \qquad Free\\
\end{center}
\end{itemize}
\begin{cxxcode}{}
int flag = 0;
MPI_Psend_init(msg, parts, count, MPI_INT, dest, tag, MPI_COMM_WORLD, info,
               &request);
MPI_Start(&request);
#pragma omp parallel for shared(request)
for (int i = 0; i < parts; ++i) {
  /* compute and fill partition #i, then mark it ready */
  MPI_Pready(i, request);
}
while (!flag) {
  /* Do useful work */
  MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
  /* Do useful work */
}
MPI_Request_free(&request);
\end{cxxcode}
\end{frame}
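\begin{frame}[fragile]
\frametitle{MPI partitioned communications}
A possible receiver-side counterpart (a sketch, assuming the same \code{msg}, \code{parts}, \code{count} and \code{tag} as on the sender side): partitions can be consumed as soon as \code{MPI\_Parrived} reports their arrival.
\begin{cxxcode}{}
MPI_Precv_init(msg, parts, count, MPI_INT, source, tag, MPI_COMM_WORLD,
               info, &request);
MPI_Start(&request);
for (int i = 0; i < parts; ++i) {
  int arrived = 0;
  while (!arrived) { /* poll until partition #i has arrived */
    MPI_Parrived(request, i, &arrived);
  }
  /* process partition #i */
}
MPI_Wait(&request, MPI_STATUS_IGNORE); /* complete the whole receive */
MPI_Request_free(&request);
\end{cxxcode}
\end{frame}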
\subsection{MPI matching probe}
\begin{frame}[fragile]
\frametitle{MPI matching probe}
\begin{itemize}
\item We have already talked about \code{MPI\_Probe} to obtain information
about a message waiting to be received
\item This is typically used when the size of the message is unknown (probe,
allocate, receive)
\pause
\vfill
\item Care must be taken because it is a stateful operation: \\
\textit{A subsequent receive [...] will receive the message that was
matched by the probe, \textbf{if no other intervening receive occurs after the
probe} [...]}
\pause
\vfill
\item Problem with multi-threading!
\item Imagine two threads $A$ and $B$ that must do a Probe, Allocation, and
Receive
\[
A_{P} \longrightarrow A_{A} \longrightarrow A_{R} \longrightarrow B_{P} \longrightarrow B_{A} \longrightarrow B_{R}
\]
but the interleaving may also be
\[
A_{P} \longrightarrow B_{P} \longrightarrow B_{A} \longrightarrow B_{R} \longrightarrow A_{A} \longrightarrow A_{R}
\]
Thread $B$ stole thread $A$'s message!
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{MPI matching probe}
\begin{itemize}
\item The solution to this problem is the matching probe
\item MPI provides two versions, \code{MPI\_Improbe} and \code{MPI\_Mprobe}
\item They return an \code{MPI\_Message} handle that a subsequent \code{MPI\_Mrecv} (or \code{MPI\_Imrecv}) uses to receive exactly the matched message, as sketched below
\end{itemize}
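\vfill
A minimal sketch of the (probe, allocate, receive) pattern with the matching probe (an illustrative fragment; receives \code{count} integers into a \code{std::vector}):
\begin{cxxcode}{}
MPI_Message msg;
MPI_Status status;
int count;
// Match the message: no other thread can steal it afterwards
MPI_Mprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &msg, &status);
MPI_Get_count(&status, MPI_INT, &count);
std::vector<int> buffer(count); // allocate to the probed size
// Receive exactly the matched message through its handle
MPI_Mrecv(buffer.data(), count, MPI_INT, &msg, &status);
\end{cxxcode}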
\end{frame}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End:
