diff --git a/src/hybrid/hybrid.tex b/src/hybrid/hybrid.tex
index 7a642f3..2a9b77b 100644
--- a/src/hybrid/hybrid.tex
+++ b/src/hybrid/hybrid.tex
@@ -1,294 +1,369 @@
\renewcommand{\FIGREP}{src/hybrid/figures}

\section{Hybrid Programming}

\begin{frame}
  \frametitle{What will we learn today ?}
  \begin{itemize}
  \item {Hybrid programming models comparison
      \begin{itemize}
      \item {Pure MPI}
      \item {MPI+OpenMP}
      \item {(MPI + MPI one-sided (MPI-3) )}
      \end{itemize}
    }
  \item {How to write a (production) project proposal}
  \end{itemize}
\end{frame}

\subsection{Hybrid programming models}

\begin{frame}
  \frametitle{Situation}
  \onslide<1>\addimage[width=14cm]{\FIGREP/situation_1}{2cm}{2cm}
  \onslide<2>\addimage[width=14cm]{\FIGREP/situation_2}{2cm}{2cm}
\end{frame}

\begin{frame}
  \frametitle{Situation : problems}
  \begin{itemize}
  \item {Thread safety ?}
  \item {Which thread/process can/will call the MPI library ?}
  \item {MPI process placement in the case of multi-CPU processors ?}
  \item {Data visibility ? OpenMP private ?}
  \item {Does my problem fit with the targeted machine ?}
  \item {Levels of parallelism within my problem ?}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Hybrid vs. Pure MPI}
  \textbf{Pure MPI}
  \begin{itemize}
  \item[\textcolor{green!40!black}{\textbf{+}}] no code modification
  \item[\textcolor{green!40!black}{\textbf{+}}] most of the libraries support multi-threading
  \item[\textcolor{red}{\textbf{--}}] does the application topology fit the system topology ?
  \item[\textcolor{red}{\textbf{--}}] useless communications
  \end{itemize}
  \textbf{Hybrid}
  \begin{itemize}
-  \item[\textcolor{green!40!black}{\textbf{+}}] no message within an SMP node
+  \item[\textcolor{green!40!black}{\textbf{+}}] no message within a NUMA region
  \item[\textcolor{green!40!black}{\textbf{+}}] less (no) topology problems
  \item[\textcolor{red}{\textbf{--}}] all threads sleep when the master communicates
  \item[\textcolor{red}{\textbf{--}}] MPI libs must support (at least) thread safety
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Hybrid MPI/OpenMP hello world}
  \cxxfile[%
  title={hybrid/hello\_world.cc},
  minted options app={
    firstline=1,
  }]{examples/hybrid/hello_world.cc}
  % Need to overlap communications (from the master) with computation on the other threads. If possible !
\end{frame}

\begin{frame}[fragile]
  \frametitle{Hybrid MPI/OpenMP hello world}
  Compilation using the GNU g++ compiler:
  \begin{bashcode}
    $> mpicxx -fopenmp hello_world.cc -o hello_world
  \end{bashcode}%$

  Compilation using the Intel C++ compiler:
  \begin{bashcode}
    $> mpiicpc -qopenmp hello_world.cc -o hello_world
  \end{bashcode}%$
-  \vfill
-  \textcolor{red}{\textbf{Warning}}\\
-  When using Intel MPI : it is mandatory to link against the thread-safe library
-  (\code{-mt\_mpi}) or at least to check if the executable has been linked
-  against this lib (\code{ldd hello} should print \code{libmpi\_mt.so.12}) if the
-  \code{mpiicc} or \code{mpiifort} has been used.
-
\end{frame}

\begin{frame}[fragile]
  \frametitle{Submission script for the clusters}
  \begin{bashcode}
    #!/bin/bash
    #SBATCH --nodes 1
    #SBATCH --ntasks 2
    #SBATCH --cpus-per-task 3

    export OMP_NUM_THREADS=3
    srun -n 2 ./hello_world
  \end{bashcode}
  \vfill
  It will start 2 MPI processes, each spawning 3 threads
  \vfill
  \begin{consoleoutput}
    Hello from thread 0 out of 3 from process 0 out of 2
    Hello from thread 1 out of 3 from process 0 out of 2
    Hello from thread 0 out of 3 from process 1 out of 2
    Hello from thread 1 out of 3 from process 1 out of 2
    Hello from thread 2 out of 3 from process 0 out of 2
    Hello from thread 2 out of 3 from process 1 out of 2
  \end{consoleoutput}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Changes to your code}
  \begin{itemize}
  \item Change your MPI initialisation routine
    \begin{itemize}
    \item \code{MPI\_Init} is replaced by \code{MPI\_Init\_thread}
    \item \code{MPI\_Init\_thread} has two additional parameters for the level of thread
      support required, and for the level of thread support provided by the library
      implementation
    \end{itemize}
    \begin{cxxcode}{}
      int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
    \end{cxxcode}
-  \item Make sure that the \textit{provided} support matches the \textit{required} one
+  \item Make sure that the \textit{provided} support matches the
+    \textit{required} one
+    \begin{cxxcode}{}
+      if (provided < required)
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+    \end{cxxcode}
  \item Add OpenMP directives as long as you stick to the level of thread safety you
    specified in the call to \code{MPI\_Init\_thread}
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{The 4 Options for Thread Support}
  % \framesubtitle{I: user informs the MPI library}
-
  \begin{itemize}
  \item \code{MPI\_THREAD\_SINGLE}
    \begin{itemize}
    \item Only one thread will execute
    \item Standard MPI-only application
    \end{itemize}
  \item \code{MPI\_THREAD\_FUNNELED}
    \begin{itemize}
    \item Only the Master Thread will make calls to the MPI library
      % \item $\rightarrow$ The thread that calls \code{MPI\_Init\_thread} is the master thread
    \item A thread can determine whether it is the master thread by a call to
      \cxxinline{MPI\_Is\_thread\_main}
    \end{itemize}
  \item \code{MPI\_THREAD\_SERIALIZED}
    \begin{itemize}
    \item Only one thread at a time will make calls to the MPI library, but all
      threads are eligible to make such calls
    \end{itemize}
  \end{itemize}
  % \begin{Verbatim}[formatcom=\color{blue}]
  % int MPI_Is_main_thread(int * flag);
  % \end{Verbatim}
\end{frame}

\begin{frame}[fragile]
  \frametitle{The 4 Options for Thread Support}
  % \framesubtitle{II: The MPI Library is responsible for Thread Safety}
  \begin{itemize}
  \item \code{MPI\_THREAD\_MULTIPLE}
    \begin{itemize}
    \item Any thread may call the MPI library at any time
    \item The MPI library is responsible for thread safety within that library, and
      for any libraries that it in turn uses
    \item Codes that rely on the level of \code{MPI\_THREAD\_MULTIPLE} may run
      significantly slower than the case where one of the other options has been chosen
    \item You might need to link in a separate library in order to get this level of
      support
    \end{itemize}
  \end{itemize}
  In most cases \code{MPI\_THREAD\_FUNNELED} provides the best choice for hybrid programs
  \vfill
  \begin{cxxcode}{}
    int MPI_Query_thread(int * thread_level_provided);
  \end{cxxcode}
  Returns the level of thread support provided by the MPI library
\end{frame}
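% A minimal sketch of the MPI_THREAD_FUNNELED pattern discussed above: only the
% master thread calls MPI inside the OpenMP parallel region. do_local_work() and
% global_sum are hypothetical placeholders.
%
% int provided;
% MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
% if (provided < MPI_THREAD_FUNNELED)
%   MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
%
% #pragma omp parallel
% {
%   do_local_work();                         // all threads compute
%   #pragma omp barrier
%   #pragma omp master                       // only the master thread calls MPI
%   MPI_Allreduce(MPI_IN_PLACE, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
%   #pragma omp barrier                      // others wait for the result
% }
%
% MPI_Finalize();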
\begin{frame}
  \frametitle{Topology problems}
  % The problem: we have a domain with 80 subdomains as follows:
  %
  % \begin{center}
  % \begin{tikzpicture}%
  %   \draw[step=4mm] (0,0) grid (6.4,2.0);
  % \end{tikzpicture}
  % \end{center}
  How to deal with :
  \begin{itemize}
  \item topology / mapping ? (Which physical core is assigned to which process/thread)
  \item sub-domain decomposition ?
  \item halo sizes ? halo shapes ?
  \item unnecessary communications ?
  \item \textbf{computation to communication ratio} ?
  \end{itemize}
  Pure MPI ? Hybrid ? \\
-  \textbf{A good solution is : one MPI process per ``SMP'' node}
+  \textbf{A good solution is : one MPI process per NUMA region}
\end{frame}

\begin{frame}
  \frametitle{Halo regions}
  % Neil, p. 54
  \begin{itemize}
  \item Halo regions are local copies of remote data that are needed for computations
  \item Halo regions need to be copied frequently
  \item Using threads reduces the size of the halo region copies that need to be stored
  \item Reducing halo region sizes also reduces communication requirements
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Take-home messages}
  \begin{itemize}
  \item Always take into account the problems related to the physical topology
  \item A real application is not as easy as a hello world.
  \item Some clusters have different connectivity topologies: match them to your
    problem. Examples of hardware topologies :
    \begin{itemize}
    \item all-to-all
    \item 2D/3D torus
    \item tree
    \item ...
    \end{itemize}
  \item One MPI process per physical node
  \end{itemize}
\end{frame}
-
-
-
-
\begin{frame}[fragile]
  \frametitle{Main messages}
  \begin{itemize}
  \item Do not use hybrid if the pure MPI code scales OK
  \item Be aware of intranode MPI behavior
  \item Always observe the topology dependence of
    \begin{itemize}
    \item Intranode MPI
    \item Thread overheads
    \end{itemize}
  \item Finally: Always compare the best pure MPI code with the best hybrid code!
  \end{itemize}
\end{frame}
-
\begin{frame}[fragile]
  \frametitle{Examples that \textit{can} benefit from a hybrid approach}
  \begin{itemize}
  \item MPI codes with a lot of all-to-all communications
  \item MPI codes with a very poor load balancing at the algorithmic level (fewer communications)
  \item MPI codes with memory limitations
  \item MPI codes that can easily be \textit{fine-grain} parallelized (at loop level)
  \end{itemize}
+\end{frame}
+\subsection{MPI partitioned communications}
+\begin{frame}[fragile]
+  \frametitle{MPI partitioned communications}
+  \begin{itemize}
+  \item New feature of the MPI 4.0 standard (June 2021!)
+  \item We have already talked about persistent point-to-point communications
+  \item Partitioned communications are just persistent communications where
+    the message is constructed in partitions
+  \item Typical case: multi-threading, with each thread building a portion of
+    the message
+  \end{itemize}
\end{frame}
+\begin{frame}[fragile]
+  \frametitle{MPI partitioned communications}
+  \begin{itemize}
+  \item Remember the typical cycle for persistent point-to-point
+    communications \\
+    \begin{center}
+      Init \qquad (Start \quad Test/Wait)* \qquad Free\\
+    \end{center}
+    where * means zero or more
+  \item Partitioned communications are very similar \\
+    \begin{center}
+      Psend\_init \qquad (Start \quad Pready \quad Test/Wait)* \qquad Free\\
+    \end{center}
+  \end{itemize}
+
+  \begin{cxxcode}{}
+    int flag = 0;
+    MPI_Psend_init(msg, parts, count, MPI_INT, dest, tag, MPI_COMM_WORLD, info, &request);
+    MPI_Start(&request);
+    #pragma omp parallel for shared(request)
+    for (int i = 0; i < parts; ++i) {
+      /* compute and fill partition #i, then mark it ready: */
+      MPI_Pready(i, request);
+    }
+    while (!flag) {
+      /* Do useful work */
+      MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
+      /* Do useful work */
+    }
+    MPI_Request_free(&request);
+  \end{cxxcode}
+\end{frame}
+
+\subsection{MPI matching probe}
+\begin{frame}[fragile]
+  \frametitle{MPI matching probe}
+  \begin{itemize}
+  \item We have already talked about \code{MPI\_Probe} to obtain information
+    about a message waiting to be received
+  \item This is typically used when the size of the message is unknown (probe,
+    allocate, receive)
+    \pause
+    \vfill
+  \item Care must be taken because it is a stateful method: \\
+    \textit{A subsequent receive [...] will receive the message that was
+      matched by the probe, \textbf{if no other intervening receive occurs after the
+        probe} [...]}
+    \pause
+    \vfill
+  \item Problem with multi-threading!
+  \item Imagine two threads $A$ and $B$ that must each do a Probe, Allocation, and
+    Receive
+    \[
+      A_{P} \longrightarrow A_{A} \longrightarrow A_{R} \longrightarrow B_{P} \longrightarrow B_{A} \longrightarrow B_{R}
+    \]
+    but it may also be
+    \[
+      A_{P} \longrightarrow B_{P} \longrightarrow B_{A} \longrightarrow B_{R} \longrightarrow A_{A} \longrightarrow A_{R}
+    \]
+    Thread $B$ stole thread $A$'s message!
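+  \item A minimal sketch of the thread-safe alternative detailed on the next
+    slide (\code{MPI\_Mprobe} + \code{MPI\_Mrecv}); \code{src} and \code{tag}
+    are assumed to be defined
+    \begin{cxxcode}{}
+      MPI_Message msg;
+      MPI_Status status;
+      int count;
+      // probe and *remove* the message from the matching queue in one step
+      MPI_Mprobe(src, tag, MPI_COMM_WORLD, &msg, &status);
+      MPI_Get_count(&status, MPI_INT, &count);
+      std::vector<int> buffer(count);
+      // receive the matched message: no other thread can steal it
+      MPI_Mrecv(buffer.data(), count, MPI_INT, &msg, &status);
+    \end{cxxcode}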
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{MPI matching probe}
+  \begin{itemize}
+  \item The solution to this problem is the matching probe
+  \item MPI provides two versions, \code{MPI\_Improbe} and \code{MPI\_Mprobe}
+  \item It allows receiving exactly the message matched by the probe, through
+    the returned message handle and \code{MPI\_Mrecv}/\code{MPI\_Imrecv}
+  \end{itemize}
+\end{frame}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End:
diff --git a/src/mpi/mpi_advanced.tex b/src/mpi/mpi_advanced.tex
index 30df1ad..23afe8e 100644
--- a/src/mpi/mpi_advanced.tex
+++ b/src/mpi/mpi_advanced.tex
@@ -1,786 +1,786 @@
\renewcommand{\FIGREP}{src/mpi/figures}

\section{Advanced MPI}
\intersec{izar}

\begin{frame}
  \frametitle{Advanced MPI}
  \framesubtitle{Goals of this section}
  \begin{itemize}
  \item Overview of more advanced functionalities
  \item Persistent communications
  \item Advanced collective communications
  \item Describing your own datatype
  \item Redefining communicators
  \item Associating a topology to a communicator
  \item Parallel I/O
  \item One sided communications
  \end{itemize}
\end{frame}

\subsection{Persistent point to point}

\begin{frame}[fragile]
  \frametitle{Persistent communications}
  \framesubtitle{}
  \begin{itemize}
  \item \cxxinline{MPI_Send_init}, \cxxinline{MPI_Recv_init} initialize the communication
  \item Same signature as non-blocking communications
  \item \cxxinline{MPI_Start}, \cxxinline{MPI_Startall} to start the communication
  \item Completion is checked the same way as for non-blocking
  \end{itemize}
\end{frame}

\begin{frame}[exercise, fragile]
  \frametitle{Persistent communications}
  \framesubtitle{}
  \begin{itemize}
  \item Replace the non-blocking communications in the Poisson code by persistent ones
  \end{itemize}
\end{frame}

\subsection{Advanced collective communications}
\subsubsection{V versions}

\begin{frame}[fragile]
  \frametitle{Collective communications}
  \framesubtitle{V extension to \cxxinline{MPI\_Gather}}
  \begin{cxxcode}{Syntax}
    int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                    void *recvbuf, const int recvcounts[], const int displs[],
                    MPI_Datatype recvtype, int root, MPI_Comm comm);
  \end{cxxcode}
  \begin{itemize}
  \item \cxxinline{recvcounts} is now an array, one entry per rank
  \item \cxxinline{displs} array of displacements defining where to place the
    $i^{\mathrm{th}}$ received data
  \item receive different sizes per process
  \item receive in an array with strides
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Collective communications}
  \framesubtitle{Gatherv semantic}
  \begin{cxxcode}{Semantic equivalent}
    // Every process
    MPI_Send(sendbuf, sendcount, sendtype, root, /*...*/);

    // On root process
    for(i = 0; i < nb_process; ++i)
      MPI_Recv(recvbuf + displs[i] * extent(recvtype), recvcounts[i], recvtype,
               i, /*...*/);
  \end{cxxcode}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Collective communications}
  \framesubtitle{V extension to \cxxinline{MPI\_Scatter}}
  \begin{cxxcode}{Syntax}
    int MPI_Scatterv(const void *sendbuf, const int sendcounts[], const int displs[],
                     MPI_Datatype sendtype, void *recvbuf, int recvcount,
                     MPI_Datatype recvtype, int root, MPI_Comm comm);
  \end{cxxcode}
  \begin{itemize}
  \item \cxxinline{sendcounts} is now an array, one entry per rank
  \item \cxxinline{displs} array of displacements defining where to take the data
    sent to the $i^{\mathrm{th}}$ process
  \item send different sizes per process
  \item send from an array with strides
  \end{itemize}
\end{frame}
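% A minimal sketch of the persistent point-to-point cycle shown above
% (Init -> (Start Test/Wait)* -> Free); buffer, count, dest, tag, n_steps and
% update_buffer() are hypothetical placeholders.
%
% MPI_Request req;
% MPI_Send_init(buffer, count, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD, &req);
% for (int step = 0; step < n_steps; ++step) {
%   update_buffer(buffer);            // refill the same buffer every iteration
%   MPI_Start(&req);                  // start the pre-initialized send
%   MPI_Wait(&req, MPI_STATUS_IGNORE);
% }
% MPI_Request_free(&req);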
\begin{frame}[fragile]
  \frametitle{Collective communications}
  \framesubtitle{Scatterv semantic}
  \begin{cxxcode}{Semantic equivalent}
    // On root process
    for(i = 0; i < nb_process; ++i)
      MPI_Send(sendbuf + displs[i] * extent(sendtype), sendcounts[i], sendtype,
               i, /*...*/);

    // Every process
    MPI_Recv(recvbuf, recvcount, recvtype, root, /*...*/);
  \end{cxxcode}
\end{frame}

\subsubsection{Non-blocking collective communications}

\begin{frame}[fragile]
  \frametitle{Non-blocking collective communications}
  \framesubtitle{}
  \begin{itemize}
  \item \code{I} variant of collective communications
  \item extra parameter \cxxinline{request}
  \item \cxxinline{MPI_Ibarrier}, \cxxinline{MPI_Ibcast}
  \item \cxxinline{MPI_Igather}, \cxxinline{MPI_Igatherv}, \cxxinline{MPI_Iscatter}, \cxxinline{MPI_Iscatterv}
  \item \cxxinline{MPI_Iallgather}, \cxxinline{MPI_Iallgatherv}, \cxxinline{MPI_Ialltoall}
  \item \cxxinline{MPI_Ireduce}, \cxxinline{MPI_Iallreduce}, \cxxinline{MPI_Iscan}, \cxxinline{MPI_Iexscan}
  \end{itemize}
\end{frame}

\subsubsection{Persistent collective communications}

\begin{frame}[fragile]
  \frametitle{Persistent collective communications}
  \framesubtitle{}
  \begin{itemize}
  \item \code{\_init} variant of collective communications
  \item extra parameter \cxxinline{request}
  \item \cxxinline{MPI_Barrier_init}, \cxxinline{MPI_Bcast_init}
  \item \cxxinline{MPI_Gather_init}, \cxxinline{MPI_Gatherv_init}, \cxxinline{MPI_Scatter_init}, \cxxinline{MPI_Scatterv_init}
  \item \cxxinline{MPI_Allgather_init}, \cxxinline{MPI_Allgatherv_init}, \cxxinline{MPI_Alltoall_init}
  \item \cxxinline{MPI_Reduce_init}, \cxxinline{MPI_Allreduce_init}, \cxxinline{MPI_Scan_init}, \cxxinline{MPI_Exscan_init}
  \end{itemize}
\end{frame}

\begin{frame}[exercise, fragile]
  \frametitle{Persistent collective}
  \framesubtitle{}
  \begin{itemize}
  \item Replace the \cxxinline{MPI_Allreduce} by a persistent one
  \end{itemize}
\end{frame}

\subsection{Derived Datatypes}

\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Definition of a datatype}
  \begin{itemize}
  \item \cxxinline{MPI_Datatype} opaque type containing a \emph{Typemap}
    \begin{itemize}
    \item $Typemap = \{(type_{0},disp_{0}), \dotsb, (type_{n - 1},disp_{n - 1})\}$
    \item sequence of basic datatypes
    \item sequence of displacements (in bytes)
    \end{itemize}
  \item \code{extent} is the span from the first byte to the last one, with alignment requirements
    \begin{align*}
      lb(Typemap) &= \min_{j}(disp_{j}),\\
      ub(Typemap) &= \max_{j}(disp_{j} + \mathrm{sizeof}(type_{j})) + \epsilon, \text{ and}\\
      extent(Typemap) &= ub(Typemap) - lb(Typemap)
    \end{align*}
    $\epsilon$ is there to account for alignment requirements
  \end{itemize}
\end{frame}
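% A minimal sketch for the persistent-collective exercise above, using the
% MPI 4.0 MPI_Allreduce_init; local, global, n_iter and compute_local() are
% hypothetical placeholders.
%
% MPI_Request req;
% MPI_Allreduce_init(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD,
%                    MPI_INFO_NULL, &req);
% for (int it = 0; it < n_iter; ++it) {
%   local = compute_local(it);
%   MPI_Start(&req);
%   MPI_Wait(&req, MPI_STATUS_IGNORE);
% }
% MPI_Request_free(&req);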
\begin{frame}
  \frametitle{Derived Datatypes}
  \framesubtitle{Base datatypes}
  \begin{minipage}{.45\linewidth}
    \small
    \begin{tabular}{ll}
      \toprule
      MPI datatype & C datatype\\
      \midrule
      \cxxinline{MPI_CHAR} & \cxxinline{char} \\
      \cxxinline{MPI_SHORT} & \cxxinline{signed short int} \\
      \cxxinline{MPI_INT} & \cxxinline{signed int} \\
      \cxxinline{MPI_LONG} & \cxxinline{signed long int} \\
      \cxxinline{MPI_LONG_LONG_INT} & \cxxinline{signed long long int} \\
      \cxxinline{MPI_LONG_LONG} & \cxxinline{signed long long int} \\
      \cxxinline{MPI_SIGNED_CHAR} & \cxxinline{signed char} \\
      \cxxinline{MPI_UNSIGNED_CHAR} & \cxxinline{unsigned char} \\
      \cxxinline{MPI_UNSIGNED_SHORT} & \cxxinline{unsigned short int} \\
      \cxxinline{MPI_UNSIGNED} & \cxxinline{unsigned int} \\
      \cxxinline{MPI_UNSIGNED_LONG} & \cxxinline{unsigned long int} \\
      \cxxinline{MPI_UNSIGNED_LONG_LONG} & \cxxinline{unsigned long long int} \\
      \bottomrule
    \end{tabular}
  \end{minipage}
  \hspace{1cm}
  \begin{minipage}{.45\linewidth}
    \small
    \begin{tabular}{ll}
      \toprule
      MPI datatype & C datatype\\
      \midrule
      \cxxinline{MPI_FLOAT} & \cxxinline{float} \\
      \cxxinline{MPI_DOUBLE} & \cxxinline{double} \\
      \cxxinline{MPI_LONG_DOUBLE} & \cxxinline{long double} \\
      \cxxinline{MPI_WCHAR} & \cxxinline{wchar_t} \\
      \cxxinline{MPI_C_BOOL} & \cxxinline{_Bool} \\
      \cxxinline{MPI_INT8_T} & \cxxinline{int8_t} \\
      \cxxinline{MPI_INT16_T} & \cxxinline{int16_t} \\
      \cxxinline{MPI_INT32_T} & \cxxinline{int32_t} \\
      \cxxinline{MPI_INT64_T} & \cxxinline{int64_t} \\
      \cxxinline{MPI_UINT8_T} & \cxxinline{uint8_t} \\
      \cxxinline{MPI_UINT16_T} & \cxxinline{uint16_t} \\
      \cxxinline{MPI_UINT32_T} & \cxxinline{uint32_t} \\
      \cxxinline{MPI_UINT64_T} & \cxxinline{uint64_t} \\
      \bottomrule
    \end{tabular}
  \end{minipage}
\end{frame}

\begin{frame}
  \frametitle{Derived Datatypes}
  \framesubtitle{Base datatypes}
  \begin{minipage}{.45\linewidth}
    \small
    \begin{tabular}{ll}
      \toprule
      MPI datatype & C++ datatype\\
      \midrule
      \cxxinline{MPI_CXX_BOOL} & \cxxinline{bool} \\
      \cxxinline{MPI_CXX_FLOAT_COMPLEX} & \cxxinline{std::complex<float>} \\
      \cxxinline{MPI_CXX_DOUBLE_COMPLEX} & \cxxinline{std::complex<double>} \\
      \cxxinline{MPI_CXX_LONG_DOUBLE_COMPLEX} & \cxxinline{std::complex<long double>}\\
      \bottomrule
    \end{tabular}
  \end{minipage}
  \hspace{1.8cm}
  \begin{minipage}{.3\linewidth}
    \small
    \begin{tabular}{ll}
      \toprule
      MPI datatype & C datatype\\
      \midrule
      \cxxinline{MPI_AINT} & \cxxinline{MPI_Aint} \\
      \cxxinline{MPI_OFFSET} & \cxxinline{MPI_Offset} \\
      \cxxinline{MPI_COUNT} & \cxxinline{MPI_Count} \\
      \cxxinline{MPI_BYTE} & \\
      \cxxinline{MPI_PACKED} & \\
      \bottomrule
    \end{tabular}
  \end{minipage}
\end{frame}

\note{
  \begin{itemize}
  \item \cxxinline{MPI_CHAR} is a printable character, whereas \cxxinline{MPI_BYTE} is a
    type of exactly 8 bits, not printable as a character
  \item \cxxinline{MPI_PACKED} is for pack/unpack
  \end{itemize}
}

\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Arrays}
  \begin{cxxcode}{Syntax}
    int MPI_Type_contiguous(int count, MPI_Datatype oldtype, MPI_Datatype *newtype);
    int MPI_Type_vector(int count, int blocklength, int stride,
                        MPI_Datatype oldtype, MPI_Datatype *newtype);
  \end{cxxcode}
  \begin{itemize}
  \item arrays of contiguous elements, or of strided blocks, of the same type
  \item \cxxinline{count}: number of repetitions (blocks)
  \item \cxxinline{blocklength}: number of elements per block
  \item \cxxinline{stride}: number of elements between the starts of consecutive blocks
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Array variants}
  \begin{itemize}
  \item \cxxinline{MPI_Type_create_hvector}: same as \cxxinline{MPI_Type_vector} with
    \cxxinline{stride} expressed in bytes
  \item \cxxinline{MPI_Type_create_indexed_block}: same as \cxxinline{MPI_Type_vector}
    with an array of \cxxinline{displacements}
  \item \cxxinline{MPI_Type_create_hindexed_block}: same as
    \cxxinline{MPI_Type_create_indexed_block} with \cxxinline{displacements} in bytes
  \item \cxxinline{MPI_Type_indexed}: same as \cxxinline{MPI_Type_create_indexed_block}
    with arrays of \cxxinline{blocklengths}
  \item \cxxinline{MPI_Type_create_hindexed}: same as \cxxinline{MPI_Type_indexed} with
    \cxxinline{displacements} in bytes
  \end{itemize}
\end{frame}
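% A minimal sketch of MPI_Type_vector from the slides above: a column of an
% N x N row-major matrix of floats (N is assumed to be defined); this is the kind
% of datatype needed for the Poisson exercises.
%
% MPI_Datatype column_t;
% MPI_Type_vector(N, 1, N, MPI_FLOAT, &column_t);  // N blocks of 1 float, stride N
% MPI_Type_commit(&column_t);
% // ... use column_t in communications ...
% MPI_Type_free(&column_t);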
\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Structures}
  \begin{cxxcode}{Syntax}
    int MPI_Type_create_struct(int count, const int array_of_blocklengths[],
                               const MPI_Aint array_of_displacements[],
                               const MPI_Datatype array_of_types[], MPI_Datatype *newtype)
  \end{cxxcode}
  \begin{itemize}
  \item \cxxinline{count}: number of repetitions (blocks)
  \item \cxxinline{array_of_blocklengths}: sizes per block
  \item \cxxinline{array_of_displacements}: displacements between blocks in bytes
  \item \cxxinline{array_of_types}: types contained in each block
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Useful helper functions}
  \begin{itemize}
  \item \cxxinline{MPI_Get_address}: get the address of a variable
  \item \cxxinline{MPI_Aint_diff}: get the difference between 2 addresses
  \item \cxxinline{MPI_Aint_add}: get the sum of 2 addresses
  \item \cxxinline{MPI_Type_size}: get the size of a datatype
  \item \cxxinline{MPI_Type_get_extent}: get the lower bound and the extent of a type
  \item \cxxinline{MPI_Type_create_resized}: set a new lower bound and extent for a type
  \end{itemize}
\end{frame}

\note{
  \begin{itemize}
  \item Prefer \cxxinline{MPI_Get_address} over \&
  \item if the extent is badly set, it is not possible to communicate multiple objects
    of the same datatype
  \end{itemize}
}

\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Commit/free}
  \begin{cxxcode}{Syntax}
    int MPI_Type_commit(MPI_Datatype *datatype);
    int MPI_Type_free(MPI_Datatype *datatype);
  \end{cxxcode}
  \begin{itemize}
  \item new datatypes should be committed before being usable in communications
  \item committed types need to be freed once not used anymore
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Derived Datatypes}
  \framesubtitle{Example}
  \cxxfile[title={mpi/datatypes.cc},
  minted options app={
    firstline=13,
    lastline=41,
    fontsize=\tiny}]{examples/mpi/datatypes.cc}
\end{frame}

\begin{frame}[fragile, exercise]
  \frametitle{Derived Datatypes}
  \framesubtitle{Send lines in the Poisson code}
  \begin{itemize}
  \item Create a \cxxinline{MPI_Datatype line_t} representing a line of data
  \item Exchange data of type \cxxinline{line_t} instead of \cxxinline{MPI_FLOAT}
  \end{itemize}
\end{frame}

\subsection{Pack/Unpack}

\begin{frame}[fragile]
  \frametitle{Pack/Unpack}
  \framesubtitle{Pack}
  \begin{cxxcode}{Syntax}
    int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
                 void *outbuf, int outsize, int *position, MPI_Comm comm);
  \end{cxxcode}
  \begin{itemize}
  \item \cxxinline{inbuf}, \cxxinline{incount}, \cxxinline{datatype} correspond to the
    description of the data to pack
  \item \cxxinline{outbuf}, \cxxinline{outsize} description of the buffer where to pack
  \item \cxxinline{position} current position in the packing buffer
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Pack/Unpack}
  \framesubtitle{Unpack}
  \begin{cxxcode}{Syntax}
    int MPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf,
                   int outcount, MPI_Datatype datatype, MPI_Comm comm);
  \end{cxxcode}
  \begin{itemize}
  \item \cxxinline{inbuf}, \cxxinline{insize} description of the buffer from which to unpack
  \item \cxxinline{position} current position in the unpacking buffer
  \item \cxxinline{outbuf}, \cxxinline{outcount}, and \cxxinline{datatype} correspond to
    the description of the data to unpack
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Pack/Unpack}
  \framesubtitle{Example}
  \cxxfile[title={mpi/pack\_unpack.cc},
  minted options app={
    firstline=26,
    lastline=39
  }]{examples/mpi/pack_unpack.cc}
\end{frame}
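% A minimal sketch of communicator creation with MPI_Comm_split (listed in the
% next subsection): ranks are grouped by the parity of their world rank.
%
% int world_rank;
% MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
% MPI_Comm split_comm;
% MPI_Comm_split(MPI_COMM_WORLD, /* color */ world_rank % 2, /* key */ world_rank,
%                &split_comm);
% // ranks with the same color end up in the same new communicator
% MPI_Comm_free(&split_comm);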
\subsection{Groups and Communicators}

\begin{frame}[containsverbatim]
  \frametitle{Groups and Communicators}
  \begin{itemize}
  \item a \code{communicator}:
    \begin{itemize}
    \item Encapsulates a \code{context}, a \code{group}, a \code{virtual topology} and \code{attributes}
    \item Two kinds: \code{intra-communicator} and \code{inter-communicator}
    \end{itemize}
  \item a \code{group}:
    \begin{itemize}
    \item ordered set of processes
    \item each process has a unique ID (rank within the group) and can belong to
      several different groups
    \item a group can be used to create a new communicator
    \end{itemize}
  \end{itemize}
\end{frame}

\note{
  \begin{itemize}
  \item \code{intra} communications inside a group
  \item \code{inter} communications between groups
  \end{itemize}
}

\begin{frame}[containsverbatim]
  \frametitle{Groups and Communicators}
  \framesubtitle{Creating new communicators}
  \begin{itemize}
  \item duplicating or splitting an existing one \cxxinline{MPI_Comm_dup}, \cxxinline{MPI_Comm_split}
  \item creating a communicator from a group \cxxinline{MPI_Comm_create}, \cxxinline{MPI_Comm_create_group}
  \item need to create groups
    \begin{itemize}
    \item from a communicator \cxxinline{MPI_Comm_group}
    \item boolean operations \cxxinline{MPI_Group_union}, \cxxinline{MPI_Group_intersection}, \cxxinline{MPI_Group_difference}
    \item specifying ranks \cxxinline{MPI_Group_incl}, \cxxinline{MPI_Group_excl}
    \end{itemize}
  \item destroy created objects \cxxinline{MPI_Comm_free}, \cxxinline{MPI_Group_free}
  \end{itemize}
\end{frame}

\subsection{Virtual Topologies}

\begin{frame}
  \frametitle{Virtual Topologies}
  \framesubtitle{}
  \begin{itemize}
  \item potential performance gain by mapping processes to the hardware
  \item helps for program readability
  \item types of topologies: Cartesian, Graph, Distributed Graph
  \item collective communication on neighborhoods
  \end{itemize}
\end{frame}

\note{
  Details only on the Cartesian one
}

\begin{frame}[fragile]
  \frametitle{Virtual Topologies}
  \framesubtitle{Cartesian topology}
  \begin{cxxcode}{Syntax}
    int MPI_Cart_create(MPI_Comm comm_old, int ndims, const int dims[],
                        const int periods[], int reorder, MPI_Comm *comm_cart);
  \end{cxxcode}
  \begin{itemize}
  \item create a communicator with Cartesian information
  \item convenient functions:
    \begin{itemize}
    \item \cxxinline{MPI_Dims_create} helps create a balanced distribution of processes
    \item \cxxinline{MPI_Cart_shift} helps determine the neighbors
    \item \cxxinline{MPI_Cart_rank} gets the rank based on coordinates
    \item \cxxinline{MPI_Cart_coords} gets the coordinates based on the rank
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
-  \frametitle{Virutal topology}
+  \frametitle{Virtual topology}
  \framesubtitle{Neighborhood collectives}
  \begin{itemize}
  \item \cxxinline{MPI_Neighbor_allgather} assuming we are on the process with rank $i$,
    gathers data from all ranks $j$ for which edge $(j, i)$ exists, and sends the same
    data to all $j$ where edge $(i,j)$ exists
  \item \cxxinline{MPI_Neighbor_alltoall} compared to allgather, sends different data to
    each process $j$
  \item vector variants are available (\code{v})
  \item immediate variants are available (\code{I})
  \item persistent variants are available (\code{\_init})
  \item \cxxinline{MPI_Neighbor_alltoall} has one more flavor, \code{w}, where different
    datatypes are exchanged with all neighbors
  \end{itemize}
\end{frame}

\begin{frame}[exercise, fragile]
  \frametitle{Virtual topology}
  \framesubtitle{}
  \begin{itemize}
  \item Rewrite the parallelism using a Cartesian communicator
  \item Use neighbor collective communications
  \end{itemize}
\end{frame}
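% A minimal sketch of the Cartesian-topology helpers from the slides above, for a
% 2D non-periodic grid; nb_process is assumed to hold the size of MPI_COMM_WORLD.
%
% int dims[2] = {0, 0}, periods[2] = {0, 0};
% MPI_Dims_create(nb_process, 2, dims);            // balanced px x py split
% MPI_Comm cart_comm;
% MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, /* reorder */ 1, &cart_comm);
% int left, right, bottom, top;
% MPI_Cart_shift(cart_comm, 0, 1, &left, &right);  // neighbors along dimension 0
% MPI_Cart_shift(cart_comm, 1, 1, &bottom, &top);  // neighbors along dimension 1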
\subsection{Parallel I/O}

\begin{frame}[containsverbatim]
  \frametitle{Parallel I/O overview}
  \begin{itemize}
  \item I/O is often (if not always) the main bottleneck in a parallel application
  \item MPI provides a mechanism to read/write in parallel
  \end{itemize}
  \begin{center}
    \input{src/mpi/figures/parallelFS.tex}
  \end{center}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Introductory remarks}
  \begin{itemize}
  \item The MPI IO API works on your desktop/laptop
  \item Most of the large HPC systems have a \textbf{parallel file system} (like GPFS,
    Lustre, \emph{etc.})
  \item If the file is distributed smartly on a parallel file system: performance increases
  \item MPI IO offers a high-level API to access a distributed file (no need to
    implement complex POSIX calls)
  \item \textbf{does not work with ASCII files}
  \item Most of the standard file formats support MPI IO (\emph{e.g.} HDF5, NetCDF, \emph{etc.})
  \end{itemize}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Poisson so far}
  \begin{center}
    \input{src/mpi/figures/sofar.tex}
  \end{center}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Poisson ideal}
  \begin{center}
    \input{src/mpi/figures/sogoal.tex}
  \end{center}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Open/Close a file in parallel}
  \begin{cxxcode}{Syntax}
    int MPI_File_open(MPI_Comm comm, const char *filename, int amode,
                      MPI_Info info, MPI_File *fh);
    int MPI_File_close(MPI_File *fh);
  \end{cxxcode}
  \begin{itemize}
  \item \cxxinline{comm}: the communicator that contains the writing/reading MPI processes
  \item \cxxinline{filename}: a file name
  \item \cxxinline{amode}: file access mode, \cxxinline{MPI_MODE_RDONLY},
    \cxxinline{MPI_MODE_WRONLY}, \cxxinline{MPI_MODE_RDWR}, \cxxinline{MPI_MODE_CREATE}, \emph{etc.}
  \item \cxxinline{info}: file info object (\cxxinline{MPI_INFO_NULL} is a valid info)
  \item \cxxinline{fh}: file handle
  \end{itemize}
  \textbf{Collective calls !!}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Parallel IO}
  \framesubtitle{Terminology}
  \begin{itemize}
  \item \code{etype} is the elementary type of the data of the file accessed in parallel
  \item \code{offset} is a position in the file, in terms of multiples of the etype
  \item \code{displacement} of a position within the file is the number of bytes from
    the beginning of the file
  \end{itemize}
  \begin{center}
    \includegraphics{src/mpi/figures/offset}
    %\input{day3/images/offset.tex}
  \end{center}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Parallel IO}
  \framesubtitle{Simple independent read/write}
  \begin{cxxcode}{Syntax}
    int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf, int count,
                         MPI_Datatype datatype, MPI_Status *status);
    int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf, int count,
                          MPI_Datatype datatype, MPI_Status *status);
  \end{cxxcode}
  \begin{itemize}
  \item Can be used from a single process (or a group of processes)
  \item \cxxinline{offset}: the position in the file where to read/write must be given explicitly
  \item \cxxinline{count} elements of type \cxxinline{datatype} are read/written
  \end{itemize}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Parallel IO}
  \framesubtitle{\code{view} by each process}
  \begin{cxxcode}{Syntax}
    int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
                          MPI_Datatype filetype, const char *datarep, MPI_Info info);
    int MPI_File_get_view(MPI_File fh, MPI_Offset *disp, MPI_Datatype *etype,
                          MPI_Datatype *filetype, char *datarep);
  \end{cxxcode}
  \begin{itemize}
  \item initially, each process views the file as a linear byte stream and each process
    views data in its own native representation
  \item \cxxinline{disp} is the displacement (defines the beginning of the data of the
    file that belongs to the process) in bytes
  \item \cxxinline{etype} is the unit of data access and positioning
  \item \cxxinline{filetype} is a single \cxxinline{etype} or a multiple of it
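  \item a minimal usage sketch, assuming \cxxinline{rank}, \cxxinline{local_n} and
    \cxxinline{local_data} are defined: each rank writes its block of floats
    contiguously at its own position, using the collective write from the next slides
    \begin{cxxcode}{}
      MPI_File fh;
      MPI_File_open(MPI_COMM_WORLD, "out.dat", MPI_MODE_CREATE | MPI_MODE_WRONLY,
                    MPI_INFO_NULL, &fh);
      MPI_Offset disp = rank * local_n * sizeof(float); // displacement in bytes
      MPI_File_set_view(fh, disp, MPI_FLOAT, MPI_FLOAT, "native", MPI_INFO_NULL);
      MPI_File_write_all(fh, local_data, local_n, MPI_FLOAT, MPI_STATUS_IGNORE);
      MPI_File_close(&fh);
    \end{cxxcode}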
  \end{itemize}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Parallel IO}
  \framesubtitle{Setting up a \code{view}}
  \begin{center}
    \input{day3/images/displacements.tex}
  \end{center}
  (source : MPI 2.2 specifications)
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Parallel IO}
  \framesubtitle{Simple independent read/write without offset}
  \begin{cxxcode}{Syntax}
    int MPI_File_read(MPI_File fh, void *buf, int count, MPI_Datatype datatype,
                      MPI_Status *status);
    int MPI_File_write(MPI_File fh, const void *buf, int count, MPI_Datatype datatype,
                       MPI_Status *status);
  \end{cxxcode}
\end{frame}

\begin{frame}[containsverbatim]
  \frametitle{Parallel IO}
  \framesubtitle{Collective read/write with/without offset}
  \begin{cxxcode}{Syntax}
    int MPI_File_write_all(MPI_File fh, const void *buf, int count,
                           MPI_Datatype datatype, MPI_Status *status);
    int MPI_File_read_all(MPI_File fh, void *buf, int count,
                          MPI_Datatype datatype, MPI_Status *status);
  \end{cxxcode}
\end{frame}

\subsection{One Sided}

\begin{frame}[containsverbatim]
  \frametitle{What we did not cover}
  \begin{itemize}
  \item One Sided communications
    \begin{itemize}
    \item \cxxinline{MPI_Put}, \cxxinline{MPI_Get}
    \item \cxxinline{MPI_Win_*}
    \item shared memory
    \end{itemize}
  \item Process management
    \begin{itemize}
    \item \cxxinline{MPI_Comm_spawn}
    \item Communications on inter-communicators
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}[fragile,t]
  \frametitle{Parallelization of the Poisson code}
  \begin{minipage}{.45\linewidth}
    \centering
    \begin{overprint}
      \only<1>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_2D_1}}
      \only<2>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_2D_2}}
      \only<3>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_2D_3}}
      \only<4->{\includegraphics[width=.8\linewidth]{\FIGREP/grid_2D_4}}
    \end{overprint}
  \end{minipage}
  \begin{minipage}{.45\linewidth}
    \begin{overprint}
      \onslide<1>
      \begin{itemize}
      \item Parallelize the Poisson 2D problem using the Message Passing Interface (MPI)
      \end{itemize}
      \onslide<2>
      \begin{itemize}
      \item This time, we want to make a 2D domain decomposition using a Cartesian topology
      \item Use \code{MPI\_Dims\_create} and \code{MPI\_Cart\_create} to create a
        Cartesian topology
      \end{itemize}
      \onslide<3>
      \begin{itemize}
      \item The $p$ processes are split into $(p_{x}, p_{y})$ to make the Cartesian grid
      \item Each domain has size $(N/p_{x}, N/p_{y})$ (1 per process)
      \item Use \code{MPI\_Cart\_shift} to find the neighboring domains
      \end{itemize}
      \onslide<4>
      \begin{itemize}
      \item Add \emph{ghost} lines before and after
      \item Use the \emph{ghost} lines to receive the missing local data
      \item You will need to define a new \textit{matrix column} datatype and update
        the \textit{matrix line} datatype
      \end{itemize}
      \onslide<5>
      \begin{itemize}
      \item Use the \code{MPI\_Neighbor\_alltoallw} routine
      \item You can use the number of iterations as a check
      \item Remove the \cxxinline{dump()} function to start
      \end{itemize}
    \end{overprint}
  \end{minipage}
\end{frame}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../../phys_743_parallel_programming"
%%% End: