\subsection{Introduction}
\begin{frame}
\frametitle{What will we learn today?}
\begin{itemize}
\item {The distributed memory programming paradigm MPI}
\item {Point-to-Point communications}
\item {Collective communications}
\item {Synchronizations}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Reminder}
\begin{center}
%\includegraphics[width=8cm]{day2/images/distributed.png}
%{\includegraphics[width=\textwidth]{day2/images/distributed-memory.pdf}}
\input{day2/images/distributed-memory}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Goals and scope of MPI}
\begin{itemize}
\item {Provide source-code portability}
\item {Provide efficiency across different architectures}
\item {Make debugging easier}
\item {Support parallel I/O}
\end{itemize}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{MPI}
\begin{itemize}
\item {Run multiple instances of the same program: \verb+"mpirun -np p myApp myArgs"+ starts \verb+p+ instances of the program \verb+"myApp myArgs"+}
\item {Instances exchange information by sending messages to each other}
\item {Communications take place within a \verb+communicator+: a set of processes indexed from $0$ to $communicatorSize-1$. A special communicator named \verb+MPI_COMM_WORLD+ contains all the processes}
\end{itemize}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Hello World}
\begin{lstlisting}[language=C,frame=lines]
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]){
int size, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("I'm process %d out of %d\n", rank, size);
MPI_Finalize();
return 0;
}
\end{lstlisting}
\begin{itemize}
\item{Compile with \verb+mpicc hello.c -o hello+}
\item{Run with \verb+mpirun -np 4 ./hello+}
\end{itemize}
\end{frame}
\subsection{One-to-One communications}
\begin{frame}[containsverbatim]
\frametitle{Types of communications in MPI}
\begin{itemize}
\item {Point-to-Point (One-to-One)}
\item {Collectives (One-to-All, All-to-One, All-to-All)}
\item {One-sided (One-to...)}
\item {Blocking and non-blocking variants of all types}
\end{itemize}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{MPI\_Send() and MPI\_Recv()}
\begin{itemize}
\item {\verb+MPI_Send(buf, count, datatype, destination, +\\ \verb+ tag, communicator)+}
\item {\verb+MPI_Recv(buf, count, datatype, source, + \\ \verb+ tag, communicator, status)+}
\item {Sends (receives) \verb+count+ elements of type \verb+datatype+ stored in buffer \verb+buf+ to \verb+destination+ (from \verb+source+).}
\item {{\bf Each send must be matched by a receive!}
\begin{itemize}
\item{You must know the \verb+source+, \verb+tag+, \verb+communicator+ and size (\verb+count+ * \verb+sizeof(datatype)+) of the incoming message}
\item{If you do not know them, use \verb+MPI_Probe+ / \verb+MPI_Iprobe+ to get that information}
\end{itemize}
}
\end{itemize}
\textcolor{red}{Mismatches cause race conditions or deadlocks}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{ping.c}
\begin{lstlisting}[language=C,frame=lines]
#include <mpi.h>

int main(int argc, char *argv[]) {
int rank, size;
int buf[100];
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (rank == 0) {
MPI_Send(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD);
} else {
MPI_Recv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
}
MPI_Finalize();
}
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Watch out for deadlocks!}
\begin{itemize}
\item{ping.c runs fine with 2 processes: \\
\begin{tabular}{ c c c }
\underline{Process 0} & & \underline{Process 1} \\
send(1,0) & $\longrightarrow$ & recv(0,0) \\
finalize() & & finalize() \\
\end{tabular}
}
\item{Deadlock with more than two processes: \\
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & recv(0,0) & & recv(0,0) \\
finalize() & & finalize() \\
\end{tabular}
}
\end{itemize}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{ping$\_$correct.c}
\begin{lstlisting}[language=C,frame=lines]
#include <mpi.h>

int main(int argc, char *argv[]) {
int rank, size;
int buf[100];
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (rank == 0) {
MPI_Send(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD);
} else if (rank==1) {
MPI_Recv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
}
MPI_Finalize();
}
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Blocking point-to-point communications}
\begin{itemize}
\item {\verb+MPI_Send+: Returns when the send buffer can be reused}
\item {\verb+MPI_Ssend+ (S for Synchronous): Returns when the other end has posted the matching receive}
\item {\verb+MPI_Recv+: Returns when the message has been received}
\item {\verb+MPI_Sendrecv+: Sends and receives within the same call to avoid deadlocks}
\item {\verb+MPI_Bsend+ (B for Buffered): Returns immediately; the message is copied into a user-attached buffer, so the send buffer can be reused right away (see the sketch on the next slide)}
\item {\verb+MPI_Rsend+ (R for Ready): May only be called once the matching receive has already been posted}
\item {\verb+MPI_Sendrecv_replace+: Sends and receives using a single buffer; the received data replaces the sent data}
\end{itemize}
\end{frame}
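\begin{frame}[containsverbatim]
\frametitle{MPI$\_$Bsend: a minimal sketch}
A minimal sketch of a buffered send: the user must attach a buffer sized for the message plus \verb+MPI_BSEND_OVERHEAD+ before calling \verb+MPI_Bsend+.
\begin{lstlisting}[language=C,frame=lines]
int data[100];
int bufsize = 100*sizeof(int) + MPI_BSEND_OVERHEAD;
char *bsendbuf = malloc(bufsize);
/* attach the buffer used by all subsequent MPI_Bsend calls */
MPI_Buffer_attach(bsendbuf, bufsize);
/* returns as soon as data is copied into the attached buffer */
MPI_Bsend(data, 100, MPI_INT, 1, 0, MPI_COMM_WORLD);
/* detach blocks until all buffered messages have been sent */
MPI_Buffer_detach(&bsendbuf, &bufsize);
free(bsendbuf);
\end{lstlisting}
\end{frame}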
\begin{frame}[containsverbatim]
\frametitle{Non-Blocking point-to-point communications}
\begin{itemize}
\item {\verb+MPI_Isend+ and \verb+MPI_Irecv+: Return immediately, without waiting for the send/receive to complete. They fill an additional \verb+MPI_Request+ parameter that identifies the request}
\item {Do not use / modify / delete the buffers until the request has completed}
\item {Wait calls block until the request(s) have completed:
\begin{itemize}
\item {\verb+MPI_Wait(request, status)+}
\item {\verb+MPI_Waitall(count, array_of_requests,+\\ \verb+ array_of_statuses)+}
\end{itemize}
}
\item {\verb+MPI_Issend+ : Non-blocking version of \verb+MPI_Ssend+}
\item {\verb+MPI_Ibsend+ : Non-blocking version of \verb+MPI_Bsend+}
\item {\verb+MPI_Irsend+ : Non-blocking version of \verb+MPI_Rsend+}
\end{itemize}
\end{frame}
%\begin{frame}[containsverbatim]
%\frametitle{MPI$\_$Request}
%
%An $MPI\_Request$ object has different purposes;
%
%\begin{itemize}
% \item {The $MPI\_Send$ and $MPI\_Recv$ requests: internally identified as requests with specific types ($MPIR\_SEND$ and $MPIR\_RECV$}
% \item {Persistent communication operations : they look like sends and receives, but are actually different from an implementation perspective}
% \item {An $MPI\_Request$ is used to represent an operation, such as a nonblocking send, at several different stages of completion. The state of the operation is encoded within the request}
% \item {}
%\end{itemize}
%\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Non-Blocking point-to-point communications (cont'd)}
Playing with \verb+MPI_Request+ (a usage sketch follows on the next slide):
\begin{itemize}
\item {\verb+MPI_Waitsome+: Waits until at least one of the given requests has completed}
\item {\verb+MPI_Waitany+: Waits until any one of the given requests has completed}
\item {\verb+MPI_Test+: Tests whether a request has completed, without blocking}
\item {\verb+MPI_Testall+: Tests whether all of the given requests have completed}
\item {\verb+MPI_Testany+: Tests whether any one of the given requests has completed}
\item {\verb+MPI_Testsome+: Returns how many of the given requests have completed}
\end{itemize}
\end{frame}
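\begin{frame}[containsverbatim]
\frametitle{Overlapping communication and computation}
A minimal sketch of polling with \verb+MPI_Test+ (\verb+do_some_work()+ is an illustrative placeholder for any computation that does not touch \verb+buf+):
\begin{lstlisting}[language=C,frame=lines]
int buf[100], done = 0;
MPI_Request request;
MPI_Irecv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD, &request);
while (!done) {
    do_some_work(); /* computation independent of buf */
    /* non-blocking check: sets done once the receive completed */
    MPI_Test(&request, &done, MPI_STATUS_IGNORE);
}
/* buf may now be read safely */
\end{lstlisting}
\end{frame}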
\begin{frame}[containsverbatim]
\frametitle{iping.c}
\begin{lstlisting}[language=C,frame=lines]
#include <mpi.h>

int main(int argc, char *argv[]) {
int rank;
int buf[100];
/* MPI_REQUEST_NULL makes the MPI_Wait below safe on ranks > 1 */
MPI_Request request = MPI_REQUEST_NULL;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0)
MPI_Isend(buf, 100, MPI_INT, 1, 0,MPI_COMM_WORLD, &request);
else if (rank == 1)
MPI_Irecv(buf, 100, MPI_INT, 0, 0,MPI_COMM_WORLD, &request);
MPI_Wait(&request, &status);
MPI_Finalize();
}
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{exchange.c}
Processes 0 and 1 exchange the contents of their buffers using non-blocking calls \label{call}
\begin{lstlisting}[language=C,frame=lines]
if (rank == 0) {
MPI_Isend(buf1, 10, MPI_INT, 1, 0, MPI_COMM_WORLD, &request);
MPI_Recv(buf2, 10, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}else if (rank == 1){
MPI_Isend(buf1, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, &request);
MPI_Recv(buf2, 10, MPI_INT, 0, 0,MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
MPI_Wait(&request, MPI_STATUS_IGNORE);
memcpy(buf1, buf2, 10*sizeof(int));
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{exchange$\_$sendrecv.c}
Processes 0 and 1 exchange the contents of their buffers with sendrecv()
\begin{lstlisting}[language=C,frame=lines]
if (rank == 0) {
MPI_Sendrecv(buf1, 10, MPI_INT, 1, 0, buf2, 10, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
} else if (rank == 1) {
MPI_Sendrecv(buf1, 10, MPI_INT, 0, 0, buf2, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
memcpy(buf1, buf2, 10*sizeof(int));
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{exchange$\_$sendrecv$\_$replace.c}
Processes 0 and 1 exchange the contents of their buffers with sendrecv$\_$replace()
\begin{lstlisting}[language=C,frame=lines]
if (rank == 0){
MPI_Sendrecv_replace(buf1, 10, MPI_INT, 1, 0, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
} else if (rank == 1) {
MPI_Sendrecv_replace(buf1, 10, MPI_INT, 0, 0, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Wildcard receives}
MPI$\_$ANY$\_$SOURCE and MPI$\_$ANY$\_$TAG are wildcards:
\begin{lstlisting}[language=C,frame=lines]
switch(rank) {
case 0: MPI_Send(..., 1, 0, ...); break;
case 1:
MPI_Recv(..., MPI_ANY_SOURCE, MPI_ANY_TAG, ...);
MPI_Recv(..., MPI_ANY_SOURCE, MPI_ANY_TAG, ...);
break;
case 2: MPI_Send(..., 1, 1, ...); break;
}
\end{lstlisting}
leads to: \\
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & recv(*,*) & $\longleftarrow$ & send(1,1) \\
& $\searrow$ & recv(*,*) & $\swarrow$ & \\
\end{tabular}
\end{frame}
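\begin{frame}[containsverbatim]
\frametitle{Wildcard receives (cont'd)}
After a wildcard receive, the \verb+MPI_Status+ reveals the actual source and tag. A minimal sketch:
\begin{lstlisting}[language=C,frame=lines]
int buf[100];
MPI_Status status;
MPI_Recv(buf, 100, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
         MPI_COMM_WORLD, &status);
printf("message from %d with tag %d\n",
       status.MPI_SOURCE, status.MPI_TAG);
\end{lstlisting}
\end{frame}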
\begin{frame}[containsverbatim]
\frametitle{MPI$\_$Barrier(communicator)}
Returns when all processes in the communicator have reached the barrier
\begin{lstlisting}[language=C,frame=lines]
switch(rank) {
case 0:
MPI_Send(..., dest = 1, tag = 0, ...);
MPI_Barrier(MPI_COMM_WORLD); break;
case 1:
MPI_Recv(..., MPI_ANY_SOURCE, MPI_ANY_TAG, ...);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Recv(..., MPI_ANY_SOURCE, MPI_ANY_TAG, ...); break;
case 2:
MPI_Barrier(MPI_COMM_WORLD);
MPI_Send(..., dest = 1, tag = 1, ...); break;
}\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{MPI$\_$Barrier(communicator) (cont'd)}
Adding a barrier between the two receives removes the race:
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & recv(*,*) & & \\
\hline
{\bf barrier} & & {\bf barrier} & & {\bf barrier} \\
\hline
& & recv(*,*) & $\longleftarrow$ & send(1,1) \\
\end{tabular}
Order of calls on process 1 is important
\begin{itemize}
\item{recv -- barrier -- recv \textcolor{green}{correct}}
\item{barrier -- recv -- recv \textcolor{orange}{message race or deadlock (depends on msg size)}}
\item{recv -- recv -- barrier \textcolor{red}{deadlock}}
\end{itemize}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{\textcolor{orange}{barrier -- recv -- recv}}
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & & & \\
& $\searrow$ & & & \\
\hline
{\bf barrier} & & {\bf barrier} & & {\bf barrier} \\
\hline
& & recv(*,*) & $\longleftarrow$ & send(1,1) \\
& & recv(*,*) & $\swarrow$ & \\
\end{tabular}
Order of calls on process 1 is important
\begin{itemize}
\item{recv -- barrier -- recv \textcolor{green}{correct}}
\item{barrier -- recv -- recv \textcolor{orange}{message race or deadlock (depends on msg size)}}
\item{recv -- recv -- barrier \textcolor{red}{deadlock}}
\end{itemize}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{\textcolor{red}{recv -- recv -- barrier}}
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & & & \\
& & recv(*,0) & & \\
& & recv(2,*) & & \\
\hline
{\bf barrier} & & \textcolor{red}{{\bf barrier}} & & {\bf barrier} \\
\hline
& & & $\longleftarrow$ & \textcolor{red}{send(1,1)} \\
\end{tabular}
Process 1 never enters the barrier since its second recv is never matched
\end{frame}
\subsection{Collective communications}
\begin{frame}[containsverbatim]
\frametitle{Collective communications}
\begin{center}
\input{day2/images/collectives.tex}
\end{center}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Collective communications}
\begin{itemize}
\item{\verb+MPI_Bcast+: Sends the same data to every process}
\item{\verb+MPI_Scatter+: Sends pieces of a buffer to every process of the communicator}
\item{\verb+MPI_Gather+: Retrieves pieces of data from every process}
\item{\verb+MPI_Allgather+: All pieces retrieved by all processes}
\item{\verb+MPI_Reduce+: Performs a reduction operation (\verb+MPI_SUM+, ...) across all processes, e.g. a dot product on distributed vectors (see the sketch on the next slide)}
\item{\verb+MPI_Allreduce+: Result distributed to all processes}
\item{\verb+MPI_Alltoall+: Sends all data to all processes}
\item{{\bf Every process of the communicator must participate.} Parameters must match. Mismatches cause race conditions or deadlocks}
\end{itemize}
\end{frame}
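\begin{frame}[containsverbatim]
\frametitle{Example: distributed dot product}
A minimal sketch of the dot product mentioned above (\verb+x+, \verb+y+ and \verb+localN+ are illustrative names for the local vector pieces and their length):
\begin{lstlisting}[language=C,frame=lines]
double localDot = 0.0, globalDot = 0.0;
for (int i = 0; i < localN; i++)
    localDot += x[i]*y[i];
/* sum the partial results; every process gets globalDot */
MPI_Allreduce(&localDot, &globalDot, 1, MPI_DOUBLE,
              MPI_SUM, MPI_COMM_WORLD);
\end{lstlisting}
With \verb+MPI_Reduce+ instead, only the root would get the result.
\end{frame}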
\begin{frame}[containsverbatim]
\frametitle{Receiving image parts in order}
\begin{lstlisting}[language=C,frame=lines]
/* Generate image parts */
...
/* Each process sends */
MPI_Isend(imgPart, partSize, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &request);
// Process 0 receives all parts into buf
if (rank == 0){
char *buf = malloc(nProcs*partSize);
for (int i=0; i<nProcs; i++){
MPI_Recv(buf + i*partSize, partSize, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
}
MPI_Wait(&request, MPI_STATUS_IGNORE);
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Receiving image parts out-of-order}
\begin{lstlisting}[language=C,frame=lines]
/* Generate image parts */
...
/* Each process sends */
MPI_Isend(imgPart, partSize, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &request);
// Process 0 receives all parts into buf
if (rank == 0) {
char *buf = malloc(nProcs*partSize);
MPI_Status s; int count;
for (int i=0; i<nProcs; i++) {
MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &s);
MPI_Get_count(&s, MPI_BYTE, &count);
MPI_Recv(buf + s.MPI_SOURCE*count, count, MPI_BYTE, s.MPI_SOURCE, s.MPI_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
} } MPI_Wait(&request, MPI_STATUS_IGNORE);
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Receiving image parts with a collective}
\begin{lstlisting}[language=C,frame=lines]
/* Generate image parts */
...
/* Process 0 is the root of the collective, i.e. the receiver of all parts */
int root = 0;
char *buf = NULL;
if (rank == root) /* Only the root must allocate buf */
buf = malloc(nProcs*partSize);
MPI_Gather(part, partSize, MPI_BYTE, buf, partSize, MPI_BYTE, root, MPI_COMM_WORLD);
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Collectives within conditions}
Avoid collective calls within conditional clauses. What happens in the following case?
\begin{lstlisting}[language=C,frame=lines]
int root = 0;
char *buf = NULL;
if (rank == root) { /* Only the root must allocate buf */
buf = malloc(nProcs*partSize);
MPI_Gather(part, partSize, MPI_BYTE, buf, partSize, MPI_BYTE, root, MPI_COMM_WORLD);
}else{
MPI_Send(part, ... , ... , rank, MPI_TAG);
}
\end{lstlisting}
\end{frame}
\subsection{Customization}
\begin{frame}[containsverbatim]
\frametitle{Customized communicators and datatypes}
\begin{itemize}
\item{ You can define your own communicators (a sketch follows after this slide):
\begin{itemize}
\item{\verb+MPI_Comm_dup+ duplicates a communicator (e.g. to enable private communications within library functions)}
\item{\verb+MPI_Comm_split+ splits a communicator into multiple smaller communicators (useful when using 2D and 3D domain decompositions) }
\end{itemize}
}
\item{ You can define custom datatypes (a sketch follows after this slide):
\begin{itemize}
\item{Simple structs (\verb+MPI_Type_struct+)}
\item{Vectors (\verb+MPI_Type_vector+)}
\item{{\bf NO POINTERS}}
\end{itemize}
}
\end{itemize}
\end{frame}
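\begin{frame}[containsverbatim]
\frametitle{MPI$\_$Comm$\_$split: a minimal sketch}
A minimal sketch of splitting \verb+MPI_COMM_WORLD+ into the row communicators of a 2D decomposition (\verb+nCols+, the number of processes per row, is an illustrative name):
\begin{lstlisting}[language=C,frame=lines]
MPI_Comm rowComm;
/* processes with the same color join the same communicator */
int color = rank / nCols;
MPI_Comm_split(MPI_COMM_WORLD, color, rank, &rowComm);
int rowRank;
MPI_Comm_rank(rowComm, &rowRank); /* rank within the row */
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{MPI$\_$Type$\_$vector: a minimal sketch}
A minimal sketch of sending one column of a row-major $4\times4$ matrix with a strided datatype (\verb+dest+ is illustrative):
\begin{lstlisting}[language=C,frame=lines]
double a[4][4];
MPI_Datatype column;
/* 4 blocks of 1 double each, stride 4 doubles: one column */
MPI_Type_vector(4, 1, 4, MPI_DOUBLE, &column);
MPI_Type_commit(&column);
MPI_Send(&a[0][1], 1, column, dest, 0, MPI_COMM_WORLD);
MPI_Type_free(&column);
\end{lstlisting}
\end{frame}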
\subsection{Summary}
\begin{frame}[containsverbatim]
\frametitle{Does this program terminate? (assume 2 processes)}
\begin{lstlisting}[language=C,frame=lines]
int rank;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank)
MPI_Send(&rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
else
MPI_Recv(&rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
if (rank)
MPI_Send(&rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
else
MPI_Recv(&rank, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Finalize();
\end{lstlisting}
\end{frame}
\begin{frame}[containsverbatim]
\frametitle{Timing MPI programs}
\begin{itemize}
\item {\verb+MPI_Wtime()+: returns a double precision floating point number, the time in seconds since some arbitrary point in the past}
\item {\verb+MPI_Wtick()+: returns a double precision floating point number, the time in seconds between two successive clock ticks}
\item {Neither function is synchronized across the running processes (see the sketch on the next slide)}
\end{itemize}
\end{frame}
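\begin{frame}[containsverbatim]
\frametitle{Timing MPI programs (cont'd)}
A minimal sketch of timing a code region (\verb+compute()+ is an illustrative placeholder for the region to be timed):
\begin{lstlisting}[language=C,frame=lines]
MPI_Barrier(MPI_COMM_WORLD); /* align processes before timing */
double t0 = MPI_Wtime();
compute();
double t1 = MPI_Wtime();
printf("rank %d: %f s (clock resolution %g s)\n",
       rank, t1 - t0, MPI_Wtick());
\end{lstlisting}
Since the clocks are not synchronized, compare per-rank durations rather than absolute times.
\end{frame}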
\begin{frame}[containsverbatim]
\frametitle{Next time}
That's all for MPI 1.0. What's new in MPI-2:
\begin{itemize}
\item {Parallel I/O}
\item {One-sided communications}
\item {Dynamic process management}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Implementation of MPI}
You do not always need access to a cluster or a supercomputer to develop parallel applications. Some implementations of MPI:
\begin{itemize}
\item {\textbf{Linux} OpenMPI, MPICH2, IntelMPI (licensed)}
\item {\textbf{MacOS X} OpenMPI (the official port lacks Fortran support; building by hand includes it), MPICH2 (via Homebrew)}
\item {\textbf{Windows} OpenMPI, MPICH1 (older version), MS-MPI }
\end{itemize}
\end{frame}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../cours_vince"
%%% End:
