\frametitle{What will we learn today ?}
\item {The distributed memory programming paradigm MPI}
\item {Point-to-Point communications}
\item {Collective communications}
\item {Synchronizations}
\frametitle{Goals and scope of MPI}
\item {Provide source-code portability}
\item {efficiency across different architectures}
\item {easier debugging}
\item {parallel I/O}
\item {Run multiple instances of the same program : \verb+"mpirun -np p myApp myArgs"+ starts \verb+p+ instances of the program \verb+"myApp myArgs"+}
\item {Instances exchange information by sending messages to each other}
\item {Communications take place within a \verb+communicator+ : a set of processes indexed from $0$ to $\mathit{communicatorSize}-1$. A special communicator named \verb+MPI_COMM_WORLD+ contains all the processes}
\frametitle{Hello World}
int main(int argc, char *argv[]){
int size, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("I'm process %d out of %d\n", rank, size);
\item{Compile with \verb+mpicc hello.c -o hello+}
\item{Run with \verb+mpirun -np 4 ./hello+}
\subsection{One-to-One communications}
\frametitle{Types of communications in MPI}
\item {Point-to-Point (One-to-One)}
\item {Collectives (One-to-All, All-to-One, All-to-All)}
\item {One-sided (One-to...)}
\item {Blocking and Non-Blocking of all types}
\frametitle{MPI\_Send() and MPI\_Recv()}
\item {\verb+MPI_Send(buf, count, datatype, destination, +\\ \verb+ tag, communicator)+}
\item {\verb+MPI_Recv(buf, count, datatype, source, + \\ \verb+ tag, communicator, status)+}
\item {Sends (receives) \verb+count+ elements of type \verb+datatype+ from (into) buffer \verb+buf+ to (from) process \verb+destination+ (\verb+source+).}
\item {{\bf Each send must be matched by a receive!}
\item{You must know \verb+source+, \verb+tag+, \verb+communicator+ and size (\verb+count+ * \verb+sizeof(datatype)+) of incoming message}
\item{If you don't know, use \verb+MPI_Probe+ / \verb+MPI_Iprobe+ to get that information}
\textcolor{red}{Mismatches cause race conditions or deadlocks}
int main(int argc, char *argv[]) {
int rank, size;
int buf[100];
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (rank == 0) {
MPI_Send(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD);
} else {
MPI_Recv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
\frametitle{Watch out for deadlocks !}
\item{ping.c runs ok with 2 processes : \\
\begin{tabular}{ c c c }
\underline{Process 0} & & \underline{Process 1} \\
send(1,0) & $\longrightarrow$ & recv(0,0) \\
finalize() & & finalize() \\
\item{Deadlock if more than two processes : \\
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & recv(0,0) & & recv(0,0) \\
finalize() & & finalize() \\
int main(int argc, char *argv[]) {
int rank, size;
int buf[100];
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (rank == 0) {
MPI_Send(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD);
} else if (rank==1) {
MPI_Recv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
\frametitle{Blocking point-to-point communications}
\item {\verb+MPI_Send+ : Returns when buffer can be reused}
\item {\verb+MPI_Ssend+ : (S for Synchronous) : returns when other end posted matching recv}
\item {\verb+MPI_Recv+ : Returns when message has been received }
\item {\verb+MPI_Sendrecv+ : Sends and receives within the same call to avoid deadlocks}
\item {\verb+MPI_Bsend+ : (B for Buffer) : returns immediately, send buffer can be reused immediately }
\item {\verb+MPI_Rsend+ : (R for Ready) : may only be called once the matching receive has already been posted; returns when the send buffer can be safely reused}
\item {\verb+MPI_Sendrecv_replace+ : sends, receives and replaces buffer values using only one buffer}
\frametitle{Non-Blocking point-to-point communications}
\item {\verb+MPI_Isend+ and \verb+MPI_Irecv+ : Return immediately, without waiting for the message to be buffered, sent or received. They fill an additional \verb+MPI_Request+ parameter that identifies the request}
\item {Do not use / modify / delete buffers until request completed}
\item {Wait calls block until request(s) completed :
\item {\verb+MPI_Wait(request, status)+}
\item {\verb+MPI_Waitall(count, array_of_requests,+\\ \verb+ array_of_statuses)+}
\item {\verb+MPI_Issend+ : Non-blocking version of \verb+MPI_Ssend+}
\item {\verb+MPI_Ibsend+ : Non-blocking version of \verb+MPI_Bsend+}
\item {\verb+MPI_Irsend+ : Non-blocking version of \verb+MPI_Rsend+}
%An $MPI\_Request$ object has different purposes;
% \item {The $MPI\_Send$ and $MPI\_Recv$ requests: internally identified as requests with specific types ($MPIR\_SEND$ and $MPIR\_RECV$}
% \item {Persistent communication operations : they look like sends and receives, but are actually different from an implementation perspective}
% \item {An $MPI\_Request$ is used to represent an operation, such as a nonblocking send, at several different stages of completion. The state of the operation is encoded within the request}
% \item {}
\frametitle{Non-Blocking point-to-point communications (cont'd)}
Playing with \verb+MPI_Request+
\item {\verb+MPI_Waitsome+ : Waits for some given MPI requests to complete }
\item {\verb+MPI_Waitany+ : Waits for any specified MPI Request to complete }
\item {\verb+MPI_Test+ : Tests for the completion of a request }
\item {\verb+MPI_Testall+ : Tests for the completion of all previously initiated requests }
\item {\verb+MPI_Testany+ : Tests for completion of any previously initiated requests }
\item {\verb+MPI_Testsome+ : Tests for some given requests to complete }
int main(int argc, char *argv[]) {
int rank;
int buf[100];
MPI_Request request;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0)
MPI_Isend(buf, 100, MPI_INT, 1, 0,MPI_COMM_WORLD, &request);
else if (rank == 1)
MPI_Irecv(buf, 100, MPI_INT, 0, 0,MPI_COMM_WORLD, &request);
MPI_Wait(&request, &status);
Process 0 and 1 exchange the content of their buffer with non-blocking calls
if (rank == 0) {
MPI_Isend(buf1, 10, MPI_INT, 1, 0, MPI_COMM_WORLD, &request);
}else if (rank == 1){
MPI_Isend(buf1, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, &request);
MPI_Wait(&request, &status);
memcpy(buf1, buf2, 10*sizeof(int));
Process 0 and 1 exchange the content of their buffer with sendrecv()
if (rank == 0) {
MPI_Sendrecv(buf1, 10, MPI_INT, 1, 0, buf2, 10, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
} else if (rank == 1) {
MPI_Sendrecv(buf1, 10, MPI_INT, 0, 0, buf2, 10, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
memcpy(buf1, buf2, 10*sizeof(int));
Process 0 and 1 exchange the content of their buffer with sendrecv\_replace()
if (rank == 0){
MPI_Sendrecv_replace(buf1, 10, MPI_INT, 1, 0, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
} else if (rank == 1) {
MPI_Sendrecv_replace(buf1, 10, MPI_INT, 0, 0, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
\frametitle{Wildcard receives}
MPI\_ANY\_SOURCE and MPI\_ANY\_TAG are wildcards :
switch(rank) {
case 0: MPI_Send(..., 1, 0, ...); break;
case 1:
case 2: MPI_Send(..., 1, 1, ...); break;
leads to : \\
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & recv(*,*) & $\longleftarrow$ & send(1,1) \\
& $\searrow$ & recv(*,*) & $\swarrow$ & \\
Returns when all processes in communicator have joined
switch(rank) {
case 0:
MPI_Send(..., dest = 1, tag = 0, ...);
MPI_Barrier(MPI_COMM_WORLD); break;
case 1:
MPI_Recv(..., MPI_ANY_SOURCE, MPI_ANY_TAG, ...); break;
case 2:
MPI_Send(..., dest = 1, tag = 1, ...); break;
\frametitle{MPI\_Barrier(communicator) (cont'd)}
MPI\_ANY\_SOURCE and MPI\_ANY\_TAG are wildcards :
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & recv(*,*) & & \\
{\bf barrier} & & {\bf barrier} & & {\bf barrier} \\
& & recv(*,*) & $\longleftarrow$ & send(1,1) \\
Order of calls on process 1 is important
\item{recv -- barrier -- recv \textcolor{green}{correct}}
\item{barrier -- recv -- recv \textcolor{orange}{message race or deadlock (depends on msg size)}}
\item{recv -- recv -- barrier \textcolor{red}{deadlock}}
\frametitle{\textcolor{orange}{barrier -- recv -- recv}}
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & & & \\
& $\searrow$ & & & \\
{\bf barrier} & & {\bf barrier} & & {\bf barrier} \\
& & recv(*,*) & $\longleftarrow$ & send(1,1) \\
& & recv(*,*) & $\swarrow$ & \\
Order of calls on process 1 is important
\item{recv -- barrier -- recv \textcolor{green}{correct}}
\item{barrier -- recv -- recv \textcolor{orange}{message race or deadlock (depends on msg size)}}
\item{recv -- recv -- barrier \textcolor{red}{deadlock}}
\frametitle{\textcolor{red}{recv -- recv -- barrier}}
\begin{tabular}{ c c c c c}
\underline{Process 0} & & \underline{Process 1} & & \underline{Process 2} \\
send(1,0) & $\longrightarrow$ & & & \\
& & recv(*,0) & & \\
& & recv(2,*) & & \\
{\bf barrier} & & \textcolor{red}{{\bf barrier}} & & {\bf barrier} \\
& & & $\longleftarrow$ & \textcolor{red}{send(1,1)} \\
Process 1 never enters the barrier since its second recv is never matched
\subsection{Collective communications}
\frametitle{Collective communications}
\frametitle{Collective communications}
\item{\verb+MPI_Bcast+ : Sends the same data to every process}
\item{\verb+MPI_Scatter+ : Sends pieces of a buffer to every process of the communicator}
\item{\verb+MPI_Gather+ : Retrieves pieces of data from every process}
\item{\verb+MPI_Allgather+ : All pieces retrieved by all processes}
\item{\verb+MPI_Reduce+ : Performs a reduction operation (\verb+MPI_SUM+, ...) across all nodes. E.g. dot product on distributed vectors}
\item{\verb+MPI_Allreduce+ : Result distributed to all processes }
\item{\verb+MPI_Alltoall+ : Sends all data to all processes}
\item{{\bf Every process of the communicator must participate}. Parameters must match. Mismatches cause race conditions or deadlocks}
\frametitle{Receiving image parts in order}
/* Generate image parts */
/* Each process sends */
MPI_Isend(imgPart, partSize, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &request);
// Process 0 receives all parts into buf
if (rank == 0){
char *buf = malloc(nProcs*partSize);
for (int i=0; i<nProcs; i++){
MPI_Recv(buf + i*partSize, partSize, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Wait(&request, MPI_STATUS_IGNORE);
\frametitle{Receiving image parts out-of-order}
/* Generate image parts */
/* Each process sends */
MPI_Isend(imgPart, partSize, MPI_BYTE, 0, 0, MPI_COMM_WORLD, &request);
// Process 0 receives all parts into buf
if (rank == 0) {
char *buf = malloc(nProcs*partSize);
MPI_Status s; int count;
for (int i=0; i<nProcs; i++) {
MPI_Get_count(&s, MPI_BYTE, &count);
} } MPI_Wait(&request, MPI_STATUS_IGNORE);
\frametitle{Receiving image parts with a collective}
/* Generate image parts */
/* Process 0 is the root of the collective, i.e. the receiver of all parts */
int root = 0;
char *buf = NULL;
if (rank == root) /* Only the root must allocate buf */
buf = malloc(nProcs*partSize);
MPI_Gather(part, partSize, MPI_BYTE, buf, partSize, MPI_BYTE, root, MPI_COMM_WORLD);
\frametitle{Collectives within conditions}
Avoid collective calls within conditional clauses. What happens if :
int root = 0;
char *buf = NULL;
if (rank == root) { /* Only the root must allocate buf */
buf = malloc(nProcs*partSize);
MPI_Gather(part, partSize, MPI_BYTE, buf, partSize, MPI_BYTE, root, MPI_COMM_WORLD);
MPI_Send(part, ... , ... , rank, MPI_TAG);
\frametitle{Customized communicators and datatypes}
\item{ You can define your own communicators :
\item{\verb+MPI_Comm_dup+ duplicates a communicator (e.g. to enable private communications within library functions)}
\item{\verb+MPI_Comm_split+ splits a communicator into multiple smaller communicators (useful when using 2D and 3D domain decompositions) }
\item{ You can define custom datatypes :
\item{Simple structs (\verb+MPI_Type_struct+)}
\item{Vectors (\verb+MPI_Type_vector+)}
\item{{\bf NO POINTERS}}
\frametitle{Does this program terminate? (assume 2 processes)}
int rank;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank)
MPI_Send(&rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
if (rank)
MPI_Send(&rank, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
\frametitle{Timing MPI programs}
\item {\verb+MPI_Wtime()+ : returns a double-precision floating point number, the time in seconds since some arbitrary point of time in the past }
\item {\verb+MPI_Wtick()+ : returns a double precision floating point number, the time in seconds between successive ticks of the clock }
\item { The clocks behind both functions are not necessarily synchronized across processes: returned times are local to the calling process. }
\frametitle{Next time}
That's all for MPI 1.0. What's new in MPI2 :
\item {Parallel I/O}
\item {One-sided communications}
\item {Dynamic process management}
\frametitle{Implementation of MPI}
You don't always need access to a cluster or a supercomputer to develop parallel applications. Here are some implementations of MPI :
\item {\textbf{Linux} OpenMPI, MPICH2, IntelMPI (licensed)}
\item {\textbf{MacOS X} OpenMPI (the official port lacks Fortran support; build it by hand to get Fortran), MPICH2 (via Homebrew)}
\item {\textbf{Windows} OpenMPI, MPICH1 (older version), MS-MPI }
