diff --git a/examples/mpi/CMakeLists.txt b/examples/mpi/CMakeLists.txt index b482589..dde2924 100644 --- a/examples/mpi/CMakeLists.txt +++ b/examples/mpi/CMakeLists.txt @@ -1,8 +1,8 @@ find_package(MPI REQUIRED COMPONENTS MPICXX) -foreach(tgt hello_mpi send_recv isend_recv) +foreach(tgt hello_mpi send_recv isend_recv ping_ping ping_pong datatypes pack_unpack) add_executable(${tgt} ${tgt}.cc) target_link_libraries(${tgt} PRIVATE ${MPI_CXX_LIBRARIES}) target_include_directories(${tgt} PRIVATE ${MPI_CXX_INCLUDE_DIRS}) target_compile_options(${tgt} PRIVATE ${MPI_CXX_COMPILE_OPTIONS}) endforeach() diff --git a/examples/mpi/Makefile b/examples/mpi/Makefile deleted file mode 100644 index f2a921c..0000000 --- a/examples/mpi/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -CC=mpicc -CFLAGS=-Wall -Werror -g -LDFLAGS=-lm - -EXECUTABLES=hello ping ping_correct iping exchange exchange_send_recv exchange_send_recv_replace - -all: $(EXECUTABLES) - -hello: hello.c - $(CC) $< $(CFLAGS) -o $@ - -ping: ping.c - $(CC) $< $(CFLAGS) -o $@ - -ping_correct: ping_correct.c - $(CC) $< $(CFLAGS) -o $@ - -iping: iping.c - $(CC) $< $(CFLAGS) -o $@ - -exchange: exchange.c - $(CC) $< $(CFLAGS) -o $@ - -exchange_send_recv: exchange_send_recv.c - $(CC) $< $(CFLAGS) -o $@ - -exchange_send_recv_replace: exchange_send_recv_replace.c - $(CC) $< $(CFLAGS) -o $@ - -clean: - rm -f $(EXECUTABLES) timing.o diff --git a/examples/mpi/datatypes.cc b/examples/mpi/datatypes.cc new file mode 100644 index 0000000..c4fef20 --- /dev/null +++ b/examples/mpi/datatypes.cc @@ -0,0 +1,53 @@ +#include +#include +#include + +int main() { + MPI_Init(NULL, NULL); + + int prank; + MPI_Comm_rank(MPI_COMM_WORLD, &prank); + + MPI_Count lb, extent, size; + + struct Test_t { + double d[2]; + int i; + }; + + std::vector foo(100); + + std::array block_lengths = {2, 1}; + std::array displacements; + std::array old_types = {MPI_DOUBLE, MPI_INT}; + + MPI_Aint addr0, addr1; + MPI_Get_address(&foo[0], &addr0); + MPI_Get_address(&foo[0].d, &displacements[0]); + MPI_Get_address(&foo[0].i, &displacements[1]); + + displacements[0] = MPI_Aint_diff(displacements[0], addr0); + displacements[1] = MPI_Aint_diff(displacements[1], addr0); + + MPI_Datatype mpi_test_t, mpi_test_vector_t; + + MPI_Type_create_struct(2, block_lengths.data(), displacements.data(), + old_types.data(), &mpi_test_t); + + MPI_Get_address(&foo[1], &addr1); + addr1 = MPI_Aint_diff(addr1, addr0); + + MPI_Type_create_resized(mpi_test_t, 0, addr1, &mpi_test_vector_t); + MPI_Type_commit(&mpi_test_vector_t); + + MPI_Type_get_extent_x(mpi_test_t, &lb, &extent); + MPI_Type_size_x(mpi_test_t, &size); + std::cout << "MPI Datatype: mpi_test_t, size: " << size + << " extent: " << extent << std::endl; + std::cout << "C++ Datatype: Test_t, size: " << sizeof(Test_t) << std::endl; + + MPI_Type_free(&mpi_test_vector_t); + MPI_Finalize(); + + return 0; +} diff --git a/examples/mpi/exchange.c b/examples/mpi/exchange.c deleted file mode 100644 index 8f188bd..0000000 --- a/examples/mpi/exchange.c +++ /dev/null @@ -1,30 +0,0 @@ -#include -#include -#include - -int main(int argc, char * argv[]) { - int myrank; - int buf1[100]; - int buf2[100]; - - MPI_Status status; - MPI_Request request; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - if (myrank == 0) { - MPI_Isend(buf1, 100, MPI_INT, 1, 0, MPI_COMM_WORLD, &request); - MPI_Recv(buf2, 100, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - } else if (myrank == 1) { - MPI_Isend(buf1, 100, MPI_INT, 0, 0, MPI_COMM_WORLD, &request); - MPI_Recv(buf2, 100, 
MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - } - - MPI_Wait(&request, &status); - memcpy(buf1, buf2, 100 * sizeof(int)); - - MPI_Finalize(); - - return 0; -} diff --git a/examples/mpi/exchange_send_recv.c b/examples/mpi/exchange_send_recv.c deleted file mode 100644 index f46bf76..0000000 --- a/examples/mpi/exchange_send_recv.c +++ /dev/null @@ -1,25 +0,0 @@ -#include -#include -#include - -int main(int argc, char * argv[]) { - int myrank; - int buf1[100]; - int buf2[100]; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - if (myrank == 0) { - MPI_Sendrecv(buf1, 100, MPI_INT, 1, 0, buf2, 100, MPI_INT, 1, 0, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); - } else if (myrank == 1) { - MPI_Sendrecv(buf1, 100, MPI_INT, 0, 0, buf2, 100, MPI_INT, 0, 0, - MPI_COMM_WORLD, MPI_STATUS_IGNORE); - } - memcpy(buf1, buf2, 100 * sizeof(int)); - - MPI_Finalize(); - - return 0; -} diff --git a/examples/mpi/exchange_send_recv_replace.c b/examples/mpi/exchange_send_recv_replace.c deleted file mode 100644 index eeb9784..0000000 --- a/examples/mpi/exchange_send_recv_replace.c +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include - -int main(int argc, char * argv[]) { - int myrank; - int buf1[100]; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - if (myrank == 0) { - MPI_Sendrecv_replace(buf1, 100, MPI_INT, 1, 0, 1, 0, MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } else if (myrank == 1) { - MPI_Sendrecv_replace(buf1, 100, MPI_INT, 0, 0, 0, 0, MPI_COMM_WORLD, - MPI_STATUS_IGNORE); - } - - MPI_Finalize(); - - return 0; -} diff --git a/examples/mpi/iping.c b/examples/mpi/iping.c deleted file mode 100644 index 4da9188..0000000 --- a/examples/mpi/iping.c +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include - -int main(int argc, char * argv[]) { - int rank; - int buf[100]; - - MPI_Request request; - MPI_Status status; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (rank == 0) - MPI_Isend(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD, &request); - else if (rank == 1) - MPI_Irecv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD, &request); - - MPI_Wait(&request, &status); - MPI_Finalize(); -} diff --git a/examples/mpi/pack_unpack.cc b/examples/mpi/pack_unpack.cc new file mode 100644 index 0000000..5e3bdb2 --- /dev/null +++ b/examples/mpi/pack_unpack.cc @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +void fill_buffer(std::vector &buf) { + for (auto &v : buf) { + v = 0; + } +} + +int main() { + int rank, size; + std::vector buf(100); + int position{0}, count; + MPI_Status status; + + int a; + double d[10]; + + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + assert(size == 2 && "Works only with 2 procs"); + + if (rank == 0) { + a = 0xcafe; + MPI_Pack(&a, 1, MPI_INT, buf.data(), buf.size(), &position, MPI_COMM_WORLD); + MPI_Pack(d, 10, MPI_DOUBLE, buf.data(), buf.size(), &position, + MPI_COMM_WORLD); + MPI_Send(buf.data(), position, MPI_PACKED, 1, 0, MPI_COMM_WORLD); + } else if (rank == 1) { + MPI_Recv(buf.data(), buf.size(), MPI_PACKED, 0, 0, MPI_COMM_WORLD, &status); + MPI_Unpack(buf.data(), buf.size(), &position, &a, 1, MPI_INT, + MPI_COMM_WORLD); + MPI_Unpack(buf.data(), buf.size(), &position, d, 10, MPI_DOUBLE, + MPI_COMM_WORLD); + } + + if (rank == 1) { + MPI_Get_count(&status, MPI_PACKED, &count); + std::cout << " position: " << position << " count: " << count << " - a: 0x" + << std::hex << a << std::endl; + } + + MPI_Finalize(); + + return 0; +} diff --git a/examples/mpi/ping.c 
b/examples/mpi/ping.c deleted file mode 100644 index e44295e..0000000 --- a/examples/mpi/ping.c +++ /dev/null @@ -1,24 +0,0 @@ -#include -#include - -int main(int argc, char * argv[]) { - int myrank, mysize; - int buf[100]; - MPI_Status status; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &mysize); - - if (mysize != 2) - printf("Warning: this examples will most probably deadlock with a number " - "of process different from 2.\n"); - - if (myrank == 0) { - MPI_Send(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD); - } else { - MPI_Recv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD, &status); - } - - MPI_Finalize(); -} diff --git a/examples/mpi/ping_correct.c b/examples/mpi/ping_correct.c deleted file mode 100644 index 50ce3dd..0000000 --- a/examples/mpi/ping_correct.c +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include - -int main(int argc, char * argv[]) { - int myrank, mysize; - int buf[100]; - - MPI_Status status; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &mysize); - - if (myrank == 0) { - MPI_Send(buf, 100, MPI_INT, 1, 0, MPI_COMM_WORLD); - } else if (myrank == 1) { - MPI_Recv(buf, 100, MPI_INT, 0, 0, MPI_COMM_WORLD, &status); - } - - MPI_Finalize(); -} diff --git a/examples/mpi/ping_ping.cc b/examples/mpi/ping_ping.cc new file mode 100644 index 0000000..922b4a4 --- /dev/null +++ b/examples/mpi/ping_ping.cc @@ -0,0 +1,57 @@ +#include "print_size.hh" +#include +#include +#include +#include +#include + +void fill_buffer(std::vector &buf, size_t size) { + buf.resize(size); + for (auto &v : buf) { + v = 0; + } +} + +using clk = std::chrono::high_resolution_clock; +using second = std::chrono::duration; + +int main() { + int prank, psize; + const auto N{30}; + size_t REP{1000}; + + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &prank); + MPI_Comm_size(MPI_COMM_WORLD, &psize); + + std::vector buf; + auto partner = (prank + 1) % psize; + + for (size_t n = 0; n < N; n += 2) { + auto size = 1 << n; + fill_buffer(buf, size); + + auto t_start = clk::now(); + for (size_t repetition = 0; repetition < REP; ++repetition) { + MPI_Send(buf.data(), buf.size(), MPI_INT, partner, 0, MPI_COMM_WORLD); + MPI_Recv(buf.data(), buf.size(), MPI_INT, partner, 0, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + auto time_s = (second{clk::now() - t_start}).count() / REP; + + if (prank == 0) { + auto size_b = size * sizeof(int); + std::cout << "PingPing" + << " size: " << std::setw(10) << printHuman(size_b, "B", 2) + << " time: " << std::setw(10) << printHuman(time_s, "s") + << " bandwidth: " << std::setw(10) + << printHuman(size_b / time_s, "B/s", 2) << std::endl; + } + if (size > 256) + REP = 1; + } + + MPI_Finalize(); + + return 0; +} diff --git a/examples/mpi/ping_pong.cc b/examples/mpi/ping_pong.cc new file mode 100644 index 0000000..b538e52 --- /dev/null +++ b/examples/mpi/ping_pong.cc @@ -0,0 +1,65 @@ +#include "print_size.hh" +#include +#include +#include +#include +#include + +void fill_buffer(std::vector &buf, size_t size) { + buf.resize(size); + for (auto &v : buf) { + v = 0; + } +} + +using clk = std::chrono::high_resolution_clock; +using second = std::chrono::duration; + +int main() { + int prank, psize; + const auto N{30}; + size_t REP{1000}; + + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &prank); + MPI_Comm_size(MPI_COMM_WORLD, &psize); + + std::vector buf; + auto partner = (prank + 1) % psize; + + for (size_t n = 0; n < N; ++n) { + auto size = 1 << n; + fill_buffer(buf, size); + + if 
(size > 256) + REP = 1; + + auto t_start = clk::now(); + for (size_t repetition = 0; repetition < REP; ++repetition) { + if (prank == 0) { + MPI_Send(buf.data(), buf.size(), MPI_INT, partner, 0, MPI_COMM_WORLD); + MPI_Recv(buf.data(), buf.size(), MPI_INT, partner, 0, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } else { + MPI_Recv(buf.data(), buf.size(), MPI_INT, partner, 0, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(buf.data(), buf.size(), MPI_INT, partner, 0, MPI_COMM_WORLD); + } + } + second time = clk::now() - t_start; + + if (prank == 0) { + auto size_b = size * sizeof(int); + auto time_s = time.count() / REP / 2.; + std::cout << "PingPong" + << " size: " << std::setw(10) << printHuman(size_b, "B", 2) + << " time: " << std::setw(10) << printHuman(time_s, "s") + << " bandwidth: " << std::setw(10) + << printHuman(size_b / time_s, "B/s") << std::endl; + } + } + + MPI_Finalize(); + + return 0; +} diff --git a/examples/mpi/print_size.hh b/examples/mpi/print_size.hh new file mode 100644 index 0000000..3b9335f --- /dev/null +++ b/examples/mpi/print_size.hh @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifndef PRINT_SIZE_H_ +#define PRINT_SIZE_H_ + +template +inline std::string printHuman(T size, std::string unit, char base = 10) { + int mult = 0; + int fact = base == 10 ? 3 : 10; + if (size != 0) { + mult = std::floor((std::log(double(size)) / std::log(double(base))) / fact); + } + + auto real_size = double(size) / double(std::pow(base, fact * mult)); + std::array ratio = {"n", u8"μ", "m", "", "K", "M", + "G", "T", "P", "E", "Z", "Y"}; + + std::string base_2_correction; + if (base == 2 and ratio[mult + 3] != "") { + base_2_correction = "i"; + } + + std::stringstream sstr; + sstr << std::fixed << std::setprecision(1) << real_size << ratio[mult + 3] + << base_2_correction << unit; + return sstr.str(); +} + +#endif // PRINT_SIZE_H_ diff --git a/notes.org b/notes.org index fe568c3..613a1a8 100644 --- a/notes.org +++ b/notes.org @@ -1,88 +1,89 @@ * PHYS 743 - Parallel Programming ** General remarks - QUESTION: Do we have time to visit INJ? 
** Tentative agenda *** Admin intro **** Projects *** Basic concepts **** ssh, scp, rsync **** Compilation ***** Modules **** Debugging *** Architecture **** Cluster (MPI) ***** Clusters in general ***** At SCITAS **** Multicore (OpenMP) **** Singlecore (SIMD) *** Optimization **** Data access **** Vectorization **** Basic optimization techniques *** Performance measurement **** Key concepts ***** FLOPS, memory bandwidth ***** timing (speedup, scalings) **** Profiling **** Roofline *** Shared memory (OpenMP) [13/13] **** [X] Task parallelism **** [X] OpenMP terminology / Read spec **** [X] Fork-join / Omp parallel / Implicit barriers **** [X] Exercise Hello World / SLURM **** [X] Omp parallel for **** [X] Exercise **** [X] Omp critical (synchronization), atomic **** [X] Barriers **** [X] Omp private **** [X] Omp reduction **** [X] Collapse **** [X] Work sharing constructs **** [X] Exercise Poisson *** Advanced [3/5] **** [X] Schedule **** [X] NUMA / pinning / first touch **** [X] Race condition, accumulation in array (false sharing) **** [-] OpenMP (new features not covered) **** [-] (GPU) -*** Distributed memory (MPI) basic [2/8] +*** Distributed memory (MPI) basic [5/8] **** [X] Introduction / Read spec **** [-] MPI enviroment / Hello world [1/3] ***** [-] Print before init ***** [X] Print rank -***** [-] Print conditionnaly rank +***** [-] Print conditionaly rank **** [-] MPI terminology **** [X] Point-to-point [2/2] ***** [X] Synchronous / Deadlock +****** example MPI_Send MPI_Recv ***** [X] Asynchronous / race condition -**** [-] Collective [2/3] +**** [X] Collective [3/3] ***** [X] Bcast ***** [X] Gather/scatter ***** [X] Reduce -**** [-] Advanced collective [3/4] +**** [X] Advanced collective [3/3] ***** [X] All -***** [-] Gather/Scatterv ***** [X] All to all ***** [X] Barrier **** [-] MPI Fortran ***** [-] Bindings ***** [-] Asynchronous arrays -**** [-] Exercise Poisson -*** Distributed memory (MPI) advanced [0/0] +**** [X] Exercise Poisson +*** Distributed memory (MPI) advanced [1/9] +**** [X] Gather/Scatterv **** [-] Derived types **** [-] (un)Pack **** [-] Communicator **** [-] Topologies **** [-] IO **** [-] One-sided **** [-] Persistent **** [-] Non blocking collectives *** Hybrid programming **** Mpi init **** Task/thread Repartition - +**** MPI_Mprobe *** Recap *** Projects *** Pub pour SCITAS diff --git a/phys_743_parallel_programming.tex b/phys_743_parallel_programming.tex index 8c5c8d3..a386bdf 100644 --- a/phys_743_parallel_programming.tex +++ b/phys_743_parallel_programming.tex @@ -1,250 +1,252 @@ \documentclass[8pt,aspectratio=169,notes]{beamer} \usepackage[utf8]{inputenc} \usepackage[english]{babel} \usepackage[most, minted]{tcolorbox} \usepackage{xcolor} \usepackage{graphicx} \usepackage{fancyvrb} \usepackage{tikz} \usepackage{colortbl} \usepackage{booktabs} \usepackage[super]{nth} \usepackage{amssymb} \usepackage[binary-units=true]{siunitx} +\usepackage{booktabs} \usemintedstyle{emacs} \makeatletter % Define commands to select the folder where the Beamer theme lies in \def\beamer@calltheme#1#2#3{% \def\beamer@themelist{#2} \@for\beamer@themename:=\beamer@themelist\do {\usepackage[{#1}]{\beamer@themelocation/#3\beamer@themename}}} \def\usefolder#1{ \def\beamer@themelocation{#1} } \def\beamer@themelocation{} % Patch Table of Content to insert fixed spaces between items instead of vfills \patchcmd{\beamer@sectionintoc} {\vfill} {\vskip\itemsep} {} {} % New counter for line numbers \newcounter{verbatim@cnt} % This is for color band on the linenos in listings 
\AtEndEnvironment{Verbatim}{% \stepcounter{verbatim@cnt}% \protected@write\@auxout{}{% \global\protect\@namedef{verbatim@numwidth@\the\c@verbatim@cnt}{% \ifnum\c@FancyVerbLine>999 7.5mm% \else\ifnum\c@FancyVerbLine>99 6mm% \else 4mm% \fi\fi }% }% } \def\minted@auto@numwidth#1{% \ifcsname verbatim@numwidth@\the\numexpr\c@verbatim@cnt#1\relax\endcsname \csname verbatim@numwidth@\the\numexpr\c@verbatim@cnt#1\relax\endcsname \else 4mm% \fi } \tcbset{bashstyle/.style={ colframe=black!70, listing engine=minted, listing only, minted style=colorful, minted language=console, size=fbox, breakable, enhanced, minted options={ autogobble=true, breaklines=true, breakbefore=., numbersep=2mm, }, }} \tcbset{cxx/.style={ colframe=black!70, listing engine=minted, listing only, minted style=emacs, minted language=C++, size=fbox, breakable, enhanced, minted options={ autogobble=true, linenos, breaklines=true, breakbefore=., numbersep=2mm, escapeinside=||, }, overlay={% \begin{tcbclipinterior} \fill[gray!25] (frame.south west) rectangle ([xshift=\dimexpr\minted@auto@numwidth{}\relax]frame.north west); \end{tcbclipinterior} }, % in "left", \c@verbatim@cnt is not stepped yet, hence the argument "+1" left=\dimexpr\minted@auto@numwidth{+1}\relax, }} % \EscMintinline[options]{}{} \def\EscMintinline{% \FVExtraRobustCommand \RobustEscMintinline \FVExtraUnexpandedReadOArgMArgEscVArg} \NewExpandableDocumentCommand \FVExtraUnexpandedReadOArgMArgEscVArg { o m m } {% \IfNoValueTF{#1} {\FVExtraAlwaysUnexpanded {\FVExtraUnexpandedReadOArgMArgEscVArg{#2}{#3}}} {\FVExtraAlwaysUnexpanded {\FVExtraUnexpandedReadOArgMArgEscVArg[#1]{#2}{#3}}}% } \newrobustcmd\RobustEscMintinline[2][]{% % similar to \mintinline \begingroup \setboolean{minted@isinline}{true}% \minted@configlang{#2}% \setkeys{minted@opt@cmd}{#1}% \minted@fvset \begingroup \@ifnextchar\bgroup {\FVExtraDetokenizeREscVArg{\minted@inline@iii}}% {\PackageError{minted}% {\string\EscMintinline\space delimiters must be paired curly braces in this context}% {Delimit argument with curly braces}}} \makeatother \newtcblisting{bashcode}{% colframe=black!70, width=\linewidth, bashstyle, } \newtcblisting{consoleoutput}{% colback=black, colupper=gray!50, colframe=black!70, listing engine=minted, listing only, minted style=monokai, minted language=console, size=fbox, breakable, enhanced, minted options={ autogobble=true, breaklines=true, breakbefore=., numbersep=2mm, }, % width=80ex, } \newtcblisting{cxxcode}[2][]{ cxx, title={#2}, #1, } \newtcbinputlisting{cxxfile}[2][]{% cxx, minted options app={ fontsize=\small, }, listing file={#2}, % width=80ex, #1 } \newcommand{\cxxinline}[1]{\EscMintinline{C++}{#1}} \newcommand{\cmd}[1]{\EscMintinline[style=colorful]{console}{#1}} %newmintinline[cmd]{console}{style=colorful,autogobble} \newcommand{\code}[1]{\texttt{\bf #1}} \DeclareSIUnit\flop{FLOP} \DeclareSIUnit\transfer{T} \DeclareSIUnit\cycle{c} \DeclareSIUnit\flops{\flop\per\second} \DeclareSIUnit\chf{CHF} \sisetup{per-mode=symbol} \sisetup{exponent-product = \cdot} \sisetup{group-separator={\mathrm{'}}} \definecolor{colShellBg}{HTML}{F5EDE4} \definecolor{links}{HTML}{2A1B81} \hypersetup{colorlinks,linkcolor=,urlcolor=links} \usefolder{scitas_theme} \usetheme{scitas} \newcommand{\FIGREP}{figures} \renewcommand{\arraystretch}{1.3} % Remove numbering from the ToC when it's spread on multiple frames \setbeamertemplate{frametitle continuation}{} \title{{\huge Parallel Programming}\\Single-core optimization, MPI, OpenMP, and hybrid programming} \author[N. Richart, E. 
Lanti]{Nicolas Richart \\ Emmanuel Lanti \\ {\scriptsize Course based on V. Keller's lecture notes}} \date{\nth{15} - \nth{19} of November 2021} \begin{document} \begin{frame}[plain] \titlepage \end{frame} \section{Table of Contents} \begin{frame}[allowframebreaks=0.8] \frametitle{Table of Contents} \tableofcontents%[hideallsubsections] \end{frame} % Administration \input{src/admin/admin} % Single-core optimization \input{src/basic_concepts/basic_concepts} \input{src/cluster_architecture/cluster_architecture} \input{src/performance_measurement/performance_measurement} \input{src/optimization/optimization} % OpenMP \input{src/openmp/openmp} % MPI \input{src/mpi/mpi} +\input{src/mpi/mpi_advanced} % Hybrid programming \input{src/hybrid/hybrid} % Recapitulation of the course \input{src/recap/recap} % Project description \input{src/projects/projects} \end{document} %%% Local Variables: %%% mode: latex %%% TeX-command-extra-options: "-shell-escape" %%% TeX-master: t %%% End: diff --git a/scitas_theme/figures/izar.jpg b/scitas_theme/figures/izar.jpg new file mode 100644 index 0000000..0468f3c Binary files /dev/null and b/scitas_theme/figures/izar.jpg differ diff --git a/src/admin/admin.tex b/src/admin/admin.tex index 873de12..053192f 100644 --- a/src/admin/admin.tex +++ b/src/admin/admin.tex @@ -1,171 +1,171 @@ \section{Administration} \label{sec:administration} \begin{frame} \frametitle{Administration} \framesubtitle{} \textbf{Course organization} \begin{itemize} \item This parallel programming course amounts to two full weeks of work (70 hours) \item It is organized in two parts: \begin{itemize} \item A week of ``theoretical'' lectures completed by practical exercises \item A personal project realized within the two following weeks \end{itemize} \item An oral evaluation of your project (15' + 5') will conclude the course \item If passed, you'll get 3 ECTS \end{itemize} \vfill \textbf{A few remarks} \begin{itemize} \item During the course, we'll primarily use C++ for the examples, but you can also use C or Fortran \item Do not hesitate to interupt if you have questions \item Exercises are important! Do not hesitate to play with them, change the parameters, see what happens, try to understand why, etc. \end{itemize} \end{frame} \begin{frame} \frametitle{Administration} \framesubtitle{} \textbf{Lecture and exercises} \begin{itemize} \item We tried to build this course with as much exercises as possible \item We often use exercises during the lectures to illustrate and understand concepts we presented \item To easily differentiate between theory and exercises, there are two templates \item This is a theory slide! \end{itemize} \end{frame} \begin{frame}[exercise] \frametitle{Administration} \framesubtitle{} \textbf{Lecture and exercises} \begin{itemize} \item This is an exercise slide! 
\end{itemize} \end{frame} \subsection{Tentative program} \label{sec:tentative-program} \begin{frame} \frametitle{Tentative program} \framesubtitle{Monday} \begin{table} \centering \begin{tabular}{ccc} \toprule \\ Time & Subject & Details \\ \midrule \\ 8:15 -- 10:00 & Basic concepts & Compilation, debugging and data transfer \\ 10:00 -- 10:30 & Break & \\ 10:30 -- 12:00 & Cluster architecture & Cluster, node and CPU architecture\\ 12:00 -- 13:00 & Lunch on your own & \\ 13:15 -- 15:00 & Single-core optimization & Data access and vectorization\\ 15:00 -- 15:30 & Break & \\ 15:30 -- 17:00 & Performance measurement & Key concepts, profiling and roofline model \\ \bottomrule \\ \end{tabular} \end{table} \end{frame} \begin{frame} \frametitle{Tentative program} \framesubtitle{Tuesday} \begin{table} \centering \begin{tabular}{ccc} \toprule \\ Time & Subject & Details \\ \midrule \\ - 8:15 -- 10:00 & & \\ + 8:15 -- 10:00 & Intro OpenMP & History, execution and memory model, parallel construct\\ 10:00 -- 10:30 & Break & \\ - 10:30 -- 12:00 & & \\ + 10:30 -- 12:00 & Constructs and directives & \\ 12:00 -- 13:00 & Lunch on your own & \\ - 13:15 -- 15:00 & & \\ + 13:15 -- 15:00 & Clauses & \\ 15:00 -- 15:30 & Break & \\ - 15:30 -- 17:00 & & \\ + 15:30 -- 17:00 & Getting a speedup & \\ \bottomrule \\ \end{tabular} \end{table} \end{frame} \begin{frame} \frametitle{Tentative program} \framesubtitle{Wednesday} \begin{table} \centering \begin{tabular}{ccc} \toprule \\ Time & Subject & Details \\ \midrule \\ - 8:15 -- 10:00 & & \\ + 8:15 -- 10:00 & Introduction to MPI & \\ 10:00 -- 10:30 & Break & \\ - 10:30 -- 12:00 & & \\ + 10:30 -- 12:00 & Blocking Pt2Pt& \\ 12:00 -- 13:00 & Lunch on your own & \\ - 13:15 -- 15:00 & & \\ + 13:15 -- 15:00 & Non blocking and collectives & \\ 15:00 -- 15:30 & Break & \\ - 15:30 -- 17:00 & & \\ + 15:30 -- 17:00 & Paralellization of Poisson& \\ \bottomrule \\ \end{tabular} \end{table} \end{frame} \begin{frame} \frametitle{Tentative program} \framesubtitle{Thursday} \begin{table} \centering \begin{tabular}{ccc} \toprule \\ Time & Subject & Details \\ \midrule \\ 8:15 -- 10:00 & & \\ 10:00 -- 10:30 & Break & \\ 10:30 -- 12:00 & & \\ 12:00 -- 13:00 & Lunch on your own & \\ 13:15 -- 15:00 & & \\ 15:00 -- 15:30 & Break & \\ 15:30 -- 17:00 & & \\ \bottomrule \\ \end{tabular} \end{table} \end{frame} \begin{frame} \frametitle{Tentative program} \framesubtitle{Friday} \begin{table} \centering \begin{tabular}{ccc} \toprule \\ Time & Subject & Details \\ \midrule \\ 8:15 -- 10:00 & & \\ 10:00 -- 10:30 & Break & \\ 10:30 -- 12:00 & & \\ 12:00 -- 13:00 & Lunch on your own & \\ 13:15 -- 15:00 & & \\ 15:00 -- 15:30 & Break & \\ 15:30 -- 17:00 & & \\ \bottomrule \\ \end{tabular} \end{table} \end{frame} %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: diff --git a/src/mpi/mpi.tex b/src/mpi/mpi.tex index 4e74e6c..3cd7903 100644 --- a/src/mpi/mpi.tex +++ b/src/mpi/mpi.tex @@ -1,679 +1,699 @@ \renewcommand{\FIGREP}{src/mpi/figures} + \section{Message Passing Interface (MPI)} \label{sec:mpi} \intersec{deneb} \begin{frame} \frametitle{MPI} \framesubtitle{Goals of this section} \begin{itemize} \item Introduce distributed memory programming paradigm \item Point-to-point communications \item Collective communications \end{itemize} \end{frame} \subsection{Introduction} \begin{frame} \frametitle{MPI} \framesubtitle{Overview and goals of MPI} \begin{itemize} \item MPI is a \textit{Message-Passing Interface} specification \item There are many 
implementations (MPICH, MVAPICH, Intel MPI, OpenMPI, etc) \item Library interface, not a programming language \item It is standardized \begin{itemize} \item Defined by the \href{https://www.mpi-forum.org/}{MPI forum} \item Current version is MPI 4.0 \end{itemize} \item As such, it is portable, flexible and efficient \item Interface to C and Fortran in standard \end{itemize} \end{frame} \begin{frame} \frametitle{MPI} \framesubtitle{Message passing concepts} \begin{itemize} \item Let's derive a minimal message-passing interface \item Message passing consists in transferring data chunks between separate address spaces \item Cooperative operation (\textit{send} matched with a \textit{receive} operation) \item From the sender: \begin{itemize} \item \end{itemize} \end{itemize} \end{frame} \subsection{MPI environment} \begin{frame}[fragile] \frametitle{MPI} \framesubtitle{A simple hello world example} \cxxfile[% title={mpi/hello\_mpi.cc}, ]{examples/mpi/hello_mpi.cc} \end{frame} +\note{ + \begin{itemize} + \item MIMD paradigm + \item Multiple process from the beginning + \item Move the cout before init and after finialize + \end{itemize} +} + \begin{frame}[b] \frametitle{Environment} \framesubtitle{} \begin{itemize} \item MPI code is bordered by a \cxxinline{MPI_Init} and a \cxxinline{MPI_Finalize} \item MPI starts $N$ processes numbered $0, 1, ..., N-1$. It is the process \textit{rank} \item They are grouped in a \textit{communicator} of \textit{size} $N$ \item After init, MPI provides a default communicator called \code{MPI\_COMM\_WORLD} \end{itemize} \addimage[width=7cm]{\FIGREP/communicator}{4.5cm}{3.2cm} \end{frame} \begin{frame}[exercise, fragile] \frametitle{Hello world $\pi$} \begin{itemize} \item In the $pi$ code initialize/finialize properly MPI \item Print out the number of processes and the rank of each process \item Modify the makefile to use \code{mpicxx} instead of \code{g++} \item Write a batch script to run your parallel code \begin{bashcode} #!/bin/bash #SBATCH -n module purge module load srun \end{bashcode} \emph{\textbf{Note :} To use MPI on the cluster you first have to load a MPI implementation through the module \code{mvapich2} or \code{intel-mpi}.} \end{itemize} \end{frame} \subsection{Terminology} \begin{frame} \frametitle{Types of communications in MPI} \begin{itemize} \item {Point-to-Point (One-to-One)} \item {Collectives (One-to-All, All-to-One, All-to-All)} \item {One-sided/Shared memory (One-to...)} \item {Blocking and Non-Blocking of all types} \end{itemize} \end{frame} \subsection{Blocking point-to-point communications} \begin{frame}[fragile] \frametitle{Send/Receive} \framesubtitle{} \begin{cxxcode}{Synthax} int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status); \end{cxxcode} \begin{itemize} \item \code{buf} pointer to the data to send/receive \item \code{count} number of element to send/receive \item \code{datatype} datatype of the data to send/receive \item \code{dest}, \code{source} the rank of the destination/source of the communication \item \code{tag} a message tag to differentiate the communications \item \code{comm} communicator in which to communication happens \item \code{status} object containing information on the communication \end{itemize} \end{frame} \note{ \begin{itemize} \item \url{https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf} \item MPI\_Send on page 32 \end{itemize} } 
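% A minimal usage sketch (not one of the repository examples); it assumes prank
% holds the rank obtained from MPI_Comm_rank and simply pairs the two calls above.
\begin{frame}[fragile]
  \frametitle{Send/Receive}
  \framesubtitle{Minimal usage sketch}

  \begin{cxxcode}{Sketch: rank 0 sends 100 doubles to rank 1}
    std::vector<double> v(100, 1.); // buf = v.data(), count = v.size()
    if (prank == 0) {
      MPI_Ssend(v.data(), v.size(), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
    } else if (prank == 1) {
      MPI_Recv(v.data(), v.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD,
               MPI_STATUS_IGNORE);
    }
  \end{cxxcode}
\end{frame}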
\begin{frame}[fragile,b] \frametitle{Send/Receive} \framesubtitle{Details on the buffer} \addimage[width=7cm]{\FIGREP/buffer}{4.5cm}{5.5cm} \begin{itemize} \item Buffer is a pointer to the first data (\cxxinline{buf}), a size (\cxxinline{count}) and a \cxxinline{datatype} \item Datatypes (extract): \begin{itemize} \item \cxxinline{MPI_INT} \item \cxxinline{MPI_UNSIGNED} \item \cxxinline{MPI_FLOAT} \item \cxxinline{MPI_DOUBLE} \end{itemize} \item For \cxxinline{std::vector vect}: \begin{itemize} \item \cxxinline{buf = vect.data()} \item \cxxinline{count = vect.size()} \item \cxxinline{datatype = MPI_DOUBLE} \end{itemize} \end{itemize} \vspace{1cm} \end{frame} \begin{frame}[fragile] \frametitle{Send/Receive} \framesubtitle{Useful constants and status} Constants: \begin{itemize} \item \cxxinline{MPI_STATUS_IGNORE} to state that the status is ignored \item \cxxinline{MPI_PROC_NULL} placeholder for the source or destination \item \cxxinline{MPI_ANY_SOURCE} is a wildcard for the source of a receive \item \cxxinline{MPI_ANY_TAG} is a wildcard for the tag of a receive \end{itemize} Status: \begin{itemize} \item Structure containing \cxxinline{tag} and \cxxinline{source} \begin{cxxcode}{} MPI_Status status; std::cout << "Tag: " << status.tag << " - " << "Source: " << status.source << std::endl; \end{cxxcode} \item Size of the message can be asked using the status \begin{cxxcode}{} int MPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count); \end{cxxcode} \end{itemize} \end{frame} \begin{frame} \frametitle{Send/Receive} \framesubtitle{Example} \cxxfile[title={mpi/send\_recv.cc}, minted options app={firstline=16, lastline=29}]{examples/mpi/send_recv.cc} \end{frame} \begin{frame}[exercise, fragile,t] \frametitle{Ring reduction of $\pi$} \begin{minipage}{0.45\textwidth} \begin{overprint} \only<1>{\includegraphics[width=\linewidth]{\FIGREP/ring_explanation_0}} \only<2>{\includegraphics[width=\linewidth]{\FIGREP/ring_explanation_1}} \only<3>{\includegraphics[width=\linewidth]{\FIGREP/ring_explanation_2}} \only<4>{\includegraphics[width=\linewidth]{\FIGREP/ring_explanation_3}} \only<5>{\includegraphics[width=\linewidth]{\FIGREP/ring_explanation_4}} \only<6>{\includegraphics[width=\linewidth]{\FIGREP/ring_explanation_5}} \end{overprint} \end{minipage} \hspace{.5cm} \begin{minipage}{0.45\textwidth} \begin{overprint} \onslide<1> \begin{itemize} \item \cxxinline{l_sum = local_computation();}\\ \cxxinline{sum += l_sum;} \end{itemize} \onslide<2> \begin{itemize} \item \cxxinline{send_buf = l_sum;} \end{itemize} \onslide<3> \begin{itemize} \item \cxxinline{send(send_buf);}\\ \cxxinline{receive(recv_buf);} \end{itemize} \onslide<4> \begin{itemize} \item \cxxinline{send_buf = recv_buf;}\\ \cxxinline{sum += recv_buf;} \end{itemize} \onslide<5> \begin{itemize} \item \cxxinline{send(send_buf);}\\ \cxxinline{receive(recv_buf);} \end{itemize} \onslide<6> \begin{itemize} \item Split the sum space between the processes \item Implement a ring to communicate the partial sum between the processes. 
using \cxxinline{MPI\_Ssend} and \cxxinline{MPI_Recv} \emph{\textbf{Remember :} each MPI process runs the same code!} \emph{\textbf{Note :} in a loop the next process is \cxxinline{(prank + 1) \% psize} and the previous is \cxxinline{(prank - 1 + psize) \% psize}} \end{itemize} \end{overprint} \end{minipage} \end{frame}
\begin{frame}[fragile] \frametitle{Send/Receive} \framesubtitle{Send variants} \begin{itemize} \item \cxxinline{MPI_Ssend} : (S for Synchronous) the function returns once the other end has posted a matching recv and the buffer can be safely reused \item \cxxinline{MPI_Bsend} : (B for Buffered) the function returns immediately, the send buffer can be reused immediately \item \cxxinline{MPI_Rsend} : (R for Ready) can be used only when a receive is already posted \item \cxxinline{MPI_Send} : acts like \cxxinline{MPI_Bsend} on small arrays, and like \cxxinline{MPI_Ssend} on bigger ones \end{itemize} \end{frame}
\note{ \begin{itemize} \item For Ssend the receive does not need to be finished but it has to be started \item For Bsend a buffer needs to be attached, see section 3.6 \cxxinline{MPI_Buffer_attach} \item Rsend exists for cases where the hardware can avoid a hand-shake \item For Send the buffer size is implementation dependent, usually it can be altered with an environment variable \end{itemize} }
+\begin{frame}[fragile] + \frametitle{Send/Receive} + \framesubtitle{Particularity of \cxxinline{MPI_Send}} + + \cxxfile[title={mpi/ping\_ping.cc}, + minted options app={firstline=31, + lastline=40}]{examples/mpi/ping_ping.cc} + + \pause + \begin{consoleoutput} + $ mpirun -np 2 ./mpi/ping_ping + PingPing size: 4.0B time: 303.5ns bandwidth: 12.6MiB/s + PingPing size: 16.0B time: 291.0ns bandwidth: 52.4MiB/s + PingPing size: 64.0B time: 289.7ns bandwidth: 210.7MiB/s + PingPing size: 256.0B time: 327.9ns bandwidth: 744.5MiB/s + PingPing size: 1.0KiB time: 686.9ns bandwidth: 1.4GiB/s + \end{consoleoutput} %$ +\end{frame} +
\begin{frame}[fragile] \frametitle{Send/Receive} \framesubtitle{Combined send-receive} \begin{cxxcode}{Syntax} int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, int recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status); \end{cxxcode} \begin{itemize} \item Combines a send and a receive, to help mitigate deadlocks \item Has an in-place variant \cxxinline{MPI_Sendrecv_replace} \end{itemize} \end{frame}
\begin{frame}[exercise,fragile] \frametitle{Ring reduction of $\pi$} \framesubtitle{Using \code{MPI\_Sendrecv}} \begin{itemize} \item Modify the previous exercise to use \cxxinline{MPI_Sendrecv} \end{itemize} \end{frame}
\subsection{Non-blocking point-to-point communications}
\begin{frame}[containsverbatim] \frametitle{Non-blocking send/receive} \begin{cxxcode}{Syntax} int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); \end{cxxcode} \begin{itemize} \item \code{I} for \emph{immediate} \item \cxxinline{request} in addition to the parameters of the blocking version \item the receive does not have a status \item \cxxinline{request} is an object attached to the communication \item the communication starts but is not completed \item \code{S}, \code{B}, and \code{R} variants are also defined \end{itemize} \end{frame}
\begin{frame}[containsverbatim] \frametitle{Non-blocking send/receive} \framesubtitle{Completion}
\begin{cxxcode}{Syntax} int MPI_Wait(MPI_Request *request, MPI_Status *status); int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status); \end{cxxcode} \begin{itemize} \item the completion of a communication should be checked \item \cxxinline{MPI_Test} or \cxxinline{MPI_Wait} \item send completed means the buffer can be reused \item receive completed means the buffer can be read \item \cxxinline{status} is set at completion \item \cxxinline{flag} is \code{true} if completed, \code{false} otherwise \end{itemize} \end{frame}
\begin{frame} \frametitle{Non-blocking send/receive} \framesubtitle{Example} \cxxfile[title={mpi/isend\_recv.cc}, minted options app={firstline=22, lastline=31}]{examples/mpi/isend_recv.cc} \end{frame}
\begin{frame}[exercise,fragile] \frametitle{Ring reduction of $\pi$} \framesubtitle{Using non-blocking send} \begin{itemize} \item Modify the previous exercise to use \cxxinline{MPI_Isend} and \cxxinline{MPI_Recv} \item Do not forget to \code{wait} \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Non-blocking send/receive} \framesubtitle{Multiple completions} \begin{itemize} \item \cxxinline{MPI_Waitall}, \cxxinline{MPI_Testall} wait or test completion of all the pending requests \item \cxxinline{MPI_Waitany}, \cxxinline{MPI_Testany} wait or test completion of one request out of many \item \cxxinline{MPI_Waitsome}, \cxxinline{MPI_Testsome} wait or test completion of at least one of the enabled requests \item for arrays of statuses one can use \cxxinline{MPI_STATUSES_IGNORE} \item \cxxinline{MPI_Request_get_status} is equivalent to \cxxinline{MPI_Test} but does not free completed requests \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Probing} \framesubtitle{} \begin{cxxcode}{Syntax} int MPI_Iprobe(int source, int tag, MPI_Comm comm, int *flag, MPI_Status *status); int MPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status *status); \end{cxxcode} \begin{itemize} \item check for incoming messages without receiving them \item the immediate variant sets \cxxinline{flag} to \code{true} if a matching message exists \end{itemize} \end{frame}
\subsection{Collective communications}
\begin{frame}[fragile] \frametitle{Collective communications} \framesubtitle{Synchronization} \begin{cxxcode}{Syntax} int MPI_Barrier(MPI_Comm comm); \end{cxxcode} \begin{itemize} \item collective communications \textbf{must} be called by all processes in the communicator \item a barrier is a hard synchronization \item avoid it as much as possible \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Collective communications} \framesubtitle{Broadcast} \begin{cxxcode}{Syntax} int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item the \code{root} process sends its data to all processes \end{itemize} \begin{center} \begin{overprint} \only<1>{\addimage[width=4cm]{\FIGREP/bcast0}{7cm}{1cm}} \only<2>{\addimage[width=4cm]{\FIGREP/bcast1}{7cm}{1cm}} \end{overprint} \end{center} \end{frame}
\begin{frame}[fragile,t] \frametitle{Collective communications} \framesubtitle{Scatter} \begin{cxxcode}{Syntax} int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item the \code{root} process sends a piece of its data to each process \item the \cxxinline{sendbuf}, \cxxinline{sendcount} and \cxxinline{sendtype} are only relevant on the root \end{itemize} \begin{overprint} \only<1>{\addimage[width=4cm]{\FIGREP/scatter0}{7cm}{1cm}}
\only<2>{\addimage[width=4cm]{\FIGREP/scatter1}{7cm}{1cm}} \end{overprint} \end{frame}
\begin{frame}[fragile,t] \frametitle{Collective communications} \framesubtitle{Gather} \begin{cxxcode}{Syntax} int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item all processes send their data to the \code{root} process \item the \cxxinline{recvbuf}, \cxxinline{recvcount} and \cxxinline{recvtype} are only relevant on the root + \item \cxxinline{recvcount} is the size per process, not the total size \end{itemize} \begin{center} \begin{overprint} \only<1>{\addimage[width=4cm]{\FIGREP/gather0}{7cm}{1cm}} \only<2>{\addimage[width=4cm]{\FIGREP/gather1}{7cm}{1cm}} \end{overprint} \end{center} \end{frame}
\begin{frame}[fragile,t] \frametitle{Collective communications} \framesubtitle{Gather to all} \begin{cxxcode}{Syntax} int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item all processes send their data to all other processes \end{itemize} \begin{center} \begin{overprint} \only<1>{\addimage[width=4cm]{\FIGREP/gather0}{7cm}{1cm}} \only<2>{\addimage[width=4cm]{\FIGREP/allgather}{7cm}{1cm}} \end{overprint} \end{center} \end{frame}
\begin{frame}[fragile,t] \frametitle{Collective communications} \framesubtitle{All to all gather/scatter} \begin{cxxcode}{Syntax} int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item all processes send a piece of their data to all other processes \end{itemize} \begin{center} \begin{overprint} \only<1>{\addimage[width=4cm]{\FIGREP/alltoall0}{7cm}{1cm}} \only<2>{\addimage[width=4cm]{\FIGREP/alltoall1}{7cm}{1cm}} \end{overprint} \end{center} \end{frame}
\begin{frame}[exercise,fragile] \frametitle{Ring reduction of $\pi$} \framesubtitle{Using collective communications} \begin{itemize} \item \cxxinline{MPI_Gather} the partial sums to the root process.
\item \cxxinline{MPI_Bcast} the total sum to all the processes \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Collective communications} \framesubtitle{Reduction} \begin{cxxcode}{Syntax} int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item data from all processes are reduced on the \code{root} process \item common operations are \cxxinline{MPI_SUM}, \cxxinline{MPI_MAX}, \cxxinline{MPI_MIN}, \cxxinline{MPI_PROD} \item a \cxxinline{MPI_Allreduce} variant exists where all the processes have the result \item \cxxinline{MPI_IN_PLACE} can be passed as the \cxxinline{sendbuf} of \code{root} for a \emph{reduce}, or of all processes for an \emph{allreduce} \end{itemize} \begin{center} \begin{overprint} \only<1>{\addimage[width=4cm]{\FIGREP/reduction0}{7cm}{1cm}} \only<2>{\addimage[width=4cm]{\FIGREP/reduction1}{7cm}{1cm}} \only<3>{\addimage[width=4cm]{\FIGREP/reduction2}{7cm}{1cm}} \end{overprint} \end{center} \end{frame}
\begin{frame}[exercise,fragile] \frametitle{Ring reduction of $\pi$} \framesubtitle{Using collective communications} \begin{itemize} \item Modify the previous exercise to use \cxxinline{MPI_Reduce} and \cxxinline{MPI_Bcast} \item Modify it again to use \cxxinline{MPI_Allreduce} \end{itemize} \end{frame}
\begin{frame}[fragile] \frametitle{Collective communications} \framesubtitle{Partial reductions} \begin{cxxcode}{Syntax} int MPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); \end{cxxcode} \begin{itemize} \item performs a prefix reduction on the data \item \cxxinline{MPI_Scan} on process $i$ contains the reduction of values from processes $[0, i]$ \item \cxxinline{MPI_Exscan} on process $i$ contains the reduction of values from processes $[0, i[$ \item \cxxinline{MPI_IN_PLACE} can be passed as \cxxinline{sendbuf} \end{itemize} \end{frame}
\begin{frame}[fragile,t] \frametitle{Parallelization of the Poisson code} \begin{minipage}{.45\linewidth} \centering \begin{overprint} \only<1>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_0}} \only<2>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_1}} \only<3>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_2}} \only<4>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_3}} \only<5>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_4}} \only<6>{\includegraphics[width=.8\linewidth]{\FIGREP/grid_4}} \end{overprint} \end{minipage} \begin{minipage}{.45\linewidth} \begin{overprint} \onslide<1> \begin{itemize} \item Parallelize the Poisson 2D problem using the Message Passing Interface (MPI) \end{itemize} \onslide<2> \begin{itemize} \item The memory allocation is done in the C default manner, “Row-Major Order”: make your domain decomposition by lines \end{itemize} \onslide<3> \begin{itemize} \item $p$ domains of size $N/p$ each (1 per process) \end{itemize} \onslide<4> \begin{itemize} \item Add \emph{ghost} lines before and after \end{itemize} \onslide<5> \begin{itemize} \item Use the \emph{ghost} lines to receive the missing local data \end{itemize} \onslide<6> \begin{itemize} \item Start using \cxxinline{MPI_Sendrecv} to implement the communications \item You can use the number of iterations as a check \item Remove the \cxxinline{dump()} function to start \item Once it is working try to use \emph{non-blocking} communications \end{itemize} \end{overprint}
\end{minipage} \end{frame} - - -\subsection{Advanced collective communications} -\begin{frame} - \frametitle{Advanced MPI} - \framesubtitle{} - -\end{frame} - %%% Local Variables: %%% mode: latex %%% TeX-master: "../../phys_743_parallel_programming" %%% End: diff --git a/src/mpi/mpi_advanced.tex b/src/mpi/mpi_advanced.tex new file mode 100644 index 0000000..547696c --- /dev/null +++ b/src/mpi/mpi_advanced.tex @@ -0,0 +1,813 @@ +\section{Advanced MPI} +\intersec{izar} + +\begin{frame} + \frametitle{Advanced MPI} + \framesubtitle{Goals of this section} + + \begin{itemize} + \item Overview of more advanced functionalities + \item Persistent communications + \item Advanced collective communications + \item Describing your own datatype + \item Redefining communicators + \item Associating a topology to a communicator + \item Parallel I/O + \item One sided communications + \end{itemize} +\end{frame} + +\subsection{Persistent point to point} + +\begin{frame}[fragile] + \frametitle{Persistent communications} + \framesubtitle{} + + \begin{itemize} + \item \cxxinline{MPI_Send_init} \cxxinline{MPI_Recv_init}, initialize the communication + \item Same signature as non-blocking communications + \item \cxxinline{MPI_Start}, \cxxinline{MPI_Startall} to start the communication + \item Completion is checked the same way as for non-blocking + \end{itemize} +\end{frame} + +\begin{frame}[exercise, fragile] + \frametitle{Persistent communications} + \framesubtitle{} + + \begin{itemize} + \item Replace the non-blocking communication in the Poisson code by persistent ones + \end{itemize} +\end{frame} + + +\subsection{Advanced collective communications} +\subsubsection{V versions} +\begin{frame}[fragile] + \frametitle{Collective communications} + \framesubtitle{V extension to \cxxinline{MPI\_Gather}} + + \begin{cxxcode}{Syntax} + int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, int root, MPI_Comm comm); + \end{cxxcode} + + \begin{itemize} + \item \cxxinline{recvcounts} is now an array, one entry per rank + \item \cxxinline{displs} array of displacements defining where to place the + $i^{\mathrm{th}}$ receive data + \item receive different sizes per process + \item receive in an array with strides + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Collective communications} + \framesubtitle{Gatherv semantic} + + \begin{cxxcode}{Semantic equivalent} + // Every process + MPI_Send(sendbuf, sendcount, sendtype, root, /*...*/); + + // On root process + for(i = 0; i < nb_process; ++i) + MPI_Recv(recvbuf+displs[j] * extent(recvtype), recvcounts[j], recvtype, i, + /*...*/); + \end{cxxcode} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Collective communications} + \framesubtitle{V extension to \cxxinline{MPI\_Scatter}} + + \begin{cxxcode}{Syntax} + int MPI_Scatterv(const void *sendbuf, const int sendcounts[], + const int displs[], MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm); + \end{cxxcode} + + \begin{itemize} + \item \cxxinline{sendcounts} is now an array, one entry per rank + \item \cxxinline{displs} array of displacements defining where to place the + $i^{\mathrm{th}}$ receive data + \item receive different sizes + \item receive in an array with strides + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Collective communications} + \framesubtitle{Scatterv semantic} + + \begin{cxxcode}{Semantic equivalent} + // 
On root process + for(i = 0; i < nb_process; ++i) + MPI_Send(sendbuf+displs[i]*extent(sendtype), sendcounts[i], sendtype, i, + /*...*/) + + // Every process + MPI_Recv(recvbuf, recvcount, recvtype, i, /*...*/). + \end{cxxcode} +\end{frame} + +\subsubsection{Non-blocking collective communications} + +\begin{frame}[fragile] + \frametitle{Non-blocking collective communications} + \framesubtitle{} + + \begin{itemize} + \item \code{I} variant of collective communications + \item extra parameter \cxxinline{request} + \item \cxxinline{MPI_Ibarrier}, \cxxinline{MPI_Ibcast} + \item \cxxinline{MPI_Igather}, \cxxinline{MPI_Igatherv}, + \cxxinline{MPI_Iscatter}, \cxxinline{MPI_Iscatterv} + \item \cxxinline{MPI_Iallgather}, \cxxinline{MPI_Iallgatherv}, + \cxxinline{MPI_Ialltoall} + \item \cxxinline{MPI_Ireduce}, \cxxinline{MPI_Iallreduce}, + \cxxinline{MPI_Iscan}, \cxxinline{MPI_Iexscan} + \end{itemize} +\end{frame} + + +\subsubsection{Persistent collective communications} + +\begin{frame}[fragile] + \frametitle{Persistent collective communications} + \framesubtitle{} + + \begin{itemize} + \item \code{_init} variant of collective communications + \item extra parameter \cxxinline{request} + \item \cxxinline{MPI_Barrier_init}, \cxxinline{MPI_Bcast_init} + \item \cxxinline{MPI_Gather_init}, \cxxinline{MPI_Gatherv_init}, + \cxxinline{MPI_Scatter_init}, \cxxinline{MPI_Scatterv_init} + \item \cxxinline{MPI_Allgather_init}, \cxxinline{MPI_Allgatherv_init}, + \cxxinline{MPI_Alltoall_init} + \item \cxxinline{MPI_Reduce_init}, \cxxinline{MPI_Allreduce_init}, + \cxxinline{MPI_Scan_init}, \cxxinline{MPI_Exscan_init} + + \end{itemize} +\end{frame} + +\begin{frame}[exercise, fragile] + \frametitle{Persistent collective} + \framesubtitle{} + + \begin{itemize} + \item Replace the the \cxxinline{MPI_Allreduce} by a persistent one + \end{itemize} +\end{frame} + + +\subsection{Derived Datatypes} + +\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Definition of a datatypes} + + \begin{itemize} + \item \cxxinline{MPI_Datatype} opaque type containing a \emph{Typemap} + \begin{itemize} + \item $Typemap = \{(type_{0},disp_{0}), \dotsb, (type_{n - 1},disp_{n - 1})\}$ + \item sequence of basic datatypes + \item sequence of displacements (in bytes) + \end{itemize} + \item \code{extent} is the span from the first byte to the last one, with alignment requirement + \begin{align*} + lb(Typemap) &= \underset{j}{min}(disp_{j}),\\ + ub(Typemap) &= \underset{j}{max}(disp_{j} + \mathrm{sizeof}(type_{j})) + \epsilon, and\\ + extent(Typemap) &= ub(Typemap) - lb(Typemap) + \end{align*} + $\epsilon$ is there to account for alignment requirements + \end{itemize} + +\end{frame} + +\begin{frame} + \frametitle{Derived Datatypes} + \framesubtitle{Base datatypes} + \begin{minipage}{.45\linewidth} + \small + \begin{tabular}{ll} + \toprule + MPI datatype & C datatype\\ + \midrule + \cxxinline{MPI_CHAR} & \cxxinline{char} \\ + \cxxinline{MPI_SHORT} & \cxxinline{signed short int} \\ + \cxxinline{MPI_INT} & \cxxinline{signed int} \\ + \cxxinline{MPI_LONG} & \cxxinline{signed long int} \\ + \cxxinline{MPI_LONG_LONG_INT} & \cxxinline{signed long long int} \\ + \cxxinline{MPI_LONG_LONG} & \cxxinline{signed long long int} \\ + \cxxinline{MPI_SIGNED_CHAR} & \cxxinline{signed char} \\ + \cxxinline{MPI_UNSIGNED_CHAR} & \cxxinline{unsigned char} \\ + \cxxinline{MPI_UNSIGNED_SHORT} & \cxxinline{unsigned short int} \\ + \cxxinline{MPI_UNSIGNED} & \cxxinline{unsigned int} \\ + \cxxinline{MPI_UNSIGNED_LONG} & \cxxinline{unsigned long int} 
\\ + \cxxinline{MPI_UNSIGNED_LONG_LONG} & \cxxinline{unsigned long long int} \\ + \bottomrule + \end{tabular} + \end{minipage} + \hspace{1cm} + \begin{minipage}{.45\linewidth} + \small + \begin{tabular}{ll} + \toprule + MPI datatype & C datatype\\ + \midrule + \cxxinline{MPI_FLOAT} & \cxxinline{float} \\ + \cxxinline{MPI_DOUBLE} & \cxxinline{double} \\ + \cxxinline{MPI_LONG_DOUBLE} & \cxxinline{long double} \\ + \cxxinline{MPI_WCHAR} & \cxxinline{wchar_t} \\ + + \cxxinline{MPI_C_BOOL} & \cxxinline{_Bool} \\ + \cxxinline{MPI_INT8_T} & \cxxinline{int8_t} \\ + \cxxinline{MPI_INT16_T} & \cxxinline{int16_t} \\ + \cxxinline{MPI_INT32_T} & \cxxinline{int32_t} \\ + \cxxinline{MPI_INT64_T} & \cxxinline{int64_t} \\ + \cxxinline{MPI_UINT8_T} & \cxxinline{uint8_t} \\ + \cxxinline{MPI_UINT16_T} & \cxxinline{uint16_t} \\ + \cxxinline{MPI_UINT32_T} & \cxxinline{uint32_t} \\ + \cxxinline{MPI_UINT64_T} & \cxxinline{uint64_t} \\ + \bottomrule + \end{tabular} + \end{minipage} +\end{frame} + +\begin{frame} + \frametitle{Derived Datatypes} + \framesubtitle{Base datatypes} + + \begin{minipage}{.45\linewidth} + \small + \begin{tabular}{ll} + \toprule + MPI datatype & C++ datatype\\ + \midrule + \cxxinline{MPI_CXX_BOOL} & \cxxinline{bool} \\ + \cxxinline{MPI_CXX_FLOAT_COMPLEX} & \cxxinline{std::complex} \\ + \cxxinline{MPI_CXX_DOUBLE_COMPLEX} & \cxxinline{std::complex} \\ + \cxxinline{MPI_CXX_LONG_DOUBLE_COMPLEX} & \cxxinline{std::complex}\\ + \bottomrule + \end{tabular} + \end{minipage} + \hspace{1.8cm} + \begin{minipage}{.3\linewidth} + \small + \begin{tabular}{ll} + \toprule + MPI datatype & C datatype\\ + \midrule + \cxxinline{MPI_AINT} & \cxxinline{MPI_Aint} \\ + \cxxinline{MPI_OFFSET} & \cxxinline{MPI_Offset} \\ + \cxxinline{MPI_COUNT} & \cxxinline{MPI_Count} \\ + \cxxinline{MPI_BYTE} & \\ + \cxxinline{MPI_PACKED} & \\ + \bottomrule + \end{tabular} + \end{minipage} +\end{frame} + +\note{ + \begin{itemize} + \item \cxxinline{MPI_CHAR} is a printable character where \cxxinline{MPI_BYTE} is a type of exactly 8bit not printable as a character + \item \cxxinline{MPI_PACKED} for pack/unpacked + \end{itemize} +} + +\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Arrays} + + \begin{cxxcode}{Syntax} + int MPI_Type_contiguous(int count, MPI_Datatype oldtype, + MPI_Datatype *newtype); + + int MPI_Type_vector(int count, int blocklength, int stride, + MPI_Datatype oldtype, MPI_Datatype *newtype); + \end{cxxcode} + + \begin{itemize} + \item array of contiguous array or with strided blocks of same type + \item \cxxinline{count}: number of repetition (blocks) + \item \cxxinline{blocklength}: number of element per block + \item \cxxinline{stride}: number of element between start of each block + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Array variants} + + \begin{itemize} + \item \cxxinline{MPI_Type_create_hvector}: same as \cxxinline{MPI_Type_vector} with \cxxinline{stride} expressed in bytes + \item \cxxinline{MPI_Type_create_indexed_block} same as \cxxinline{MPI_Type_vector} with array of and \cxxinline{displacements} + \item \cxxinline{MPI_Type_create_hindexed_block}: same as \cxxinline{MPI_Type_create_indexed_block} with \cxxinline{displacements} in bytes + \item \cxxinline{MPI_Type_indexed}: same as \cxxinline{MPI_Type_create_indexed_block} with arrays of \cxxinline{blocklengths} + \item \cxxinline{MPI_Type_create_hindexed}: same as \cxxinline{MPI_Type_indexed} with \cxxinline{displacements} in bytes + \end{itemize} +\end{frame} + 
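+% A minimal sketch (not one of the repository examples) of MPI_Type_vector:
+% one column of a row-major N x M matrix of doubles; N, M, j and dest are
+% assumed to be defined, and the type must be committed before use.
+\begin{frame}[fragile]
+  \frametitle{Derived Datatypes}
+  \framesubtitle{Sketch: a column of a row-major matrix}
+
+  \begin{cxxcode}{Sketch: column datatype with a vector type}
+    std::vector<double> a(N * M); // row-major N x M matrix
+    MPI_Datatype column_t;
+    // N blocks of 1 double, consecutive blocks start M doubles apart
+    MPI_Type_vector(N, 1, M, MPI_DOUBLE, &column_t);
+    MPI_Type_commit(&column_t);
+    // column j starts at a.data() + j and is 1 element of type column_t
+    MPI_Ssend(a.data() + j, 1, column_t, dest, 0, MPI_COMM_WORLD);
+    MPI_Type_free(&column_t);
+  \end{cxxcode}
+\end{frame}
+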
+\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Structures} + + \begin{cxxcode}{Syntax} + int MPI_Type_create_struct(int count, const int array_of_blocklengths[], + const MPI_Aint array_of_displacements[], + const MPI_Datatype array_of_types[], MPI_Datatype *newtype) + \end{cxxcode} + + \begin{itemize} + \item \cxxinline{count}: number of repetitions (blocks) + \item \cxxinline{array_of_blocklengths}: sizes per block + \item \cxxinline{array_of_displacements}: displacements between blocks in bytes + \item \cxxinline{array_of_types}: type contained in each block + \end{itemize} +\end{frame} +
+\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Useful helper functions} + + \begin{itemize} + \item \cxxinline{MPI_Get_address}: get the address of a variable + \item \cxxinline{MPI_Aint_diff}: get the difference between 2 addresses + \item \cxxinline{MPI_Aint_add}: get the sum of 2 addresses + \item \cxxinline{MPI_Type_size}: get the size of a datatype + \item \cxxinline{MPI_Type_get_extent}: get the lower bound and the extent of a type + \item \cxxinline{MPI_Type_create_resized}: reset the lower bound and the extent of a type + \end{itemize} +\end{frame} +
+\note{ + \begin{itemize} + \item Prefer \cxxinline{MPI_Get_address} over \& + \item if the extent is badly set it is not possible to communicate multiple objects of the same datatype + \end{itemize} +} +
+\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Commit/free} + + \begin{cxxcode}{Syntax} + int MPI_Type_commit(MPI_Datatype *datatype); + + int MPI_Type_free(MPI_Datatype *datatype); + \end{cxxcode} + + \begin{itemize} + \item new datatypes must be committed before being usable in communications + \item committed types need to be freed once they are not used anymore + \end{itemize} +\end{frame} + +
+\begin{frame}[fragile] + \frametitle{Derived Datatypes} + \framesubtitle{Example} + + \cxxfile[title={mpi/datatypes.cc}, + minted options app={ + firstline=13, + lastline=41, + fontsize=\tiny}]{examples/mpi/datatypes.cc} +\end{frame} +
+\begin{frame}[fragile, exercise] + \frametitle{Derived Datatypes} + \framesubtitle{Send lines in the Poisson code} + + \begin{itemize} + \item Create a \cxxinline{MPI_Datatype line_t} representing a line of data + \item Exchange data of type \cxxinline{line_t} instead of \cxxinline{MPI_FLOAT} + \end{itemize} +\end{frame} +
+\subsection{Pack/Unpack} +
+\begin{frame}[fragile] + \frametitle{Pack/Unpack} + \framesubtitle{Pack} + + \begin{cxxcode}{Syntax} + int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype, + void *outbuf, int outsize, int *position, MPI_Comm comm); + \end{cxxcode} + + \begin{itemize} + \item \cxxinline{inbuf}, \cxxinline{incount}, \cxxinline{datatype} correspond to the description of the data to pack + \item \cxxinline{outbuf}, \cxxinline{outsize} description of the buffer where to pack + \item \cxxinline{position} current position in the packing buffer + \end{itemize} +\end{frame} +
+\begin{frame}[fragile] + \frametitle{Pack/Unpack} + \framesubtitle{Unpack} + + \begin{cxxcode}{Syntax} + int MPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, + int outcount, MPI_Datatype datatype, MPI_Comm comm); + \end{cxxcode} + + \begin{itemize} + \item \cxxinline{inbuf}, \cxxinline{insize}: description of the buffer from which to unpack + \item \cxxinline{position} current position in the unpacking buffer + \item \cxxinline{outbuf}, \cxxinline{outcount}, and \cxxinline{datatype} correspond to the description of data to
unpack
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{Pack/Unpack}
+  \framesubtitle{Example}
+
+  \cxxfile[title={mpi/pack_unpack.cc},
+  minted options app={
+    firstline=26,
+    lastline=39
+  }]{examples/mpi/pack_unpack.cc}
+\end{frame}
+
+\subsection{Groups and Communicators}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Groups and Communicators}
+
+  \begin{itemize}
+  \item a \code{communicator}:
+    \begin{itemize}
+    \item encapsulates a \code{context}, a \code{group}, a \code{virtual topology} and \code{attributes}
+    \item two kinds: \code{intra-communicator} and \code{inter-communicator}
+    \end{itemize}
+  \item a \code{group}:
+    \begin{itemize}
+    \item ordered set of processes
+    \item each process has a unique ID (its rank within the group) and can belong to several different groups
+    \item a group can be used to create a new communicator
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\note{
+  \begin{itemize}
+  \item \code{intra}: communications inside a group
+  \item \code{inter}: communications between groups
+  \end{itemize}
+}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Groups and Communicators}
+  \framesubtitle{Creating new communicators}
+
+  \begin{itemize}
+  \item duplicating or splitting an existing one: \cxxinline{MPI_Comm_dup}, \cxxinline{MPI_Comm_split}
+  \item creating a communicator from a group: \cxxinline{MPI_Comm_create}, \cxxinline{MPI_Comm_create_group}
+  \item this requires creating groups:
+    \begin{itemize}
+    \item from a communicator: \cxxinline{MPI_Comm_group}
+    \item with boolean operations: \cxxinline{MPI_Group_union},
+      \cxxinline{MPI_Group_intersection}, \cxxinline{MPI_Group_difference}
+    \item by specifying ranks: \cxxinline{MPI_Group_incl}, \cxxinline{MPI_Group_excl}
+    \end{itemize}
+  \item destroy the created objects: \cxxinline{MPI_Comm_free},
+    \cxxinline{MPI_Group_free}
+  \end{itemize}
+\end{frame}
+
+
+\subsection{Virtual Topologies}
+
+\begin{frame}
+  \frametitle{Virtual Topologies}
+  \framesubtitle{}
+
+  \begin{itemize}
+  \item potential performance gain by mapping processes to the hardware
+  \item helps program readability
+  \item types of topologies: Cartesian, Graph, Distributed Graph
+  \item collective communication on neighborhoods
+  \end{itemize}
+\end{frame}
+
+\note{
+  Details only on the Cartesian one
+}
+
+\begin{frame}[fragile]
+  \frametitle{Virtual Topologies}
+  \framesubtitle{Cartesian topology}
+  \begin{cxxcode}{Syntax}
+    int MPI_Cart_create(MPI_Comm comm_old, int ndims, const int dims[],
+                        const int periods[], int reorder, MPI_Comm *comm_cart);
+  \end{cxxcode}
+
+  \begin{itemize}
+  \item creates a communicator with Cartesian topology information (a usage sketch follows the next slide)
+  \item convenient functions:
+    \begin{itemize}
+    \item \cxxinline{MPI_Dims_create} helps create a balanced distribution of processes
+    \item \cxxinline{MPI_Cart_shift} helps determine the neighbors
+    \item \cxxinline{MPI_Cart_rank} gets the rank from coordinates
+    \item \cxxinline{MPI_Cart_coords} gets the coordinates from a rank
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Neighborhood collectives}
+  \framesubtitle{}
+
+  \begin{itemize}
+  \item collective communications restricted to the neighborhood defined by the virtual topology
+  \item \cxxinline{MPI_Neighbor_allgather}: send the same data to all neighbors, receive one block per neighbor
+  \item \cxxinline{MPI_Neighbor_alltoall}: send a distinct block of data to each neighbor
+  \item variable-size variants: \cxxinline{MPI_Neighbor_allgatherv}, \cxxinline{MPI_Neighbor_alltoallv}, \cxxinline{MPI_Neighbor_alltoallw}
+  \end{itemize}
+\end{frame}
+
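+\begin{frame}[fragile]
+  \frametitle{Virtual Topologies}
+  \framesubtitle{Cartesian topology: usage sketch}
+
+  A minimal sketch (not part of the course examples) of building a periodic 2D
+  Cartesian communicator and finding the neighbors along the first dimension;
+  the variable names are illustrative only.
+
+  \begin{cxxcode}{Example}
+    int size, rank;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    int dims[2] = {0, 0};    // let MPI choose a balanced 2D grid
+    MPI_Dims_create(size, 2, dims);
+
+    int periods[2] = {1, 1}; // periodic in both directions
+    MPI_Comm cart_comm;
+    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, /*reorder*/ 1, &cart_comm);
+
+    MPI_Comm_rank(cart_comm, &rank);
+    int left, right;
+    MPI_Cart_shift(cart_comm, 0, 1, &left, &right); // neighbors along dim 0
+
+    MPI_Comm_free(&cart_comm);
+  \end{cxxcode}
+\end{frame}
+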
+\subsection{Parallel I/O}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Introductory remarks}
+  \begin{itemize}
+  \item {I/O is often (if not always) the main bottleneck in a parallel application}
+  \item {MPI provides a mechanism to read/write in parallel}
+  \end{itemize}
+
+  \begin{center}
+    \input{day3/images/parallelFS.tex}
+  \end{center}
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{Introductory remarks}
+  \begin{itemize}
+  \item {The MPI IO API works on your desktop/laptop}
+  \item {Most large HPC systems have a \textbf{parallel file system} (GPFS, Lustre, etc.)}
+  \item {If the file is distributed smartly on a parallel file system, performance increases}
+  \item {MPI IO offers a high-level API to access a distributed file (no need to implement complex POSIX calls)}
+  \item {\textbf{does not work with ASCII files}}
+  \item {Most of the standard file formats support MPI IO (e.g. HDF5, NetCDF)}
+  \end{itemize}
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{Poisson so far}
+  \begin{center}
+    \input{day3/images/sofar.tex}
+  \end{center}
+\end{frame}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Poisson ideal}
+  \begin{center}
+    \input{day3/images/sogoal.tex}
+  \end{center}
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{Open/Close a file in parallel}
+  \begin{itemize}
+  \item {\verb+comm+: the communicator that contains the writing/reading MPI processes}
+  \item {\verb+*filename+: a file name}
+  \item {\verb+amode+: file access mode (read only \verb+MPI_MODE_RDONLY+, read/write \verb+MPI_MODE_RDWR+, create \verb+MPI_MODE_CREATE+, etc.)}
+  \item {\verb+info+: file info object}
+  \item {\verb+*fh+: file handle}
+  \end{itemize}
+
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_open(MPI_Comm comm, const char *filename, int amode, MPI_Info info, MPI_File *fh)
+\end{lstlisting}
+
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_close(MPI_File *fh)
+\end{lstlisting}
+  \textbf{Collective calls!}
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{etype, offset and displacement}
+  \begin{itemize}
+  \item {\textbf{etype} is the elementary datatype of the data in the file accessed in parallel}
+  \item {\textbf{offset} is a position in the file expressed as a multiple of etypes}
+  \item {\textbf{displacement} of a position within the file is the number of bytes from the beginning of the file}
+  \end{itemize}
+  \begin{center}
+    \input{day3/images/offset.tex}
+  \end{center}
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{Simple independent read/write}
+  \begin{itemize}
+  \item {Can be used from a single process (or a group of processes)}
+  \item {The \verb+offset+ explicitly gives the position in the file (in etypes)}
+  \item {\verb+count+ elements of type \verb+datatype+ are written from / read into \verb+*buf+ (see the sketch on the next slide)}
+  \end{itemize}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
+\end{lstlisting}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
+\end{lstlisting}
+\end{frame}
+
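+\begin{frame}[containsverbatim]
+  \frametitle{Independent write: usage sketch}
+
+  A minimal sketch (not part of the course examples) where each process
+  writes its own block of \verb+N+ integers at a rank-dependent offset; the
+  file name and variable names are illustrative only.
+
+\begin{lstlisting}[language=C,frame=lines]
+MPI_File fh;
+int buf[N];  /* N values owned by this process */
+MPI_File_open(MPI_COMM_WORLD, "out.dat",
+              MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+
+/* with the default view the etype is MPI_BYTE, so the offset is in bytes */
+MPI_Offset offset = (MPI_Offset)rank * N * sizeof(int);
+MPI_File_write_at(fh, offset, buf, N, MPI_INT, MPI_STATUS_IGNORE);
+
+MPI_File_close(&fh);
+\end{lstlisting}
+\end{frame}
+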
+\begin{frame}[containsverbatim]
+  \frametitle{\texttt{view} by each process}
+  \begin{itemize}
+  \item {Initially, each process views the file as a linear byte stream, and each process views the data in its own native representation}
+  \item {this is changed using \verb+MPI_File_set_view+ (a usage sketch is given at the end of this subsection)}
+  \item {\verb+disp+ is the displacement (it defines the beginning of the data of the file that belongs to the process) in bytes}
+  \item {\verb+etype+ is the elementary type}
+  \end{itemize}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, MPI_Datatype filetype, const char *datarep, MPI_Info info)
+\end{lstlisting}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_get_view(MPI_File fh, MPI_Offset *disp, MPI_Datatype *etype, MPI_Datatype *filetype, char *datarep)
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Setting up a \texttt{view}}
+  \begin{center}
+    \input{day3/images/displacements.tex}
+  \end{center}
+  (source: MPI 2.2 specification)
+\end{frame}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Simple independent read/write without offset}
+  \begin{itemize}
+  \item {the \texttt{view} is specified prior to the call}
+  \end{itemize}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_write(MPI_File fh, const void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
+\end{lstlisting}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_read(MPI_File fh, void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
+\end{lstlisting}
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{Collective read/write with/without offset}
+  \begin{itemize}
+  \item {Same structure as the independent routines but with \verb+_all+ at the end}
+  \item {for instance:}
+  \end{itemize}
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_File_write_all(MPI_File fh, const void *buf, int count, MPI_Datatype datatype, MPI_Status *status)
+\end{lstlisting}
+\end{frame}
+
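+\begin{frame}[containsverbatim]
+  \frametitle{\texttt{view}: usage sketch}
+
+  A minimal sketch (not part of the course examples) where each process sets
+  a view on its own contiguous block of \verb+N+ integers in an already opened
+  file \verb+fh+, then writes without an explicit offset; variable names are
+  illustrative only.
+
+\begin{lstlisting}[language=C,frame=lines]
+/* displacement in bytes, then etype and filetype both MPI_INT */
+MPI_Offset disp = (MPI_Offset)rank * N * sizeof(int);
+MPI_File_set_view(fh, disp, MPI_INT, MPI_INT, "native", MPI_INFO_NULL);
+
+/* offsets are now relative to the view: write N ints at its beginning */
+MPI_File_write(fh, buf, N, MPI_INT, MPI_STATUS_IGNORE);
+\end{lstlisting}
+\end{frame}
+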
+\subsection{One-Sided Communications}
+
+\begin{frame}[containsverbatim]
+  \frametitle{One-sided communication}
+  \begin{itemize}
+  \item {An MPI process can access another MPI process's memory directly (RMA, remote memory access)}
+  \item {No explicit coordination between both processes}
+  \item {explicit transfer, explicit synchronization}
+  \item {Potentially better performance}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[containsverbatim]
+  \frametitle{One-sided communication}
+  Initialization/free (of the \textit{window}, a region of memory made accessible to other processes)
+  \begin{itemize}
+  \item {\verb+MPI_Alloc_mem()+, \verb+MPI_Free_mem()+}
+  \item {\verb+MPI_Win_create()+, \verb+MPI_Win_free()+}
+  \end{itemize}
+  Remote memory access
+  \begin{itemize}
+  \item {\verb+MPI_Put()+ (like send)}
+  \item {\verb+MPI_Get()+ (like recv)}
+  \item {\verb+MPI_Accumulate()+ (like reduce)}
+  \end{itemize}
+  Synchronization
+  \begin{itemize}
+  \item {\verb+MPI_Win_fence()+}
+  \item {\verb+MPI_Win_post()+, \verb+MPI_Win_start()+, \verb+MPI_Win_complete()+, \verb+MPI_Win_wait()+}
+  \item {\verb+MPI_Win_lock()+, \verb+MPI_Win_unlock()+}
+  \end{itemize}
+
+\end{frame}
+
+\begin{frame}[containsverbatim]
+  \frametitle{Memory allocation}
+  \begin{itemize}
+  \item {allocates a memory segment of \verb+size+ bytes}
+  \item {\verb+info+ can be used to provide directives that control the desired location of the allocated memory}
+  \item {\verb+*baseptr+ is the pointer to the beginning of the memory segment}
+  \end{itemize}
+
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr)
+\end{lstlisting}
+
+\end{frame}
+
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{Memory \texttt{window} creation}
+  \begin{itemize}
+  \item {An \verb+MPI_Win+ is an opaque object which can be reused to perform one-sided communication}
+  \item {A \verb+window+ is a region of memory that can be accessed by other processes}
+  \end{itemize}
+
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, MPI_Win *win)
+\end{lstlisting}
+
+  where \verb+base+ is the initial address of the region, \verb+size+ its length in bytes, and \verb+disp_unit+ the size (in bytes) of the unit used for displacements.
+
+\end{frame}
+
+\begin{frame}[containsverbatim]
+  \frametitle{\texttt{Put}/\texttt{Get} within the \texttt{window}}
+  \begin{itemize}
+  \item {close to an \verb+MPI_Send+ call with
+    \begin{itemize}
+    \item {\textit{what to send}: the buffer starting at \verb+origin_addr+ containing \verb+origin_count+ elements of type \verb+origin_datatype+}
+    \item {\textit{to which process}: \verb+target_rank+, at displacement \verb+target_disp+ in its window, \verb+target_count+ elements of type \verb+target_datatype+}
+    \item {\textit{in which context}: within the window \verb+win+}
+    \end{itemize}
+  }
+  % \item {}
+  \end{itemize}
+
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win)
+\end{lstlisting}
+
+\begin{lstlisting}[language=C,frame=lines]
+int MPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win)
+\end{lstlisting}
+
+
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{One-sided communications example}
+
+\begin{lstlisting}[language=C,frame=lines]
+MPI_Win win;
+float *mem;
+float x = 1.0;
+// expose `size` floats of local memory through a window
+MPI_Alloc_mem(size * sizeof(float), MPI_INFO_NULL, &mem);
+MPI_Win_create(mem, size * sizeof(float), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+MPI_Win_fence(0, win);
+// Write x at position `rank` within process 0's memory
+MPI_Put(&x, 1, MPI_FLOAT, 0, rank, 1, MPI_FLOAT, win);
+MPI_Win_fence(0, win);
+
+MPI_Win_free(&win);
+MPI_Free_mem(mem);
+\end{lstlisting}
+
+
+\end{frame}
+
+
+\begin{frame}[containsverbatim]
+  \frametitle{One-sided communications remarks}
+
+  \begin{itemize}
+  % \item {Three primitives : Put (like a send), Get (like a recv) and accumulate (like a reduction)}
+  % \item {synchronizations : fence / post-start-complete-wait / lock-unlock}
+  \item {Pay attention to memory coherence}
+  \item {Can be dangerous: how does a process know if its data is in use or modified?}
+  \item {MPI-3 provides new features: \begin{itemize}
+    \item cache-coherent windows,
+    \item new primitives \verb+MPI_Get_accumulate()+, \verb+MPI_Fetch_and_op()+, \verb+MPI_Compare_and_swap()+,
+    \item request-based primitives like \verb+MPI_R{put,get,accumulate,get_accumulate}+,
+    \item ``all''-versions of the synchronization routines: \verb+MPI_Win_{un}lock_all+, \verb+MPI_Win_flush{_all}+, \verb+MPI_Win_flush_local{_all}+
+    \end{itemize}
+  }
+  % \item {}
+  \end{itemize}
+\end{frame}
+
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../../phys_743_parallel_programming"
+%%% End: