diff --git a/pi/solutions-mpi/Makefile b/pi/solutions-mpi/Makefile
index 788c84f..fc4e726 100644
--- a/pi/solutions-mpi/Makefile
+++ b/pi/solutions-mpi/Makefile
@@ -1,12 +1,12 @@
-OPTIM+=-O3 -march=native
+OPTIM ?=-O3 -march=native
 CXX=mpicxx
 CC=mpicxx
 LD=${CXX}
 CXXFLAGS+=-Wall -Wextra -std=c++11 $(OPTIM)
 LDFLAGS+=-lm
-EXE=pi_p2p_async.o pi_gather_bcast.o pi_reduce.o pi_p2p_sendrecv.o pi_p2p_sync.o
+EXE=pi_p2p_async pi_gather_bcast pi_reduce pi_p2p_sendrecv pi_p2p_sync pi_p2p_persistent pi_1sided_put pi_1sided_get pi_io_at
 
 all: clean $(EXE)
 
 clean:
 	rm -f $(EXE) *.o *~
diff --git a/pi/solutions-mpi/pi_1sided_get.cc b/pi/solutions-mpi/pi_1sided_get.cc
new file mode 100644
index 0000000..2c960a0
--- /dev/null
+++ b/pi/solutions-mpi/pi_1sided_get.cc
@@ -0,0 +1,84 @@
+/*
+  This exercise is taken from the class Parallel Programming Workshop (MPI,
+  OpenMP and Advanced Topics) at HLRS given by Rolf Rabenseifner
+ */
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <mpi.h>
+
+using clk = std::chrono::high_resolution_clock;
+using second = std::chrono::duration<double>;
+using time_point = std::chrono::time_point<clk>;
+
+inline int digit(double x, int n) {
+  return std::trunc(x * std::pow(10., n)) - std::trunc(x * std::pow(10., n - 1)) * 10.;
+}
+
+inline double f(double a) { return (4. / (1. + a * a)); }
+
+const int n = 10000000;
+
+int main(int /* argc */, char ** /* argv */) {
+  int i;
+  double dx, x, sum, pi;
+
+  MPI_Init(NULL, NULL);
+  int prank, psize;
+  MPI_Comm_rank(MPI_COMM_WORLD, &prank);
+  MPI_Comm_size(MPI_COMM_WORLD, &psize);
+  auto t1 = clk::now();
+
+  auto ln = n / psize + (prank < n % psize ? 1 : 0);
+
+  auto i_start = prank * ln + (prank < n % psize ? 0 : n % psize);
+  auto i_end = i_start + ln;
+
+  /* calculate pi = integral [0..1] 4 / (1 + x**2) dx */
+  dx = 1. / n;
+  double lsum = 0.0;
+  for (i = i_start; i < i_end; i++) {
+    x = 1. * i * dx;
+    lsum = lsum + f(x);
+  }
+
+  double send, recv;
+
+  //auto next = (prank + 1) % psize;
+  auto prev = (prank - 1 + psize) % psize;
+
+  MPI_Win win;
+  MPI_Win_create(&send, static_cast<MPI_Aint>(sizeof(double)), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+  send = sum = lsum;
+
+  for (int p = 0; p < psize - 1; ++p) {
+    MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, win);
+    MPI_Get(&recv, 1, MPI_DOUBLE, prev, static_cast<MPI_Aint>(0), 1, MPI_DOUBLE, win);
+    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, win);
+
+    sum += recv;
+    send = recv;
+  }
+
+  pi = dx * sum;
+
+  MPI_Win_free(&win);
+
+  second elapsed = clk::now() - t1;
+
+  if (prank == 0) {
+    std::printf("computed pi = %.16g\n", pi);
+    std::printf("wall clock time (chrono) = %.4gs\n", elapsed.count());
+    std::printf("n mpi process = %d\n", psize);
+
+
+    for(int d = 1; d <= 15; ++d) {
+      std::printf("%d", digit(pi, d));
+    }
+  }
+
+  MPI_Finalize();
+
+  return 0;
+}
diff --git a/pi/solutions-mpi/pi_1sided_put.cc b/pi/solutions-mpi/pi_1sided_put.cc
new file mode 100644
index 0000000..288618a
--- /dev/null
+++ b/pi/solutions-mpi/pi_1sided_put.cc
@@ -0,0 +1,84 @@
+/*
+  This exercise is taken from the class Parallel Programming Workshop (MPI,
+  OpenMP and Advanced Topics) at HLRS given by Rolf Rabenseifner
+ */
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <mpi.h>
+
+using clk = std::chrono::high_resolution_clock;
+using second = std::chrono::duration<double>;
+using time_point = std::chrono::time_point<clk>;
+
+inline int digit(double x, int n) {
+  return std::trunc(x * std::pow(10., n)) - std::trunc(x * std::pow(10., n - 1)) * 10.;
+}
+
+inline double f(double a) { return (4. / (1.
+ a * a)); }
+
+const int n = 10000000;
+
+int main(int /* argc */, char ** /* argv */) {
+  int i;
+  double dx, x, sum, pi;
+
+  MPI_Init(NULL, NULL);
+  int prank, psize;
+  MPI_Comm_rank(MPI_COMM_WORLD, &prank);
+  MPI_Comm_size(MPI_COMM_WORLD, &psize);
+  auto t1 = clk::now();
+
+  auto ln = n / psize + (prank < n % psize ? 1 : 0);
+
+  auto i_start = prank * ln + (prank < n % psize ? 0 : n % psize);
+  auto i_end = i_start + ln;
+
+  /* calculate pi = integral [0..1] 4 / (1 + x**2) dx */
+  dx = 1. / n;
+  double lsum = 0.0;
+  for (i = i_start; i < i_end; i++) {
+    x = 1. * i * dx;
+    lsum = lsum + f(x);
+  }
+
+  double send, recv;
+
+  send = sum = lsum;
+
+  auto next = (prank + 1) % psize;
+  //auto prev = (prank - 1 + psize) % psize;
+
+  MPI_Win win;
+  MPI_Win_create(&recv, static_cast<MPI_Aint>(sizeof(double)), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+  for (int p = 0; p < psize - 1; ++p) {
+    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, win);
+    MPI_Put(&send, 1, MPI_DOUBLE, next, static_cast<MPI_Aint>(0), 1, MPI_DOUBLE, win);
+    MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, win);
+
+    sum += recv;
+    send = recv;
+  }
+
+  pi = dx * sum;
+
+  MPI_Win_free(&win);
+
+  second elapsed = clk::now() - t1;
+
+  if (prank == 0) {
+    std::printf("computed pi = %.16g\n", pi);
+    std::printf("wall clock time (chrono) = %.4gs\n", elapsed.count());
+    std::printf("n mpi process = %d\n", psize);
+
+
+    for(int d = 1; d <= 15; ++d) {
+      std::printf("%d", digit(pi, d));
+    }
+  }
+
+  MPI_Finalize();
+
+  return 0;
+}
diff --git a/pi/solutions-mpi/pi_gather_bcast.cc b/pi/solutions-mpi/pi_gather_bcast.cc
new file mode 100644
index 0000000..f488652
--- /dev/null
+++ b/pi/solutions-mpi/pi_gather_bcast.cc
@@ -0,0 +1,78 @@
+/*
+  This exercise is taken from the class Parallel Programming Workshop (MPI,
+  OpenMP and Advanced Topics) at HLRS given by Rolf Rabenseifner
+ */
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <numeric>
+#include <vector>
+#include <mpi.h>
+
+using clk = std::chrono::high_resolution_clock;
+using second = std::chrono::duration<double>;
+using time_point = std::chrono::time_point<clk>;
+
+inline int digit(double x, int n) {
+  return std::trunc(x * std::pow(10., n)) - std::trunc(x * std::pow(10., n - 1)) * 10.;
+}
+
+inline double f(double a) { return (4. / (1. + a * a)); }
+
+const int n = 10000000;
+
+int main(int /* argc */, char ** /* argv */) {
+  int i;
+  double dx, x, sum, pi;
+
+  MPI_Init(NULL, NULL);
+  int prank, psize;
+  MPI_Comm_rank(MPI_COMM_WORLD, &prank);
+  MPI_Comm_size(MPI_COMM_WORLD, &psize);
+  auto t1 = clk::now();
+
+  auto ln = n / psize + (prank < n % psize ? 1 : 0);
+
+  auto i_start = prank * ln + (prank < n % psize ? 0 : n % psize);
+  auto i_end = i_start + ln;
+
+  /* calculate pi = integral [0..1] 4 / (1 + x**2) dx */
+  dx = 1. / n;
+  double lsum = 0.0;
+  for (i = i_start; i < i_end; i++) {
+    x = 1.
* i * dx;
+    lsum = lsum + f(x);
+  }
+
+  std::vector<double> partial_sums;
+  if (prank == 0) {
+    partial_sums.resize(psize);
+  }
+
+  MPI_Gather(&lsum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+  if (prank == 0) {
+    sum = std::accumulate(partial_sums.begin(), partial_sums.end(), 0.);
+  }
+
+  MPI_Bcast(&sum, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+  pi = dx * sum;
+
+  second elapsed = clk::now() - t1;
+
+  if (prank == 0) {
+    std::printf("computed pi = %.16g\n", pi);
+    std::printf("wall clock time (chrono) = %.4gs\n", elapsed.count());
+    std::printf("n mpi process = %d\n", psize);
+
+
+    for(int d = 1; d <= 15; ++d) {
+      std::printf("%d", digit(pi, d));
+    }
+  }
+
+  MPI_Finalize();
+
+  return 0;
+}
diff --git a/pi/solutions-mpi/pi_io_at.cc b/pi/solutions-mpi/pi_io_at.cc
new file mode 100644
index 0000000..6460702
--- /dev/null
+++ b/pi/solutions-mpi/pi_io_at.cc
@@ -0,0 +1,87 @@
+/*
+  This exercise is taken from the class Parallel Programming Workshop (MPI,
+  OpenMP and Advanced Topics) at HLRS given by Rolf Rabenseifner
+ */
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <vector>
+#include <mpi.h>
+
+using clk = std::chrono::high_resolution_clock;
+using second = std::chrono::duration<double>;
+using time_point = std::chrono::time_point<clk>;
+
+inline int digit(double x, int n) {
+  return std::trunc(x * std::pow(10., n)) - std::trunc(x * std::pow(10., n - 1)) * 10.;
+}
+
+inline double f(double a) { return (4. / (1. + a * a)); }
+
+const int n = 10000000;
+
+int main(int /* argc */, char ** /* argv */) {
+  int i;
+  double dx, x, sum, pi;
+  int psize, prank;
+
+  MPI_Init(NULL, NULL);
+
+  MPI_Comm_size(MPI_COMM_WORLD, &psize);
+  MPI_Comm_rank(MPI_COMM_WORLD, &prank);
+
+  auto mpi_t1 = MPI_Wtime();
+  auto t1 = clk::now();
+
+  int nlocal = n / psize;
+  int istart = 1 + nlocal * prank;
+  int iend = nlocal * (prank + 1);
+
+  /* calculate pi = integral [0..1] 4 / (1 + x**2) dx */
+  dx = 1. / n;
+  sum = 0.0;
+  for (i = istart; i <= iend; i++) {
+    x = (1.
* i - 0.5) * dx; + sum = sum + f(x); + } + + MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + + pi = dx * sum; + + auto mpi_elapsed = MPI_Wtime() - mpi_t1; + second elapsed = clk::now() - t1; + + if(prank == 0) { + std::printf("computed pi = %.16g\n", pi); + std::printf("wall clock time (mpi_wtime) = %.4gs with %d process\n", mpi_elapsed, psize); + std::printf("wall clock time (chrono) = %.4gs\n", elapsed.count()); + } + + char zero = '0'; + int ndigits = 16 / psize; + int dstart = ndigits * prank; + + std::vector digits(ndigits); + for (int d = 0; d < ndigits; ++d) { + digits[d] = zero + digit(pi, dstart + d); + } + + // open a file + MPI_File file; + MPI_File_open(MPI_COMM_WORLD, "pi.dat", MPI_MODE_WRONLY | MPI_MODE_CREATE, + MPI_INFO_NULL, &file); + MPI_File_set_size(file, 16); + + // write the vector with MPI_File_write_at + MPI_File_write_at(file, dstart, digits.data(), digits.size(), MPI_CHAR, + MPI_STATUS_IGNORE); + + // close the file + MPI_File_close(&file); + + MPI_Finalize(); + + return 0; +} diff --git a/pi/solutions-mpi/pi_p2p_derived_type.cc b/pi/solutions-mpi/pi_p2p_derived_type.cc new file mode 100644 index 0000000..67ff97e --- /dev/null +++ b/pi/solutions-mpi/pi_p2p_derived_type.cc @@ -0,0 +1,133 @@ +/* + This exercise is taken from the class Parallel Programming Workshop (MPI, + OpenMP and Advanced Topics) at HLRS given by Rolf Rabenseifner + */ +#include +#include +#include +#include + +using clk = std::chrono::high_resolution_clock; +using second = std::chrono::duration; +using time_point = std::chrono::time_point; + +inline int digit(double x, int n) { + return std::trunc(x * std::pow(10., n)) - std::trunc(x * std::pow(10., n - 1)) *10.; +} + +inline double f(double a) { return (4. / (1. + a * a)); } + +const int n = 10000000; + +struct Sum { + double sum; + int rank; + + Sum() = default; + virtual ~Sum() = default; + Sum(Sum && other) noexcept = default; + Sum(const Sum & other) = default; + Sum & operator=(Sum && other) noexcept = default; + Sum & operator=(const Sum & other) = default; + + Sum &operator+=(const Sum & other) { + sum += other.sum; + rank += other.rank; + return *this; + } +}; + +int main(int /* argc */ , char ** /* argv */) { + int i; + double dx, x, pi; + + MPI_Init(NULL, NULL); + int prank, psize; + MPI_Comm_rank(MPI_COMM_WORLD, &prank); + MPI_Comm_size(MPI_COMM_WORLD, &psize); + auto t1 = clk::now(); + + auto ln = n/psize + (prank < n % psize ? 1 : 0); + + auto i_start = prank * ln + (prank < n % psize ? 0 : n % psize); + auto i_end = i_start + ln; + + /* calculate pi = integral [0..1] 4 / (1 + x**2) dx */ + dx = 1. / n; + double lsum = 0.0; + for (i = i_start; i < i_end; i++) { + x = 1. 
* i * dx; + lsum = lsum + f(x); + } + + Sum send, recv, sum; + Sum send_buff[2]; + + + send.sum = sum.sum = lsum; + send.rank = sum.rank = prank; + + int blk_length[2] = {1, 1}; + + MPI_Aint zero_address, first_address, second_address; + MPI_Get_address(&send, &zero_address); + MPI_Get_address(&send.sum, &first_address); + MPI_Get_address(&send.rank, &second_address); + + MPI_Aint displs[2]; + displs[0] = MPI_Aint_diff(first_address, zero_address);; + displs[1] = MPI_Aint_diff(second_address, zero_address); + + MPI_Datatype types[2] = {MPI_DOUBLE, MPI_INT}; + MPI_Datatype Sum_mpi_t, Sum_array_mpi_t; + MPI_Type_create_struct(2, blk_length, displs, types, &Sum_mpi_t); + MPI_Type_commit(&Sum_mpi_t); + + MPI_Get_address(&(send_buff[0]), &first_address); + MPI_Get_address(&(send_buff[1]), &second_address); + + MPI_Aint lb = 0; + MPI_Aint extent = MPI_Aint_diff(second_address, first_address); + + MPI_Type_create_resized(Sum_mpi_t, lb, extent, &Sum_array_mpi_t); + + MPI_Request request_send; + + auto next = (prank + 1) % psize; + auto prev = (prank - 1 + psize) % psize; + + for (int p = 0 ; p < psize -1; ++p) { + MPI_Isend(&send, 1, Sum_mpi_t, next, 0, MPI_COMM_WORLD, + &request_send); + MPI_Recv(&recv, 1, Sum_mpi_t, prev, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + sum += recv; + + MPI_Wait(&request_send, MPI_STATUS_IGNORE); + + send = recv; + } + + pi = dx * sum.sum; + + second elapsed = clk::now() - t1; + + if (prank == 0) { + std::printf("computed pi = %.16g\n", pi); + std::printf("sum of ranks = %d\n", sum.rank); + std::printf("wall clock time (chrono) = %.4gs\n", elapsed.count()); + std::printf("n mpi process = %.d\n", psize); + + + for(int d = 1; d <= 15; ++d) { + std::printf("%d", digit(pi, d)); + } + } + + MPI_Type_free(&Sum_mpi_t); + + MPI_Finalize(); + + return 0; +} diff --git a/poisson/solutions-cartesian/AUTHORS b/poisson/solutions-cartesian/AUTHORS new file mode 100644 index 0000000..9253635 --- /dev/null +++ b/poisson/solutions-cartesian/AUTHORS @@ -0,0 +1,4 @@ +Authors: + - Nicolas Richart + - Vincent Keller + . Vittoria Rezzonico diff --git a/poisson/solutions-cartesian/COPYING b/poisson/solutions-cartesian/COPYING new file mode 100644 index 0000000..e551330 --- /dev/null +++ b/poisson/solutions-cartesian/COPYING @@ -0,0 +1,24 @@ +Copyright (c) 2017, Ecole Polytechnique Federal de Lausanne - Scientific IT and +Application Support (SCITAS) All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/poisson/solutions-cartesian/Makefile b/poisson/solutions-cartesian/Makefile new file mode 100644 index 0000000..77a28f6 --- /dev/null +++ b/poisson/solutions-cartesian/Makefile @@ -0,0 +1,21 @@ +CXX=mpic++ +#CXX=g++ +LD=${CXX} +#VERIFY=-fsanitize=thread +VERIFY= -D_DEBUG -D_DEBUG_MPI +OPENMP=-fopenmp +NOOPENMP=-fno-openmp -Wno-unknown-pragmas +#OPTIM+=-O3 -D_MPI $(NOOPENMP) +OPTIM+=-O3 -D_MPI $(NOOPENMP) -march=native +CXXFLAGS+=-Wall -Wextra -Werror -pedantic -std=c++14 -g $(OPTIM) $(VERIFY) +LDFLAGS+=-lm $(CXXFLAGS) + +OBJS=poisson.o grid.o double_buffer.o simulation.o dumpers.o + +all: poisson + +poisson: $(OBJS) + $(LD) -o $@ $(OBJS) $(LDFLAGS) + +clean: + rm -f hello poisson *.o *~ diff --git a/poisson/solutions-cartesian/double_buffer.cc b/poisson/solutions-cartesian/double_buffer.cc new file mode 100644 index 0000000..978eda5 --- /dev/null +++ b/poisson/solutions-cartesian/double_buffer.cc @@ -0,0 +1,31 @@ +/* -------------------------------------------------------------------------- */ +#include "double_buffer.hh" +#include "grid.hh" +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +DoubleBuffer::DoubleBuffer(int m, int n) + : m_current(new Grid(m, n)), m_old(new Grid(m, n)) {} + +/* -------------------------------------------------------------------------- */ +Grid & DoubleBuffer::current() { return *m_current; } + +/* -------------------------------------------------------------------------- */ +Grid & DoubleBuffer::old() { return *m_old; } + +/* -------------------------------------------------------------------------- */ +const Grid & DoubleBuffer::current() const { return *m_current; } + +/* -------------------------------------------------------------------------- */ +const Grid & DoubleBuffer::old() const { return *m_old; } + +/* -------------------------------------------------------------------------- */ +void DoubleBuffer::swap() { + m_current.swap(m_old); +} + +/* -------------------------------------------------------------------------- */ +void DoubleBuffer::resize(int m, int n) { + m_current->resize(m, n); + m_old->resize(m, n); +} diff --git a/poisson/solutions-cartesian/double_buffer.hh b/poisson/solutions-cartesian/double_buffer.hh new file mode 100644 index 0000000..1d21b5c --- /dev/null +++ b/poisson/solutions-cartesian/double_buffer.hh @@ -0,0 +1,28 @@ +#ifndef DOUBLE_BUFFER +#define DOUBLE_BUFFER + +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ +#include "grid.hh" +/* -------------------------------------------------------------------------- */ + +class DoubleBuffer { +public: + DoubleBuffer(int m = 0 , int n = 0); + + void resize(int m, int n); + Grid & current(); + Grid & old(); + + const Grid & current() const; + const Grid & old() const; + + void swap(); +private: + + std::unique_ptr m_current; + std::unique_ptr m_old; +}; + 
+#endif /* DOUBLE_BUFFER */ diff --git a/poisson/solutions-cartesian/dumpers.cc b/poisson/solutions-cartesian/dumpers.cc new file mode 100644 index 0000000..c626753 --- /dev/null +++ b/poisson/solutions-cartesian/dumpers.cc @@ -0,0 +1,205 @@ +/* -------------------------------------------------------------------------- */ +#include "dumpers.hh" +#include "grid.hh" +/* -------------------------------------------------------------------------- */ +#include +#include +#include +#include +#include +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +void Dumper::set_min(float min) { m_min = min; } + +void Dumper::set_max(float max) { m_max = max; } + +float Dumper::min() const { return m_min; } + +float Dumper::max() const { return m_max; } + +/* -------------------------------------------------------------------------- */ +void DumperASCII::dump(const Grid & grid, int step) { + std::ofstream fout; + std::stringstream sfilename; + + sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".pgm"; + fout.open(sfilename.str()); + + int m = grid.m(); + int n = grid.n(); + + fout << "P2" << std::endl << "# CREATOR: Poisson program" << std::endl; + fout << m << " " << n << std::endl; + fout << 255 << std::endl; + + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + int v = 255. * (grid(i, j) - m_min) / (m_max - m_min); + v = std::min(v, 255); + fout << v << std::endl; + } + } +} + +/* -------------------------------------------------------------------------- */ +void DumperBinary::dump(const Grid & grid, int step) { + std::stringstream sfilename; + + std::array coords{0, 0}, dims{0, 0}; + + int prank{0}, psize{1}; + MPI_Comm_rank(m_communicator, &prank); + MPI_Comm_size(m_communicator, &psize); + + std::array periods{0, 0}; + MPI_Cart_get(m_communicator, dims.size(), dims.data(), periods.data(), coords.data()); + + sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".bmp"; + + int h = grid.m(); + int w = grid.n(); + int global_w = w; + + std::array starts = {0, 0}; + std::array local_sizes = {h, w}; + std::array global_sizes = local_sizes; + + // removing the ghosts + if(prank == 0) std::cout << dims[0] << " " << dims[1] << "\n"; + + for (std::size_t i = 0; i < local_sizes.size(); ++i) { + if (dims[i] > 1) { + local_sizes[i] -= (coords[i] == 0 || coords[i] == dims[i] - 1 ? 1 : 2); + starts[i] += (coords[i] == 0 ? 
0 : 1); + } + } + + std::array comms; + + // determining the local offset + std::array offset = {0, 0}; + + std::vector> line_col_size(2); + + for (size_t i = 0; i < line_col_size.size(); ++i) { + std::array remain_dims = {false, false}; + remain_dims[i] = true; + MPI_Cart_sub(m_communicator, remain_dims.data(), &(comms[i])); + + line_col_size[i].resize(dims[i]); + MPI_Allgather(&local_sizes[i], 1, MPI_INT, line_col_size[i].data(), 1, + MPI_INT, comms[i]); + + global_sizes[i] = 0; + for (int j = 0; j < dims[i]; ++j) { + global_sizes[i] += line_col_size[i][j]; + } + } + + //for (int i = dims[0] - 1; i > coords[0]; --i) { + for (int i = 0; i < coords[0]; ++i) { + offset[0] += line_col_size[0][i]; + } + + for (int i = 0; i < coords[1]; ++i) { + offset[1] += line_col_size[1][i]; + } + + global_w = 4 * ((24 * global_sizes[1] + 31) / 32); + + // if the file width (3*w) is not a multiple of 4 adds enough bytes to make it + // a multiple of 4 + int padding = global_w - 3 * global_sizes[1]; + + w = 3 * local_sizes[1]; + + if (coords[1] == dims[1] - 1) { + w += padding; + } else { + padding = 0; + } + + int filesize = 54 + global_sizes[0] * global_w; + + std::vector img(local_sizes[0] * w); + std::fill(img.begin(), img.end(), 0); + + for (int i = 0; i < local_sizes[0]; i++) { + for (int j = 0; j < local_sizes[1]; j++) { + float v = + ((grid(i + starts[0], j + starts[1]) - m_min) / (m_max - m_min)); + + float r = v * 255; // Red channel + float g = v * 255; // Green channel + float b = prank * 255 / psize; // Red channel + + r = std::min(r, 255.f); + g = std::min(g, 255.f); + b = std::min(b, 255.f); + + img[w * i + 3 * j + 0] = r; + img[w * i + 3 * j + 1] = g; + img[w * i + 3 * j + 2] = b; + } + } + + auto write_bytes = [](auto && array, auto && value) { + array[0] = value; + array[1] = value >> 8; + array[2] = value >> 16; + array[3] = value >> 24; + }; + + std::array bmpfileheader = {'B', 'M', 0, 0, 0, 0, 0, + 0, 0, 0, 54, 0, 0, 0}; + std::array bmpinfoheader = {40, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 24, 0}; + + write_bytes(&bmpfileheader[2], filesize); + + write_bytes(&bmpinfoheader[4], global_sizes[1]); + write_bytes(&bmpinfoheader[8], global_sizes[0]); + write_bytes(&bmpinfoheader[20], filesize - 54); + + MPI_File fh; + MPI_Status status; + MPI_Datatype subarray_t; + +#if defined(_DEBUG_MPI) + for (int i = 0; i < psize; ++i) { + if (prank == i) + std::cerr << prank << " (" << coords[0] << ", " << coords[1] << ")" + << " - " << local_sizes[0] << "x" << local_sizes[1] << "(" << w + << ") + " << offset[0] << "x" << offset[1] << "("<< 3*offset[1]<<") ++ [" + << global_sizes[0] << "x" << global_sizes[1] << "(" << global_w + << ")] " << padding << "\n"; + MPI_Barrier(m_communicator); + } +#endif + + global_sizes[1] = global_w; + local_sizes[1] = w; + offset[1] *= 3; + + MPI_Type_create_subarray(global_sizes.size(), global_sizes.data(), + local_sizes.data(), offset.data(), MPI_ORDER_C, + MPI_CHAR, &subarray_t); + MPI_Type_commit(&subarray_t); + + // opening a file in write and create mode + MPI_File_open(MPI_COMM_WORLD, sfilename.str().c_str(), + MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); + // defining the size of the file + MPI_File_set_size(fh, filesize); + + // rank 0 writes the header + if (prank == 0) { + MPI_File_write_at(fh, 0, bmpfileheader.data(), 14, MPI_CHAR, &status); + MPI_File_write_at(fh, 14, bmpinfoheader.data(), 40, MPI_CHAR, &status); + } + + MPI_File_set_view(fh, 54, MPI_CHAR, subarray_t, "native", MPI_INFO_NULL); + MPI_File_write(fh, img.data(), img.size(), 
MPI_CHAR, &status); + MPI_File_close(&fh); +} diff --git a/poisson/solutions-cartesian/dumpers.hh b/poisson/solutions-cartesian/dumpers.hh new file mode 100644 index 0000000..dc9b013 --- /dev/null +++ b/poisson/solutions-cartesian/dumpers.hh @@ -0,0 +1,58 @@ +#if defined(_MPI) +#include +#endif + +#ifndef DUMPERS_HH +#define DUMPERS_HH + +/* -------------------------------------------------------------------------- */ +class Grid; + +/* -------------------------------------------------------------------------- */ +class Dumper { +public: +#if defined(_MPI) + explicit Dumper(MPI_Comm communicator) + : m_min(-1.), m_max(1.), m_communicator(communicator) {} +#else + Dumper() : m_min(-1.), m_max(1.) {} +#endif + virtual void dump(const Grid & m_grid, int step) = 0; + + void set_min(float min); + void set_max(float max); + + float min() const; + float max() const; + +protected: + float m_min, m_max; +#if defined(_MPI) + MPI_Comm m_communicator; +#endif +}; + +/* -------------------------------------------------------------------------- */ +class DumperASCII : public Dumper { +public: +#if defined(_MPI) + explicit DumperASCII(MPI_Comm communicator) : Dumper(communicator) {} +#else + DumperASCII() : Dumper() {} +#endif + + virtual void dump(const Grid & grid, int step); +}; + +/* -------------------------------------------------------------------------- */ +class DumperBinary : public Dumper { +public: +#if defined(_MPI) + explicit DumperBinary(MPI_Comm communicator) : Dumper(communicator) {} +#else + DumperBinary() : Dumper() {} +#endif + virtual void dump(const Grid & grid, int step); +}; + +#endif /* DUMPERS_HH */ diff --git a/poisson/solutions-cartesian/grid.cc b/poisson/solutions-cartesian/grid.cc new file mode 100644 index 0000000..f507a41 --- /dev/null +++ b/poisson/solutions-cartesian/grid.cc @@ -0,0 +1,116 @@ +/* -------------------------------------------------------------------------- */ +#include "grid.hh" +/* -------------------------------------------------------------------------- */ +#include +#include +#if defined(_DEBUG_MPI) +#include +#endif +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +Grid::Grid(int m, int n) : m_m(m), m_n(n), m_storage(m * n) { clear(); } + +/* -------------------------------------------------------------------------- */ +Grid::~Grid() { + MPI_Type_free(&column_t); + MPI_Type_free(&line_t); +} + +/* -------------------------------------------------------------------------- */ +void Grid::clear() { std::fill(m_storage.begin(), m_storage.end(), 0.); } + +/* -------------------------------------------------------------------------- */ +void Grid::resize(int m, int n) { + m_m = m; + m_n = n; + m_storage.resize(m * n); +} + +/* -------------------------------------------------------------------------- */ +int Grid::m() const { return m_m; } +int Grid::n() const { return m_n; } + +/* -------------------------------------------------------------------------- */ +void Grid::initializeHaloCommunications(MPI_Comm & m_communicator, + const std::array & ghosts) { + std::array coords; + int prank; + + MPI_Type_vector(m_m - ghosts[0], 1, m_n, MPI_FLOAT, &column_t); + MPI_Type_vector(1, m_n - ghosts[1], 1, MPI_FLOAT, &line_t); + + MPI_Type_commit(&column_t); + MPI_Type_commit(&line_t); + + MPI_Comm_rank(m_communicator, &prank); + MPI_Cart_coords(m_communicator, prank, coords.size(), coords.data()); + + // determining the rank of the neighbors + int left, 
right, top, bottom; + MPI_Cart_shift(m_communicator, 1, 1, &left, &right); + MPI_Cart_shift(m_communicator, 0, 1, &top, &bottom); + +#if defined(_DEBUG_MPI) + static bool first = true; + if (first) { + int psize; + MPI_Comm_size(m_communicator, &psize); + for (int i = 0; i < psize; ++i) { + if (prank == i) { + std::cerr << prank << std::setw(7) << top << "\n" + << prank << std::setw(8) << "^\n" + << prank << std::setw(3) << left << " < . > " << right << "\n" + << prank << std::setw(8) << "v\n" + << prank << std::setw(7) << bottom << "\n\n"; + } + MPI_Barrier(m_communicator); + } + first = false; + } +#endif + + auto addr = [&](auto && i, auto && j) { return &(this->operator()(i, j)); }; + + MPI_Request * rrequest = m_requests.data(); + MPI_Request * srequest = m_requests.data() + 4; + + static int tag = 1; + /// preparing receives + { + auto send = addr(m_m - 2, 1); + auto recv = addr(0, 1); + MPI_Send_init(send, 1, line_t, bottom, tag, m_communicator, srequest); + MPI_Recv_init(recv, 1, line_t, top, tag, m_communicator, rrequest); + ++rrequest; + ++srequest; + ++tag; + } + { + auto send = addr(1, m_n - 2); + auto recv = addr(1, 0); + MPI_Send_init(send, 1, column_t, right, tag, m_communicator, srequest); + MPI_Recv_init(recv, 1, column_t, left, tag, m_communicator, rrequest); + ++rrequest; + ++srequest; + ++tag; + } + { + auto send = addr(1, 1); + auto recv = addr(m_m - 1, 1); + MPI_Send_init(send, 1, line_t, top, tag, m_communicator, srequest); + MPI_Recv_init(recv, 1, line_t, bottom, tag, m_communicator, rrequest); + ++rrequest; + ++srequest; + ++tag; + } + { + auto send = addr(1, 1); + auto recv = addr(1, m_n - 1); + MPI_Send_init(send, 1, column_t, left, tag, m_communicator, srequest); + MPI_Recv_init(recv, 1, column_t, right, tag, m_communicator, rrequest); + ++rrequest; + ++srequest; + ++tag; + } +} diff --git a/poisson/solutions-cartesian/grid.hh b/poisson/solutions-cartesian/grid.hh new file mode 100644 index 0000000..96e6e48 --- /dev/null +++ b/poisson/solutions-cartesian/grid.hh @@ -0,0 +1,73 @@ +#ifndef GRID_HH +#define GRID_HH + +/* -------------------------------------------------------------------------- */ +#include +#include +#include +#include +/* -------------------------------------------------------------------------- */ +namespace { +struct DEBUG { +#if defined(DEBUG) + static void print(const std::string & func) { + std::cerr << func << "\n"; + } +#else + static void print(const std::string & ) { + + } +#endif +}; +} // namespace + +/* -------------------------------------------------------------------------- */ + +class Grid { +public: + Grid(int m = 0, int n = 0); + ~Grid(); + + /// access the value [i][j] of the grid + inline float & operator()(int i, int j) { return m_storage[i * m_n + j]; } + inline const float & operator()(int i, int j) const { + return m_storage[i * m_n + j]; + } + + void resize(int m, int n); + + /// set the grid to 0 + void clear(); + + int m() const; + int n() const; + + void initializeHaloCommunications(MPI_Comm & m_communicator, + const std::array & ghosts); + void startSynchronization() { + MPI_Request * requests = m_requests.data(); + MPI_Startall(8, requests); + } + + void waitReceives() { + auto requests = m_requests.data(); + MPI_Waitall(4, requests, MPI_STATUS_IGNORE); + } + + void waitSends() { + auto requests = m_requests.data(); + MPI_Waitall(4, requests + 4, MPI_STATUS_IGNORE); + } + +private: + int m_m, m_n; + std::vector m_storage; + + /// request for asynchronous communications + std::array m_requests; + + /// communication 
types + MPI_Datatype column_t, line_t; +}; + +#endif /* GRID_HH */ diff --git a/poisson/solutions-cartesian/optim.sub b/poisson/solutions-cartesian/optim.sub new file mode 100644 index 0000000..3e4440e --- /dev/null +++ b/poisson/solutions-cartesian/optim.sub @@ -0,0 +1,35 @@ +#!/bin/bash + +##SBATCH --reservation=phpc2017 +##SBATCH --account=phpc2017 +#SBATCH -N 1 +#SBATCH -n 1 + +OPTIMS="-O0 -O1 -O2 -O3" +N=512 + + +module load gcc + +echo gcc > gcc.out +for _opt in ${OPTIMS} '-O3 -ftree-vectorize' +do + make clean + make CXXFLAGS="-std=c++11 ${_opt}" CXX=g++ + ./poisson $N| cut -f4 -d " " >> gcc.out +done + + +module load intel + +echo icpc > intel.out +for _opt in ${OPTIMS} '-O3 -xHOST' +do + make clean + make CXXFLAGS="-std=c++11 ${_opt}" CXX=icpc + ./poisson $N | cut -f4 -d " " >> intel.out +done + +echo " ${OPTIMS} specific" | tr " " "\n" > header + +paste header gcc.out intel.out > compilers.out diff --git a/poisson/solutions-cartesian/poisson.cc b/poisson/solutions-cartesian/poisson.cc new file mode 100644 index 0000000..95f843b --- /dev/null +++ b/poisson/solutions-cartesian/poisson.cc @@ -0,0 +1,73 @@ +/* -------------------------------------------------------------------------- */ +#include "simulation.hh" +/* -------------------------------------------------------------------------- */ +#include +#include +#include +#include +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ + +#define EPSILON 0.05 + +typedef std::chrono::high_resolution_clock clk; +typedef std::chrono::duration second; + +static void usage(const std::string & prog_name) { + std::cerr << prog_name << " " << std::endl; + exit(0); +} + +int main(int argc, char * argv[]) { + int prank{0}, psize{1}, nthreads{1}; + MPI_Init(NULL, NULL); + // assert(provided == MPI_THREAD_MULTIPLE); + + MPI_Comm_rank(MPI_COMM_WORLD, &prank); + MPI_Comm_size(MPI_COMM_WORLD, &psize); + + if (argc != 2) + usage(argv[0]); + + std::stringstream args(argv[1]); + int N; + args >> N; + + if (args.fail()) + usage(argv[0]); + + std::unique_ptr simu; + + simu = std::make_unique(N, N, MPI_COMM_WORLD); + + simu->set_initial_conditions(); + simu->set_epsilon(EPSILON); + + float l2; + int k; + + auto start = clk::now(); + std::tie(l2, k) = simu->compute(); + auto end = clk::now(); + + second time = end - start; + + auto min = simu->min(); + auto max = simu->max(); + + if (prank == 0) + std::cout << "MPI: " << psize << " (" << simu->dims(0) << ", " << simu->dims(1) << ")\n" + << "OMP: " << nthreads << "\n" + << "Grid: " << N << "x" << N << "\n" + << "Bound: " << "[ " << min << " : " << max << " ]" << "\n" + << "Iter: " << k << "\n" + << "Norm: " << std::scientific << l2 << "\n" + << "Time: " << time.count() << "s" << "\n" + << "Flops: " << simu->complexity()*k/time.count()/1e9 << " GFlops" << std::endl; + + simu.release(); // To force destruction of MPI related stuff + MPI_Finalize(); + + return 0; +} diff --git a/poisson/solutions-cartesian/poisson.sub b/poisson/solutions-cartesian/poisson.sub new file mode 100644 index 0000000..aa43198 --- /dev/null +++ b/poisson/solutions-cartesian/poisson.sub @@ -0,0 +1,31 @@ +#!/bin/bash + +##SBATCH --reservation=phpc2017 +##SBATCH --account=phpc2017 +#SBATCH -N 1 +#SBATCH -n 1 + +module load gcc + +rm -f complexity.dat + +for i in 128 256 512 1024 +do + ./poisson $i >> complexity.dat +done + +slmodules -r deprecated +module load gnuplot + +gnuplot < +#include +#include +#include +#include 
+ +/* -------------------------------------------------------------------------- */ +Simulation::Simulation(int m, int n, MPI_Comm communicator) { + + // retrieving the number of proc and the rank in the proc pool + MPI_Comm_size(communicator, &m_psize); + + m_dims = {0, 0}; + std::array periods{0, 0}; + MPI_Dims_create(m_psize, m_dims.size(), m_dims.data()); + + assert(m_dims[0] * m_dims[1] == m_psize && "MPI_Dims_create is fucked up"); + + MPI_Cart_create(communicator, m_dims.size(), m_dims.data(), periods.data(), 0, + &m_communicator); + + MPI_Comm_rank(m_communicator, &m_prank); + + MPI_Cart_coords(m_communicator, m_prank, m_coords.size(), m_coords.data()); + + m_dumper = std::make_unique(m_communicator); + + m_global_size = {m, n}; + + std::array ghosts{0, 0}; + + for (std::size_t i = 0; i < m_local_size.size(); ++i) { + // computation of the local size of the grid the remainder is spread equally + // on the first processors + m_local_size[i] = m_global_size[i] / m_dims[i] + + (m_coords[i] < m_global_size[i] % m_dims[i] ? 1 : 0); + + // adding the ghosts lines if needed + if (m_dims[i] > 1) + ghosts[i] = (m_coords[i] == 0 || m_coords[i] == m_dims[i] - 1) ? 1 : 2; + + m_local_size[i] += ghosts[i]; + + // computing the offsets of the local grid in the global one + m_offset[i] = (m_global_size[i] / m_dims[i]) * m_coords[i] + + (m_coords[i] < m_global_size[i] % m_dims[i] + ? m_coords[i] + : m_global_size[i] % m_dims[i]); + + m_h[i] = 1. / m_global_size[i]; + } + +#if defined(_DEBUG_MPI) + for (int i = 0; i < m_psize; ++i) { + if (m_prank == i) + std::cerr << m_prank << " (" << m_coords[0] << ", " << m_coords[1] << ")" + << " - " << m_local_size[0] << "x" << m_local_size[1] << " + " + << m_offset[0] << "x" << m_offset[1] << " ++ [" << ghosts[0] + << "x" << ghosts[1] << "]\n"; + MPI_Barrier(m_communicator); + } +#endif + + // resizing the different grids + m_grids.resize(m_local_size[0], m_local_size[1]); + m_f.resize(m_local_size[0], m_local_size[1]); + + m_grids.current().initializeHaloCommunications(m_communicator, ghosts); + m_grids.old().initializeHaloCommunications(m_communicator, ghosts); +} + +/* -------------------------------------------------------------------------- */ +void Simulation::set_initial_conditions() { + std::array start = {0, 0}; + std::array end = m_local_size; + + for (int i = 0; i < 2; ++i) { + start[i] += (m_coords[i] == 0 ? 0 : 1); + end[i] -= (m_coords[i] == m_dims[i] - 1 ? 0 : 1); + } + + float min = 0., max = 0.; + float a = 5.5 * M_PI; + for (int i = start[0]; i < end[0]; i++) { + for (int j = start[1]; j < end[1]; j++) { + m_f(i, j) = + -2. 
* a * a * std::sin(a * (m_offset[0] + i - start[0]) * m_h[0]) * + std::sin(a * (m_offset[1] + j - start[1]) * m_h[1]) * m_h[0] * m_h[1]; + + min = std::min(min, m_f(i, j)); + max = std::max(max, m_f(i, j)); + } + } + + m_dumper->set_min(min); + m_dumper->set_max(max); + m_dumper->dump(m_f, 1); +} + +/* -------------------------------------------------------------------------- */ +std::tuple Simulation::compute() { + int s = 0; + + // std::cout << m_h_m << " " << m_h_n << " " << m_epsilon << std::endl; + float l2 = 0.; + + do { + l2 = compute_step(); + DEBUG::print(std::to_string(m_prank) + "do_step: " + std::to_string(l2)); + + m_grids.swap(); + + MPI_Allreduce(MPI_IN_PLACE, &l2, 1, MPI_FLOAT, MPI_SUM, m_communicator); + DEBUG::print(std::to_string(m_prank) + " MPI_Allreduce: " + std::to_string(l2) + " / " + std::to_string(m_epsilon)); + ++s; + + } while (l2 > m_epsilon); + + m_dumper->set_min(min()); + m_dumper->set_max(max()); + m_dumper->dump(m_grids.old(), 0); + + return std::make_tuple(std::sqrt(l2), s); +} + +/* -------------------------------------------------------------------------- */ +void Simulation::set_epsilon(float epsilon) { m_epsilon = epsilon * epsilon; } + +/* -------------------------------------------------------------------------- */ +float Simulation::epsilon() const { return std::sqrt(m_epsilon); } + +/* -------------------------------------------------------------------------- */ +inline float Simulation::compute_row(int i) { + float l2 = 0.; + + Grid & u = m_grids.current(); + Grid & uo = m_grids.old(); + + for (int j = 1; j < m_local_size[1] - 1; j++) { + // computation of the new step + u(i, j) = (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) + uo(i, j + 1) - + m_f(i, j)) / + 4.; + + // L2 norm + l2 += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j)); + } + + return l2; +} + +/* -------------------------------------------------------------------------- */ +float Simulation::compute_step() { + float l2 = 0.; + + auto & uo = m_grids.old(); + auto & u = m_grids.current(); + + auto _compute = [&](auto && i, auto && j) { + u(i, j) = (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) + uo(i, j + 1) - + m_f(i, j)) / + 4.; + + l2 += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j)); + }; + + // start the permanent communications + uo.startSynchronization(); + + // computing the inner rows that do not depend on the ghosts + for (int i = 2; i < m_local_size[0] - 2; i++) { + for (int j = 2; j < m_local_size[1] - 2; j++) { + _compute(i, j); + } + } + + /// wait to receive the ghosts before using them for them computation + uo.waitReceives(); + + for (int i = 2; i < m_local_size[1] - 2; i++) { + _compute(1, i); + } + + for (int i = 2; i < m_local_size[1] - 2; i++) { + _compute(m_local_size[0] - 2, i); + } + + for (int i = 2; i < m_local_size[0] - 2; i++) { + _compute(i, 1); + } + + for (int i = 2; i < m_local_size[0] - 2; i++) { + _compute(i, m_local_size[1] - 2); + } + + /// wait to send everything before changing the buffers + uo.waitSends(); + + return l2; +} + +namespace { +template +auto accumulate_grid(const G & grid, float init, Func && func) { + auto m = grid.m(); + auto n = grid.n(); + + for (decltype(m) i = 1; i < m - 1; i++) { + for (decltype(n) j = 1; j < n - 1; j++) { + init = func(init, grid(i, j)); + } + } + return init; +} +} // namespace + +/* -------------------------------------------------------------------------- */ +float Simulation::min() const { + const auto & u0 = m_grids.old(); + + auto tmp = accumulate_grid( + u0, u0(1, 1), [](auto && a, auto && b) { return std::min(a, 
b); }); + + MPI_Allreduce(MPI_IN_PLACE, &tmp, 1, MPI_FLOAT, MPI_MIN, m_communicator); + + return tmp; +} + +/* -------------------------------------------------------------------------- */ +float Simulation::max() const { + const auto & u0 = m_grids.old(); + + auto tmp = accumulate_grid( + u0, u0(1, 1), [](auto && a, auto && b) { return std::max(a, b); }); + + MPI_Allreduce(MPI_IN_PLACE, &tmp, 1, MPI_FLOAT, MPI_MAX, m_communicator); + + return tmp; +} + +/* -------------------------------------------------------------------------- */ diff --git a/poisson/solutions-cartesian/simulation.hh b/poisson/solutions-cartesian/simulation.hh new file mode 100644 index 0000000..5ff0a50 --- /dev/null +++ b/poisson/solutions-cartesian/simulation.hh @@ -0,0 +1,85 @@ +#ifndef SIMULATION_HH +#define SIMULATION_HH + +/* -------------------------------------------------------------------------- */ +#include "double_buffer.hh" +#include "dumpers.hh" +#include "grid.hh" +/* -------------------------------------------------------------------------- */ +#include +#include +#include +#include +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +class Simulation { +public: + Simulation(int m, int n, MPI_Comm communicator); + + /// set the initial conditions, Dirichlet and source term + virtual void set_initial_conditions(); + + /// perform the simulation + std::tuple compute(); + + /// access the precision + void set_epsilon(float epsilon); + float epsilon() const; + + std::size_t complexity() { return 8 * m_global_size[0] * m_global_size[1]; } + + float min() const; + float max() const; + + float dims(std::size_t i) const { return m_dims[i]; } + +protected: + /// compute one step and return an error + virtual float compute_step(); + + /// compute one row + float compute_row(int i); + +private: + /// Global problem size + std::array m_global_size; + + /// Local problem size + std::array m_local_size; + + /// local offset + std::array m_offset; + + /// Cartesian dimensions + std::array m_dims; + + /// Cartesian coordinates + std::array m_coords; + + /// proc rank + int m_prank{0}; + + /// communicator size + int m_psize{1}; + + /// Precision to achieve + float m_epsilon; + + /// grid spacing + std::array m_h; + + /// Grids storage + DoubleBuffer m_grids; + + /// source term + Grid m_f; + + /// Dumper to use for outputs + std::unique_ptr m_dumper; + + /// communicator + MPI_Comm m_communicator; +}; + +#endif /* SIMULATION_HH */ diff --git a/poisson/solutions-persistent/Makefile b/poisson/solutions-persistent/Makefile new file mode 100644 index 0000000..77b7a64 --- /dev/null +++ b/poisson/solutions-persistent/Makefile @@ -0,0 +1,14 @@ +CXX=mpic++ +LD=${CXX} +CXXFLAGS+=-Wall -Wextra -Werror -pedantic -std=c++11 -O2 -g3 +LDFLAGS+=-lm $(CXXFLAGS) + +OBJS=poisson.o grid.o double_buffer.o simulation.o dumpers.o + +all: poisson + +poisson: $(OBJS) + $(LD) -o $@ $(OBJS) $(LDFLAGS) + +clean: + rm -f hello poisson *.o *~ diff --git a/poisson/solutions-persistent/compile_commands.json b/poisson/solutions-persistent/compile_commands.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/poisson/solutions-persistent/compile_commands.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/poisson/solutions-persistent/double_buffer.cc b/poisson/solutions-persistent/double_buffer.cc new file mode 100644 index 0000000..67749d0 --- /dev/null +++ b/poisson/solutions-persistent/double_buffer.cc @@ -0,0 
+1,25 @@ +/* -------------------------------------------------------------------------- */ +#include "double_buffer.hh" +#include "grid.hh" +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +DoubleBuffer::DoubleBuffer(int m, int n) + : m_current(new Grid(m, n)), m_old(new Grid(m, n)) {} + +/* -------------------------------------------------------------------------- */ +Grid & DoubleBuffer::current() { return *m_current; } + +/* -------------------------------------------------------------------------- */ +Grid & DoubleBuffer::old() { return *m_old; } + +/* -------------------------------------------------------------------------- */ +void DoubleBuffer::swap() { + m_current.swap(m_old); +} + +/* -------------------------------------------------------------------------- */ +void DoubleBuffer::resize(int m, int n) { + m_current->resize(m, n); + m_old->resize(m, n); +} diff --git a/poisson/solutions-persistent/double_buffer.hh b/poisson/solutions-persistent/double_buffer.hh new file mode 100644 index 0000000..a05410b --- /dev/null +++ b/poisson/solutions-persistent/double_buffer.hh @@ -0,0 +1,25 @@ +#ifndef DOUBLE_BUFFER +#define DOUBLE_BUFFER + +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ +#include "grid.hh" +/* -------------------------------------------------------------------------- */ + +class DoubleBuffer { +public: + DoubleBuffer(int m = 0 , int n = 0); + + void resize(int m, int n); + Grid & current(); + Grid & old(); + + void swap(); +private: + + std::unique_ptr m_current; + std::unique_ptr m_old; +}; + +#endif /* DOUBLE_BUFFER */ diff --git a/poisson/solutions-persistent/dumpers.cc b/poisson/solutions-persistent/dumpers.cc new file mode 100644 index 0000000..3dc9abb --- /dev/null +++ b/poisson/solutions-persistent/dumpers.cc @@ -0,0 +1,160 @@ +/* -------------------------------------------------------------------------- */ +#include "dumpers.hh" +#include "grid.hh" +/* -------------------------------------------------------------------------- */ +#include +#include +#include +#include +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +void Dumper::set_min(float min) { m_min = min; } + +void Dumper::set_max(float max) { m_max = max; } + +float Dumper::min() const { return m_min; } + +float Dumper::max() const { return m_max; } + +/* -------------------------------------------------------------------------- */ +void DumperASCII::dump(int step) { + std::ofstream fout; + std::stringstream sfilename; + + sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".pgm"; + fout.open(sfilename.str()); + + int m = m_grid.m(); + int n = m_grid.n(); + + fout << "P2" << std::endl << "# CREATOR: Poisson program" << std::endl; + fout << m << " " << n << std::endl; + fout << 255 << std::endl; + + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + int v = 255. 
* (m_grid(i, j) - m_min) / (m_max - m_min); + v = std::min(v, 255); + fout << v << std::endl; + } + } +} + +/* -------------------------------------------------------------------------- */ +void DumperBinary::dump(int step) { + std::ofstream fout; + std::stringstream sfilename; + + int prank, psize; + MPI_Comm_rank(m_communicator, &prank); + MPI_Comm_size(m_communicator, &psize); + + sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".bmp"; + fout.open(sfilename.str(), std::ios_base::binary); + + int h = m_grid.m(); + int w = m_grid.n(); + + // removing the ghosts from the height + if (psize > 1) + h = (prank == 0 || prank == psize - 1 ? h - 1 : h - 2); + int start = (prank == 0 ? 0 : 1); + + // Gathering the size of every processors, this could be done as in the + // constructor of the Simulation instead + std::vector size_per_proc(psize); + MPI_Allgather(&h, 1, MPI_INT, size_per_proc.data(), 1, MPI_INT, + m_communicator); + + // determining the local offset + int offset_h = 0; + for (int i = 0; i < prank; ++i) { + offset_h += size_per_proc[i]; + } + + int total_h = offset_h; + for (int i = prank; i < psize; ++i) { + total_h += size_per_proc[i]; + } + + // gathering all the data on the root processor + std::vector buffer; + if (prank == 0) { + std::vector displs(psize + 1); + displs[0] = 0; + for (int i = 0; i < psize; ++i) { + size_per_proc[i] *= w; + displs[i + 1] = displs[i] + size_per_proc[i]; + } + + buffer.resize(displs[psize]); + + MPI_Gatherv(&m_grid(start, 0), h * w, MPI_FLOAT, + buffer.data(), size_per_proc.data(), displs.data(), MPI_FLOAT, 0, m_communicator); + } else { + MPI_Gatherv(&m_grid(start, 0), h * w, MPI_FLOAT, + NULL, NULL, NULL, MPI_FLOAT, 0, m_communicator); + + return; + } + + // only processor 0 continues from here + + int row_size = 3 * w; + // if the file width (3*w) is not a multiple of 4 adds enough bytes to make it + // a multiple of 4 + int padding = (4 - (row_size) % 4) % 4; + row_size += padding; + + int filesize = 54 + (row_size)*total_h; + + std::vector img(row_size * total_h); + std::fill(img.begin(), img.end(), 0); + + for (int i = 0; i < total_h; i++) { + for (int j = 0; j < w; j++) { + float v = ((buffer[(total_h - 1 - i)*w + j] - m_min) / (m_max - m_min)); + + float r = v * 255; // Red channel + float g = v * 255; // Green channel + float b = v * 255; // Red channel + + r = std::min(r, 255.f); + g = std::min(g, 255.f); + b = std::min(b, 255.f); + + img[row_size * i + 3 * j + 2] = r; + img[row_size * i + 3 * j + 1] = g; + img[row_size * i + 3 * j + 0] = b; + } + } + + std::array bmpfileheader = {'B', 'M', 0, 0, 0, 0, 0, + 0, 0, 0, 54, 0, 0, 0}; + std::array bmpinfoheader = {40, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 24, 0}; + + bmpfileheader[2] = filesize; + bmpfileheader[3] = filesize >> 8; + bmpfileheader[4] = filesize >> 16; + bmpfileheader[5] = filesize >> 24; + + bmpinfoheader[4] = w; + bmpinfoheader[5] = w >> 8; + bmpinfoheader[6] = w >> 16; + bmpinfoheader[7] = w >> 24; + bmpinfoheader[8] = total_h; + bmpinfoheader[9] = total_h >> 8; + bmpinfoheader[10] = total_h >> 16; + bmpinfoheader[11] = total_h >> 24; + bmpinfoheader[20] = (filesize - 54); + bmpinfoheader[21] = (filesize - 54) >> 8; + bmpinfoheader[22] = (filesize - 54) >> 16; + bmpinfoheader[23] = (filesize - 54) >> 24; + + fout.write(bmpfileheader.data(), 14); + fout.write(bmpinfoheader.data(), 40); + + fout.write(img.data(), total_h * row_size); +} diff --git a/poisson/solutions-persistent/dumpers.hh b/poisson/solutions-persistent/dumpers.hh new file mode 100644 
index 0000000..27a5f12 --- /dev/null +++ b/poisson/solutions-persistent/dumpers.hh @@ -0,0 +1,48 @@ +#include + +#ifndef DUMPERS_HH +#define DUMPERS_HH + +/* -------------------------------------------------------------------------- */ +class Grid; + +/* -------------------------------------------------------------------------- */ +class Dumper { +public: + explicit Dumper(const Grid & grid, MPI_Comm communicator) + : m_grid(grid), m_min(-1.), m_max(1.), m_communicator(communicator) {} + + virtual void dump(int step) = 0; + + void set_min(float min); + void set_max(float max); + + float min() const; + float max() const; + +protected: + const Grid & m_grid; + float m_min, m_max; + + MPI_Comm m_communicator; +}; + +/* -------------------------------------------------------------------------- */ +class DumperASCII : public Dumper { +public: + explicit DumperASCII(const Grid & grid, MPI_Comm communicator) + : Dumper(grid, communicator) {} + + virtual void dump(int step); +}; + +/* -------------------------------------------------------------------------- */ +class DumperBinary : public Dumper { +public: + explicit DumperBinary(const Grid & grid, MPI_Comm communicator) + : Dumper(grid, communicator) {} + + virtual void dump(int step); +}; + +#endif /* DUMPERS_HH */ diff --git a/poisson/solutions-persistent/grid.cc b/poisson/solutions-persistent/grid.cc new file mode 100644 index 0000000..bd635e9 --- /dev/null +++ b/poisson/solutions-persistent/grid.cc @@ -0,0 +1,24 @@ +/* -------------------------------------------------------------------------- */ +#include "grid.hh" +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +Grid::Grid(int m, int n) : m_m(m), m_n(n), m_storage(m * n) { clear(); } + +/* -------------------------------------------------------------------------- */ +void Grid::clear() { std::fill(m_storage.begin(), m_storage.end(), 0.); } + +/* -------------------------------------------------------------------------- */ +void Grid::resize(int m, int n) { + m_m = m; + m_n = n; + m_storage.resize(m*n); +} + +/* -------------------------------------------------------------------------- */ +int Grid::m() const { return m_m; } +int Grid::n() const { return m_n; } + +/* -------------------------------------------------------------------------- */ diff --git a/poisson/solutions-persistent/grid.hh b/poisson/solutions-persistent/grid.hh new file mode 100644 index 0000000..19c1698 --- /dev/null +++ b/poisson/solutions-persistent/grid.hh @@ -0,0 +1,42 @@ +#ifndef GRID_HH +#define GRID_HH + +/* -------------------------------------------------------------------------- */ +#include +#include +/* -------------------------------------------------------------------------- */ + +class Grid { +public: + Grid(int m = 0, int n = 0); + + ~Grid() { + for (auto & req : requests) { + MPI_Request_free(&req); + } + } + + /// access the value [i][j] of the grid + inline float & operator()(int i, int j) { return m_storage[i * m_n + j]; } + inline const float & operator()(int i, int j) const { + return m_storage[i * m_n + j]; + } + + void resize(int m, int n); + + std::vector & getRequests() { return requests; } + + /// set the grid to 0 + void clear(); + + int m() const; + int n() const; + +private: + int m_m, m_n; + std::vector m_storage; + + std::vector requests; +}; + +#endif /* GRID_HH */ diff --git 
a/poisson/solutions-persistent/optim.sub b/poisson/solutions-persistent/optim.sub new file mode 100644 index 0000000..3e4440e --- /dev/null +++ b/poisson/solutions-persistent/optim.sub @@ -0,0 +1,35 @@ +#!/bin/bash + +##SBATCH --reservation=phpc2017 +##SBATCH --account=phpc2017 +#SBATCH -N 1 +#SBATCH -n 1 + +OPTIMS="-O0 -O1 -O2 -O3" +N=512 + + +module load gcc + +echo gcc > gcc.out +for _opt in ${OPTIMS} '-O3 -ftree-vectorize' +do + make clean + make CXXFLAGS="-std=c++11 ${_opt}" CXX=g++ + ./poisson $N| cut -f4 -d " " >> gcc.out +done + + +module load intel + +echo icpc > intel.out +for _opt in ${OPTIMS} '-O3 -xHOST' +do + make clean + make CXXFLAGS="-std=c++11 ${_opt}" CXX=icpc + ./poisson $N | cut -f4 -d " " >> intel.out +done + +echo " ${OPTIMS} specific" | tr " " "\n" > header + +paste header gcc.out intel.out > compilers.out diff --git a/poisson/solutions-persistent/poisson.cc b/poisson/solutions-persistent/poisson.cc new file mode 100644 index 0000000..d9fbd8d --- /dev/null +++ b/poisson/solutions-persistent/poisson.cc @@ -0,0 +1,66 @@ +/* -------------------------------------------------------------------------- */ +#include "simulation.hh" +/* -------------------------------------------------------------------------- */ +#include +#include +#include +#include +#include +#include +/* -------------------------------------------------------------------------- */ +#include +#include +/* -------------------------------------------------------------------------- */ + +#define EPSILON 0.005 + +typedef std::chrono::high_resolution_clock clk; +typedef std::chrono::duration second; + +static void usage(const std::string & prog_name) { + std::cerr << prog_name << " " << std::endl; + exit(0); +} + + +int main(int argc, char * argv[]) { + MPI_Init(&argc, &argv); + int prank, psize; + + MPI_Comm_rank(MPI_COMM_WORLD, &prank); + MPI_Comm_size(MPI_COMM_WORLD, &psize); + + if (argc != 2) usage(argv[0]); + + std::stringstream args(argv[1]); + int N; + args >> N; + + if(args.fail()) usage(argv[0]); + + auto * simu = new Simulation(N, N, MPI_COMM_WORLD); + + simu->set_initial_conditions(); + + simu->set_epsilon(EPSILON); + + float l2; + int k; + + auto start = clk::now(); + std::tie(l2, k) = simu->compute(); + auto end = clk::now(); + + second time = end - start; + + if(prank == 0) + std::cout << psize << " " << N << " " + << k << " " << std::scientific << l2 << " " + << time.count() << std::endl; + + delete simu; + + MPI_Finalize(); + + return 0; +} diff --git a/poisson/solutions-persistent/poisson.sub b/poisson/solutions-persistent/poisson.sub new file mode 100644 index 0000000..aa43198 --- /dev/null +++ b/poisson/solutions-persistent/poisson.sub @@ -0,0 +1,31 @@ +#!/bin/bash + +##SBATCH --reservation=phpc2017 +##SBATCH --account=phpc2017 +#SBATCH -N 1 +#SBATCH -n 1 + +module load gcc + +rm -f complexity.dat + +for i in 128 256 512 1024 +do + ./poisson $i >> complexity.dat +done + +slmodules -r deprecated +module load gnuplot + +gnuplot < +#include +/* -------------------------------------------------------------------------- */ + +/* -------------------------------------------------------------------------- */ +Simulation::Simulation(int m, int n, MPI_Comm communicator) + : m_global_m(m), m_global_n(n), m_epsilon(1e-7), m_h_m(1. / m), + m_h_n(1. 
diff --git a/poisson/solutions-persistent/simulation.cc b/poisson/solutions-persistent/simulation.cc
new file mode 100644
--- /dev/null
+++ b/poisson/solutions-persistent/simulation.cc
+/* -------------------------------------------------------------------------- */
+#include "simulation.hh"
+/* -------------------------------------------------------------------------- */
+#include <cmath>
+#include <iostream>
+/* -------------------------------------------------------------------------- */
+
+/* -------------------------------------------------------------------------- */
+Simulation::Simulation(int m, int n, MPI_Comm communicator)
+    : m_global_m(m), m_global_n(n), m_epsilon(1e-7), m_h_m(1. / m),
+      m_h_n(1. / n), m_dumper(new DumperBinary(m_grids.old(), communicator)),
+      m_communicator(communicator) {
+
+  // retrieving the number of proc and the rank in the proc pool
+  MPI_Comm_rank(m_communicator, &m_prank);
+  MPI_Comm_size(m_communicator, &m_psize);
+
+  // computation of the local size of the grid; the remainder is spread equally
+  // on the first processors
+  m_local_m = m / m_psize + (m_prank < m % m_psize ? 1 : 0);
+  m_local_n = n;
+
+  // adding the ghost lines if needed
+  if (m_psize > 1)
+    m_local_m += (m_prank == 0 || m_prank == m_psize - 1) ? 1 : 2;
+
+  // computing the offsets of the local grid in the global one
+  m_offset_m =
+      (m / m_psize) * m_prank + (m_prank < m % m_psize ? m_prank : m % m_psize);
+  m_offset_n = 0;
+
+  // resizing the different grids
+  m_grids.resize(m_local_m, m_local_n);
+  m_f.resize(m_local_m, m_local_n);
+
+  // determining the ranks of the neighbors
+  m_north_prank = (m_prank == 0 ? MPI_PROC_NULL : m_prank - 1);
+  m_south_prank = (m_prank == (m_psize - 1) ? MPI_PROC_NULL : m_prank + 1);
+
+  auto create_persistent_comm = [&](Grid & g) {
+    auto & requests = g.getRequests();
+    requests.resize(4);
+
+    // creating the persistent receive requests for the two ghost rows
+    MPI_Recv_init(&g(0, 0), m_local_n, MPI_FLOAT, m_north_prank, 0,
+                  m_communicator, &requests[0]);
+    MPI_Recv_init(&g(m_local_m - 1, 0), m_local_n, MPI_FLOAT, m_south_prank, 0,
+                  m_communicator, &requests[1]);
+
+    // creating the persistent send requests for the two boundary rows
+    MPI_Send_init(&g(1, 0), m_local_n, MPI_FLOAT, m_north_prank, 0,
+                  m_communicator, &requests[2]);
+    MPI_Send_init(&g(m_local_m - 2, 0), m_local_n, MPI_FLOAT, m_south_prank, 0,
+                  m_communicator, &requests[3]);
+  };
+
+  create_persistent_comm(m_grids.old());
+  create_persistent_comm(m_grids.current());
+
+  // Some info if needed to debug
+  // std::cout << m_prank << " " << m_global_m << " " << m_global_n << " "
+  //           << m_local_m << " " << m_local_n << " " << m_offset_m << " "
+  //           << m_offset_n << " " << m_north_prank << " " << m_south_prank
+  //           << std::endl;
+}
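+
+// Layout used by the requests above: row 0 receives the last physical row of
+// the north neighbor and row m_local_m - 1 the first physical row of the
+// south neighbor, while rows 1 and m_local_m - 2 are the ones sent out. Two
+// sets of requests are created, one per grid of the double buffer, because a
+// persistent request is bound to a fixed buffer address; compute_step() then
+// starts the set attached to whichever grid currently plays the role of the
+// old one.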
+
+/* -------------------------------------------------------------------------- */
+void Simulation::set_initial_conditions() {
+  int i_start = (m_prank == 0 ? 0 : 1);
+  int i_end = (m_prank == m_psize - 1 ? m_local_m : m_local_m - 1);
+
+  for (int i = i_start; i < i_end; i++) {
+    for (int j = 0; j < m_local_n; j++) {
+      m_f(i, j) = -2. * 100. * M_PI * M_PI *
+                  std::sin(10. * M_PI * (m_offset_m + i - i_start) * m_h_m) *
+                  std::sin(10. * M_PI * (m_offset_n + j) * m_h_n);
+    }
+  }
+}
+
+/* -------------------------------------------------------------------------- */
+std::tuple<float, int> Simulation::compute() {
+  int s = 0;
+  float l2 = 0;
+  do {
+    l2 = compute_step();
+
+    m_grids.swap();
+
+    // m_dumper->dump(s);
+    ++s;
+  } while (l2 > m_epsilon);
+
+  return std::make_tuple(std::sqrt(l2), s);
+}
+
+/* -------------------------------------------------------------------------- */
+void Simulation::set_epsilon(float epsilon) { m_epsilon = epsilon * epsilon; }
+
+/* -------------------------------------------------------------------------- */
+float Simulation::epsilon() const { return m_epsilon; }
+
+/* -------------------------------------------------------------------------- */
+inline float Simulation::compute_row(int i) {
+  float l2 = 0;
+
+  Grid & u = m_grids.current();
+  Grid & uo = m_grids.old();
+
+  for (int j = 1; j < m_local_n - 1; j++) {
+    // computation of the new step
+    u(i, j) = 0.25 * (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) +
+                      uo(i, j + 1) - m_f(i, j) * m_h_m * m_h_n);
+
+    // L2 norm of the update
+    l2 += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j));
+  }
+
+  return l2;
+}
+
+/* -------------------------------------------------------------------------- */
+float Simulation::compute_step() {
+  float l2 = 0.;
+
+  Grid & uo = m_grids.old();
+  auto & requests = uo.getRequests();
+
+  // (re)start the persistent ghost exchange attached to the old grid
+  MPI_Startall(requests.size(), requests.data());
+
+  // computing the inner rows that do not depend on the ghosts
+  for (int i = 2; i < m_local_m - 2; i++) {
+    l2 += compute_row(i);
+  }
+
+  /// wait to receive the ghosts before using them for the computation
+  MPI_Waitall(2, requests.data(), MPI_STATUSES_IGNORE);
+
+  // computing the rows that depend on the ghosts
+  l2 += compute_row(1);
+  l2 += compute_row(m_local_m - 2);
+
+  /// wait for the sends to complete before the buffers are modified
+  MPI_Waitall(2, requests.data() + 2, MPI_STATUSES_IGNORE);
+
+  // Summing the values of all the processors together
+  MPI_Allreduce(MPI_IN_PLACE, &l2, 1, MPI_FLOAT, MPI_SUM, m_communicator);
+
+  return l2;
+}
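double_buffer.hh is included by simulation.hh but is not part of this patch (presumably unchanged from the base exercise). The only property the persistent requests rely on is that swap() exchanges which Grid is current and which is old without touching their storage, so the addresses registered with MPI_Send_init/MPI_Recv_init stay valid. A minimal sketch of such an interface, under that assumption:

#include <memory>
#include <utility>

#include "grid.hh"

class DoubleBuffer {
public:
  DoubleBuffer(int m = 0, int n = 0)
      : m_current(new Grid(m, n)), m_old(new Grid(m, n)) {}

  Grid & current() { return *m_current; }
  Grid & old() { return *m_old; }

  void resize(int m, int n) {
    m_current->resize(m, n);
    m_old->resize(m, n);
  }

  /// swap the roles of the two grids; the Grid objects themselves (and the
  /// buffers the persistent requests point to) are left untouched
  void swap() { std::swap(m_current, m_old); }

private:
  std::unique_ptr<Grid> m_current, m_old;
};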
diff --git a/poisson/solutions-persistent/simulation.hh b/poisson/solutions-persistent/simulation.hh
new file mode 100644
index 0000000..e7d0e41
--- /dev/null
+++ b/poisson/solutions-persistent/simulation.hh
@@ -0,0 +1,78 @@
+#ifndef SIMULATION_HH
+#define SIMULATION_HH
+
+/* -------------------------------------------------------------------------- */
+#include "double_buffer.hh"
+#include "dumpers.hh"
+#include "grid.hh"
+/* -------------------------------------------------------------------------- */
+#include <memory>
+#include <tuple>
+#include <mpi.h>
+/* -------------------------------------------------------------------------- */
+
+/* -------------------------------------------------------------------------- */
+class Simulation {
+public:
+  Simulation(int m, int n, MPI_Comm communicator);
+
+  virtual ~Simulation() = default;
+
+  /// set the initial conditions, Dirichlet and source term
+  virtual void set_initial_conditions();
+
+  /// perform the simulation
+  std::tuple<float, int> compute();
+
+  /// access the precision
+  void set_epsilon(float epsilon);
+  float epsilon() const;
+
+protected:
+  /// compute one step and return an error
+  virtual float compute_step();
+
+  /// compute one row
+  float compute_row(int i);
+
+private:
+  /// Global problem size
+  int m_global_m, m_global_n;
+
+  /// Local problem size
+  int m_local_m, m_local_n;
+
+  /// local offset
+  int m_offset_m, m_offset_n;
+
+  /// local neighbors
+  int m_north_prank, m_south_prank;
+
+  /// proc rank
+  int m_prank;
+
+  /// communicator size
+  int m_psize;
+
+  /// Precision to achieve
+  float m_epsilon;
+
+  /// grid spacing
+  float m_h_m;
+  float m_h_n;
+
+  /// Grids storage
+  DoubleBuffer m_grids;
+
+  /// source term
+  Grid m_f;
+
+  /// Dumper to use for outputs
+  std::unique_ptr<Dumper> m_dumper;
+
+  /// communicator
+  MPI_Comm m_communicator;
+};
+
+#endif /* SIMULATION_HH */
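A note on the conventions above: set_epsilon() stores the square of the tolerance, compute_step() returns the globally reduced sum of squared updates, and compute() therefore iterates until that sum drops below EPSILON squared, returning sqrt(l2) as the residual. Assuming the directory's Makefile (not shown in this patch) builds the `poisson` binary that optim.sub and poisson.sub invoke, a typical interactive run is `mpirun -np 4 ./poisson 512` (or `sbatch poisson.sub` on the cluster); rank 0 then prints, in order, the number of processes, the problem size, the iteration count, the residual and the wall-clock time in seconds.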