diff --git a/Makefile b/Makefile
index 9017960..77160b3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,21 @@
 CXX=mpic++
+#CXX=g++
 LD=${CXX}
-CXXFLAGS+=-Wall -Wextra -Werror -pedantic -std=c++11 -O3
+#VERIFY=-fsanitize=thread
+VERIFY= -D_DEBUG_MPI #-D_DEBUG
+OPENMP=-fopenmp
+NOOPENMP=-fno-openmp -Wno-unknown-pragmas
+#OPTIM+=-O3 -D_MPI $(NOOPENMP)
+OPTIM+=-O3 -D_MPI $(OPENMP) -march=native
+CXXFLAGS+=-Wall -Wextra -Werror -pedantic -std=c++14 -g -fPIE $(OPTIM) $(VERIFY)
 LDFLAGS+=-lm $(CXXFLAGS)
 
 OBJS=poisson.o grid.o double_buffer.o simulation.o dumpers.o
 
 all: poisson
 
 poisson: $(OBJS)
 	$(LD) -o $@ $(OBJS) $(LDFLAGS)
 
 clean:
 	rm -f hello poisson *.o *~
diff --git a/double_buffer.cc b/double_buffer.cc
index 67749d0..978eda5 100644
--- a/double_buffer.cc
+++ b/double_buffer.cc
@@ -1,25 +1,31 @@
 /* -------------------------------------------------------------------------- */
 #include "double_buffer.hh"
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 DoubleBuffer::DoubleBuffer(int m, int n)
     : m_current(new Grid(m, n)), m_old(new Grid(m, n)) {}
 
 /* -------------------------------------------------------------------------- */
 Grid & DoubleBuffer::current() { return *m_current; }
 
 /* -------------------------------------------------------------------------- */
 Grid & DoubleBuffer::old() { return *m_old; }
 
+/* -------------------------------------------------------------------------- */
+const Grid & DoubleBuffer::current() const { return *m_current; }
+
+/* -------------------------------------------------------------------------- */
+const Grid & DoubleBuffer::old() const { return *m_old; }
+
 /* -------------------------------------------------------------------------- */
 void DoubleBuffer::swap() { m_current.swap(m_old); }
 
 /* -------------------------------------------------------------------------- */
 void DoubleBuffer::resize(int m, int n) {
   m_current->resize(m, n);
   m_old->resize(m, n);
 }
diff --git a/double_buffer.hh b/double_buffer.hh
index a05410b..1d21b5c 100644
--- a/double_buffer.hh
+++ b/double_buffer.hh
@@ -1,25 +1,28 @@
 #ifndef DOUBLE_BUFFER
 #define DOUBLE_BUFFER
 
 /* -------------------------------------------------------------------------- */
 #include <memory>
 /* -------------------------------------------------------------------------- */
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 
 class DoubleBuffer {
 public:
   DoubleBuffer(int m = 0, int n = 0);
 
   void resize(int m, int n);
 
   Grid & current();
   Grid & old();
 
+  const Grid & current() const;
+  const Grid & old() const;
+
   void swap();
 
 private:
   std::unique_ptr<Grid> m_current;
   std::unique_ptr<Grid> m_old;
 };
 
 #endif /* DOUBLE_BUFFER */
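Note on the DoubleBuffer changes above: swap() exchanges the two std::unique_ptr owners, so flipping the "current" and "old" grids between iterations is a constant-time pointer swap rather than a copy of the grid data. A minimal standalone sketch of the same idiom (the Buffer type and the names are illustrative, not part of the patch):

#include <cstddef>
#include <memory>
#include <vector>

struct Buffer {
  std::vector<float> data;
};

class FlipFlop {
public:
  explicit FlipFlop(std::size_t n)
      : m_current(new Buffer{std::vector<float>(n)}),
        m_old(new Buffer{std::vector<float>(n)}) {}

  Buffer & current() { return *m_current; }
  Buffer & old() { return *m_old; }

  // O(1): only the owning pointers are exchanged, the payloads stay in place.
  void swap() { m_current.swap(m_old); }

private:
  std::unique_ptr<Buffer> m_current, m_old;
};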
diff --git a/dumpers.cc b/dumpers.cc
index 5236b4c..ec03fe4 100644
--- a/dumpers.cc
+++ b/dumpers.cc
@@ -1,160 +1,214 @@
-
 /* -------------------------------------------------------------------------- */
 #include "dumpers.hh"
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 #include <array>
 #include <fstream>
 #include <iomanip>
 #include <sstream>
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 void Dumper::set_min(float min) { m_min = min; }
 
 void Dumper::set_max(float max) { m_max = max; }
 
 float Dumper::min() const { return m_min; }
 
 float Dumper::max() const { return m_max; }
 
 /* -------------------------------------------------------------------------- */
 void DumperASCII::dump(const Grid & grid, int step) {
   std::ofstream fout;
   std::stringstream sfilename;
 
   sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".pgm";
   fout.open(sfilename.str());
 
   int m = grid.m();
   int n = grid.n();
 
   fout << "P2" << std::endl << "# CREATOR: Poisson program" << std::endl;
   fout << m << " " << n << std::endl;
   fout << 255 << std::endl;
 
   for (int i = 0; i < m; i++) {
     for (int j = 0; j < n; j++) {
       int v = 255. * (grid(i, j) - m_min) / (m_max - m_min);
       v = std::min(v, 255);
       fout << v << std::endl;
     }
   }
 }
 
 /* -------------------------------------------------------------------------- */
 void DumperBinary::dump(const Grid & grid, int step) {
   std::stringstream sfilename;
 
-  int prank, psize;
+  std::array<int, 2> coords{0, 0}, dims{0, 0};
+#if defined(_MPI)
+  int prank{0}, psize{1};
   MPI_Comm_rank(m_communicator, &prank);
   MPI_Comm_size(m_communicator, &psize);
 
+  std::array<int, 2> periods{0, 0};
+  MPI_Cart_get(m_communicator, dims.size(), dims.data(), periods.data(), coords.data());
+#else
+  dims = {1, 1};
+#endif
+
   sfilename << "out_" << std::setfill('0') << std::setw(5) << step << ".bmp";
 
   int h = grid.m();
   int w = grid.n();
+  int global_w = w;
+
+  std::array<int, 2> starts = {0, 0};
+  std::array<int, 2> local_sizes = {h, w};
+  std::array<int, 2> global_sizes = local_sizes;
 
-  int start = 0;
+#if defined(_MPI)
+  // removing the ghosts
+  if (prank == 0) std::cout << dims[0] << " " << dims[1] << "\n";
 
-  // removing the ghosts from the height
-  if (psize > 1) {
-    h = (prank == 0 || prank == psize - 1 ? h - 1 : h - 2);
-    start = (prank == 0 ? 0 : 1);
+  for (std::size_t i = 0; i < local_sizes.size(); ++i) {
+    if (dims[i] > 1) {
+      local_sizes[i] -= (coords[i] == 0 || coords[i] == dims[i] - 1 ? 1 : 2);
+      starts[i] += (coords[i] == 0 ? 0 : 1);
+    }
   }
 
-  // Gathering the size of every processors, this could be done as in the
-  // constructor of the Simulation instead
-  std::vector<int> size_per_proc(psize);
-  MPI_Allgather(&h, 1, MPI_INT, size_per_proc.data(), 1, MPI_INT,
-                m_communicator);
+  std::array<MPI_Comm, 2> comms;
 
   // determining the local offset
-  int offset_h = 0;
-  for (int i = 0; i < prank; ++i) {
-    offset_h += size_per_proc[i];
+  std::array<int, 2> offset = {0, 0};
+
+  std::vector<std::vector<int>> line_col_size(2);
+
+  for (size_t i = 0; i < line_col_size.size(); ++i) {
+    std::array<int, 2> remain_dims = {false, false};
+    remain_dims[i] = true;
+    MPI_Cart_sub(m_communicator, remain_dims.data(), &(comms[i]));
+
+    line_col_size[i].resize(dims[i]);
+    MPI_Allgather(&local_sizes[i], 1, MPI_INT, line_col_size[i].data(), 1,
+                  MPI_INT, comms[i]);
+
+    global_sizes[i] = 0;
+    for (int j = 0; j < dims[i]; ++j) {
+      global_sizes[i] += line_col_size[i][j];
+    }
   }
 
-  int total_h = offset_h;
-  for (int i = prank; i < psize; ++i) {
-    total_h += size_per_proc[i];
+  //for (int i = dims[0] - 1; i > coords[0]; --i) {
+  for (int i = 0; i < coords[0]; ++i) {
+    offset[0] += line_col_size[0][i];
   }
 
-  int row_size = 3 * w;
+  for (int i = 0; i < coords[1]; ++i) {
+    offset[1] += line_col_size[1][i];
+  }
+#endif
+
+  global_w = 4 * ((24 * global_sizes[1] + 31) / 32);
+
   // if the file width (3*w) is not a multiple of 4 adds enough bytes to make it
   // a multiple of 4
-  int padding = (4 - (row_size) % 4) % 4;
-  row_size += padding;
+  int padding = global_w - 3 * global_sizes[1];
 
-  int filesize = 54 + row_size * total_h;
+  w = 3 * local_sizes[1];
 
-  std::vector<char> img(row_size * h);
+  if (coords[1] == dims[1] - 1) {
+    w += padding;
+  } else {
+    padding = 0;
+  }
+
+  int filesize = 54 + global_sizes[0] * global_w;
+
+  std::vector<char> img(local_sizes[0] * w);
   std::fill(img.begin(), img.end(), 0);
 
-  for (int i = 0; i < h; i++) {
-    for (int j = 0; j < w; j++) {
-      float v = ((grid(i + start, j) - m_min) / (m_max - m_min));
+  for (int i = 0; i < local_sizes[0]; i++) {
+    for (int j = 0; j < local_sizes[1]; j++) {
+      float v =
+          ((grid(i + starts[0], j + starts[1]) - m_min) / (m_max - m_min));
 
       float r = v * 255; // Red channel
       float g = v * 255; // Green channel
-      float b = v * 255; // Red channel
+      float b = 0.f;     // prank * 255 / psize; // Blue channel
 
       r = std::min(r, 255.f);
       g = std::min(g, 255.f);
       b = std::min(b, 255.f);
 
-      img[row_size * i + 3 * j + 2] = r;
-      img[row_size * i + 3 * j + 1] = g;
-      img[row_size * i + 3 * j + 0] = b;
+      img[w * i + 3 * j + 0] = r;
+      img[w * i + 3 * j + 1] = g;
+      img[w * i + 3 * j + 2] = b;
     }
   }
 
+  auto write_bytes = [](auto && array, auto && value) {
+    array[0] = value;
+    array[1] = value >> 8;
+    array[2] = value >> 16;
+    array[3] = value >> 24;
+  };
+
   std::array<char, 14> bmpfileheader = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
   std::array<char, 40> bmpinfoheader = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
 
-  bmpfileheader[2] = filesize;
-  bmpfileheader[3] = filesize >> 8;
-  bmpfileheader[4] = filesize >> 16;
-  bmpfileheader[5] = filesize >> 24;
-
-  bmpinfoheader[4] = w;
-  bmpinfoheader[5] = w >> 8;
-  bmpinfoheader[6] = w >> 16;
-  bmpinfoheader[7] = w >> 24;
-  bmpinfoheader[8] = total_h;
-  bmpinfoheader[9] = total_h >> 8;
-  bmpinfoheader[10] = total_h >> 16;
-  bmpinfoheader[11] = total_h >> 24;
-  bmpinfoheader[20] = (filesize - 54);
-  bmpinfoheader[21] = (filesize - 54) >> 8;
-  bmpinfoheader[22] = (filesize - 54) >> 16;
-  bmpinfoheader[23] = (filesize - 54) >> 24;
+  write_bytes(&bmpfileheader[2], filesize);
+
+  write_bytes(&bmpinfoheader[4], global_sizes[1]);
+  write_bytes(&bmpinfoheader[8], global_sizes[0]);
+  write_bytes(&bmpinfoheader[20], filesize - 54);
 
+#if defined(_MPI)
   MPI_File fh;
   MPI_Status status;
+  MPI_Datatype subarray_t;
+
+#if defined(_DEBUG_MPI)
+  for (int i = 0; i < psize; ++i) {
+    if (prank == i)
+      std::cerr << prank << " (" << coords[0] << ", " << coords[1] << ")"
+                << " - " << local_sizes[0] << "x" << local_sizes[1] << "(" << w
+                << ") + " << offset[0] << "x" << offset[1] << "("
+                << 3 * offset[1] << ") ++ [" << global_sizes[0] << "x"
+                << global_sizes[1] << "(" << global_w << ")] " << padding
+                << "\n";
+    MPI_Barrier(m_communicator);
+  }
+#endif
+
+  global_sizes[1] = global_w;
+  local_sizes[1] = w;
+  offset[1] *= 3;
+
+  MPI_Type_create_subarray(global_sizes.size(), global_sizes.data(),
+                           local_sizes.data(), offset.data(), MPI_ORDER_C,
+                           MPI_CHAR, &subarray_t);
+  MPI_Type_commit(&subarray_t);
 
   // opening a file in write and create mode
   MPI_File_open(MPI_COMM_WORLD, sfilename.str().c_str(),
-                MPI_MODE_WRONLY | MPI_MODE_CREATE,
-                MPI_INFO_NULL, &fh);
+                MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 
   // defining the size of the file
   MPI_File_set_size(fh, filesize);
 
   // rank 0 writes the header
   if (prank == 0) {
     MPI_File_write_at(fh, 0, bmpfileheader.data(), 14, MPI_CHAR, &status);
     MPI_File_write_at(fh, 14, bmpinfoheader.data(), 40, MPI_CHAR, &status);
   }
 
-  int offset = 54 + row_size * offset_h;
-
-  // We also could write that data with a write_at, the view is just to show
-  // different possibilities
-  MPI_File_set_view(fh, offset, MPI_CHAR, MPI_CHAR, "native",
-                    MPI_INFO_NULL);
+  if (prank == 0) {
+    std::cout << filesize << std::endl;
+  }
+
+  MPI_File_set_view(fh, 54, MPI_CHAR, subarray_t, "native", MPI_INFO_NULL);
 
   MPI_File_write(fh, img.data(), img.size(), MPI_CHAR, &status);
 
   MPI_File_close(&fh);
+#endif
 }
diff --git a/dumpers.hh b/dumpers.hh
index 5d55e8e..dc9b013 100644
--- a/dumpers.hh
+++ b/dumpers.hh
@@ -1,47 +1,58 @@
+#if defined(_MPI)
 #include <mpi.h>
+#endif
 
 #ifndef DUMPERS_HH
 #define DUMPERS_HH
 
 /* -------------------------------------------------------------------------- */
 class Grid;
 
 /* -------------------------------------------------------------------------- */
 class Dumper {
 public:
+#if defined(_MPI)
   explicit Dumper(MPI_Comm communicator)
       : m_min(-1.), m_max(1.), m_communicator(communicator) {}
-
+#else
+  Dumper() : m_min(-1.), m_max(1.) {}
+#endif
   virtual void dump(const Grid & m_grid, int step) = 0;
 
   void set_min(float min);
   void set_max(float max);
 
   float min() const;
   float max() const;
 
 protected:
   float m_min, m_max;
-
+#if defined(_MPI)
   MPI_Comm m_communicator;
+#endif
 };
 
 /* -------------------------------------------------------------------------- */
 class DumperASCII : public Dumper {
 public:
-  explicit DumperASCII(MPI_Comm communicator)
-      : Dumper(communicator) {}
+#if defined(_MPI)
  explicit DumperASCII(MPI_Comm communicator) : Dumper(communicator) {}
+#else
+  DumperASCII() : Dumper() {}
+#endif
 
   virtual void dump(const Grid & grid, int step);
 };
 
 /* -------------------------------------------------------------------------- */
 class DumperBinary : public Dumper {
 public:
-  explicit DumperBinary(MPI_Comm communicator)
-      : Dumper(communicator) {}
-
+#if defined(_MPI)
+  explicit DumperBinary(MPI_Comm communicator) : Dumper(communicator) {}
+#else
+  DumperBinary() : Dumper() {}
+#endif
 
   virtual void dump(const Grid & grid, int step);
 };
 
 #endif /* DUMPERS_HH */
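Note on DumperBinary::dump(): the BMP row stride is padded up to a multiple of 4 bytes, global_w = 4 * ((24 * w + 31) / 32), and each rank then writes its own sub-block of pixel rows/columns through an MPI subarray datatype installed as the file view right after the 54-byte header. A reduced sketch of that subarray file-view pattern (function and parameter names are made up, error handling omitted):

#include <mpi.h>
#include <vector>

// Writes one rank's (local_h x local_w_bytes) block of a global
// (global_h x global_w_bytes) byte image at offset (row_off, byte_off),
// skipping a 54-byte header, as DumperBinary does for the BMP payload.
void write_block(MPI_Comm comm, const char * filename, int global_h,
                 int global_w_bytes, int local_h, int local_w_bytes,
                 int row_off, int byte_off, const std::vector<char> & block) {
  int sizes[2] = {global_h, global_w_bytes};
  int subsizes[2] = {local_h, local_w_bytes};
  int starts[2] = {row_off, byte_off};

  MPI_Datatype subarray;
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_CHAR,
                           &subarray);
  MPI_Type_commit(&subarray);

  MPI_File fh;
  MPI_File_open(comm, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE,
                MPI_INFO_NULL, &fh);

  // The view maps this rank's file accesses onto its block, after the header.
  MPI_File_set_view(fh, 54, MPI_CHAR, subarray, "native", MPI_INFO_NULL);

  MPI_Status status;
  MPI_File_write(fh, block.data(), static_cast<int>(block.size()), MPI_CHAR,
                 &status);

  MPI_File_close(&fh);
  MPI_Type_free(&subarray);
}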
diff --git a/grid.cc b/grid.cc
index ef28218..1ac1ef1 100644
--- a/grid.cc
+++ b/grid.cc
@@ -1,60 +1,119 @@
 /* -------------------------------------------------------------------------- */
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
 #include <algorithm>
+#if defined(_DEBUG_MPI)
+#include <iomanip>
+#endif
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 Grid::Grid(int m, int n) : m_m(m), m_n(n), m_storage(m * n) { clear(); }
 
+/* -------------------------------------------------------------------------- */
+Grid::~Grid() {
+#if defined(_MPI)
+  if (column_t != MPI_DATATYPE_NULL) {
+    MPI_Type_free(&column_t);
+    MPI_Type_free(&line_t);
+  }
+#endif
+}
+
 /* -------------------------------------------------------------------------- */
 void Grid::clear() { std::fill(m_storage.begin(), m_storage.end(), 0.); }
 
 /* -------------------------------------------------------------------------- */
 void Grid::resize(int m, int n) {
   m_m = m;
   m_n = n;
-  m_storage.resize(m*n);
+  m_storage.resize(m * n);
 }
 
 /* -------------------------------------------------------------------------- */
 int Grid::m() const { return m_m; }
 int Grid::n() const { return m_n; }
 
 /* -------------------------------------------------------------------------- */
-void Grid::initializeHaloCommunications(MPI_Comm & m_communicator) {
-  // retrieving the number of proc and the rank in the proc pool
-  MPI_Comm_rank(m_communicator, &m_prank);
-  MPI_Comm_size(m_communicator, &m_psize);
+#if defined(_MPI)
+void Grid::initializeHaloCommunications(MPI_Comm & m_communicator,
+                                        const std::array<int, 2> & ghosts) {
+  std::array<int, 2> coords;
+  int prank;
+
+  MPI_Type_vector(m_m - ghosts[0], 1, m_n, MPI_FLOAT, &column_t);
+  MPI_Type_vector(1, m_n - ghosts[1], 1, MPI_FLOAT, &line_t);
+
+  MPI_Type_commit(&column_t);
+  MPI_Type_commit(&line_t);
+
+  MPI_Comm_rank(m_communicator, &prank);
+  MPI_Cart_coords(m_communicator, prank, coords.size(), coords.data());
 
   // determining the rank of the neighbors
-  m_north_prank = (m_prank == 0 ? MPI_PROC_NULL : m_prank - 1);
-  m_south_prank = (m_prank == (m_psize - 1) ? MPI_PROC_NULL : m_prank + 1);
-
-  /// preparing synchronize north
-  MPI_Send_init(&(operator()(1, 0)), m_n, MPI_FLOAT, m_north_prank, 0,
-                m_communicator, &m_requests[2]);
-  MPI_Recv_init(&(operator()(0, 0)), m_n, MPI_FLOAT, m_north_prank, 0,
-                m_communicator, &m_requests[0]);
-
-  /// preparing synchronize south
-  MPI_Send_init(&(operator()(m_m - 2, 0)), m_n, MPI_FLOAT, m_south_prank, 0,
-                m_communicator, &m_requests[3]);
-  MPI_Recv_init(&(operator()(m_m - 1, 0)), m_n, MPI_FLOAT, m_south_prank, 0,
-                m_communicator, &m_requests[1]);
-}
+  int left, right, top, bottom;
+  MPI_Cart_shift(m_communicator, 0, 1, &left, &right);
+  MPI_Cart_shift(m_communicator, 1, 1, &top, &bottom);
 
-/* -------------------------------------------------------------------------- */
-void Grid::startSynchronization() {
-  MPI_Startall(4, m_requests.data());
-}
+#if defined(_DEBUG_MPI)
+  static bool first = true;
+  if (first) {
+    int psize;
+    MPI_Comm_size(m_communicator, &psize);
+    for (int i = 0; i < psize; ++i) {
+      if (prank == i) {
+        std::cerr << prank << std::setw(7) << top << "\n"
+                  << prank << std::setw(8) << "^\n"
+                  << prank << std::setw(3) << left << " < . > " << right << "\n"
+                  << prank << std::setw(8) << "v\n"
+                  << prank << std::setw(7) << bottom << "\n\n";
+      }
+      MPI_Barrier(m_communicator);
+    }
+    first = false;
+  }
+#endif
 
-/* -------------------------------------------------------------------------- */
-void Grid::waitReceives() {
-  MPI_Waitall(2, m_requests.data(), MPI_STATUS_IGNORE);
-}
+  auto addr = [&](auto && i, auto && j) { return &(this->operator()(i, j)); };
 
-/* -------------------------------------------------------------------------- */
-void Grid::waitSends() {
-  MPI_Waitall(2, m_requests.data() + 2, MPI_STATUS_IGNORE);
+  MPI_Request * rrequest = m_requests.data();
+  MPI_Request * srequest = m_requests.data() + 4;
+
+  static int tag = 1;
+  /// preparing receives
+  {
+    auto send = addr(m_m - 2, 1);
+    auto recv = addr(0, 1);
+    MPI_Send_init(send, 1, line_t, bottom, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, line_t, top, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
+  {
+    auto send = addr(1, m_n - 2);
+    auto recv = addr(1, 0);
+    MPI_Send_init(send, 1, column_t, right, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, column_t, left, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
+  {
+    auto send = addr(1, 1);
+    auto recv = addr(m_m - 1, 1);
+    MPI_Send_init(send, 1, line_t, top, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, line_t, bottom, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
+  {
+    auto send = addr(1, 1);
+    auto recv = addr(1, m_n - 1);
+    MPI_Send_init(send, 1, column_t, left, tag, m_communicator, srequest);
+    MPI_Recv_init(recv, 1, column_t, right, tag, m_communicator, rrequest);
+    ++rrequest;
+    ++srequest;
+    ++tag;
+  }
 }
+#endif
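Note on the halo setup in grid.cc: columns of a row-major grid are not contiguous, so they are described with MPI_Type_vector (one float per row, stride m_n) and exchanged through persistent requests started later with MPI_Startall. A self-contained sketch of one left/right exchange built the same way (the struct, names and tags are illustrative, not the patch's own):

#include <mpi.h>
#include <vector>

// Persistent left/right halo exchange for a row-major m x n float grid:
// column j=1 goes left, column j=n-2 goes right, ghost columns j=0 and
// j=n-1 are filled from the corresponding neighbours.
struct ColumnHalo {
  MPI_Datatype column_t{MPI_DATATYPE_NULL};
  MPI_Request requests[4];

  void setup(MPI_Comm cart, std::vector<float> & grid, int m, int n) {
    MPI_Type_vector(m, 1, n, MPI_FLOAT, &column_t); // m blocks of 1, stride n
    MPI_Type_commit(&column_t);

    int left, right;
    MPI_Cart_shift(cart, 1, 1, &left, &right); // neighbours along dimension 1

    MPI_Send_init(&grid[1], 1, column_t, left, 0, cart, &requests[0]);
    MPI_Send_init(&grid[n - 2], 1, column_t, right, 1, cart, &requests[1]);
    MPI_Recv_init(&grid[0], 1, column_t, left, 1, cart, &requests[2]);
    MPI_Recv_init(&grid[n - 1], 1, column_t, right, 0, cart, &requests[3]);
  }

  void exchange() {
    MPI_Startall(4, requests);
    MPI_Waitall(4, requests, MPI_STATUSES_IGNORE);
  }

  void teardown() {
    for (auto & r : requests)
      MPI_Request_free(&r);
    MPI_Type_free(&column_t);
  }
};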
diff --git a/grid.hh b/grid.hh
index b68a933..5602db2 100644
--- a/grid.hh
+++ b/grid.hh
@@ -1,50 +1,134 @@
 #ifndef GRID_HH
 #define GRID_HH
 
 /* -------------------------------------------------------------------------- */
-#include
 #include <array>
+#include <vector>
+#if defined(_MPI)
 #include <mpi.h>
+#endif
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#if defined(_DEBUG)
+#include <iostream>
+#endif
/* -------------------------------------------------------------------------- */
+namespace {
+#if not defined(_DEBUG)
+std::size_t thread_id() __attribute__((unused));
+std::size_t nb_threads() __attribute__((unused));
+#endif
+
+std::size_t thread_id() {
+#if defined(_OPENMP)
+  return omp_get_thread_num();
+#else
+  return 0;
+#endif
+}
+
+std::size_t nb_threads() {
+#if defined(_OPENMP)
+  return omp_get_max_threads();
+#else
+  return 1;
+#endif
+}
+} // namespace
+
+namespace {
+struct DEBUG {
+#if defined(_DEBUG)
+  static void print(const std::string & func) {
+    auto nthreads = nb_threads();
+    auto nthread = thread_id();
+#pragma omp critical
+    std::cerr << nthread << "/" << nthreads << ": " << func << "\n";
+  }
+#else
+  static void print(const std::string & /*func*/) {}
+#endif
+};
+} // namespace
+
 /* -------------------------------------------------------------------------- */
 class Grid {
 public:
   Grid(int m = 0, int n = 0);
+  ~Grid();
 
   /// access the value [i][j] of the grid
   inline float & operator()(int i, int j) { return m_storage[i * m_n + j]; }
   inline const float & operator()(int i, int j) const {
     return m_storage[i * m_n + j];
   }
 
   void resize(int m, int n);
 
   /// set the grid to 0
   void clear();
 
   int m() const;
   int n() const;
 
-  void initializeHaloCommunications(MPI_Comm & m_communicator);
-  void startSynchronization();
-  void waitReceives();
-  void waitSends();
+#if defined(_MPI)
+  void initializeHaloCommunications(MPI_Comm & m_communicator,
+                                    const std::array<int, 2> & ghosts);
 
+  void startSynchronization() {
+    MPI_Request * requests = m_requests.data();
+#pragma omp task depend(in : requests [0:8])
+    {
+      DEBUG::print(__PRETTY_FUNCTION__);
+      MPI_Startall(8, requests);
+    }
+  }
 
-private:
-  int m_m, m_n;
-  std::vector<float> m_storage;
+  void waitReceives() {
+    auto requests = m_requests.data();
+#pragma omp task depend(out : requests [0:4])
+    {
+      DEBUG::print(__PRETTY_FUNCTION__);
+      MPI_Waitall(4, requests, MPI_STATUS_IGNORE);
+    }
+  }
 
-  /// local neighbors
-  int m_north_prank, m_south_prank;
+  void waitSends() {
+    auto requests = m_requests.data();
+#pragma omp task depend(out : requests [4:4])
+    {
+      DEBUG::print(__PRETTY_FUNCTION__);
+      MPI_Waitall(4, requests + 4, MPI_STATUS_IGNORE);
+    }
+  }
+#else
+  void startSynchronization() {
+#pragma omp task
+    DEBUG::print(__PRETTY_FUNCTION__);
+  }
 
-  /// proc rank
-  int m_prank;
+  void waitReceives() {
+#pragma omp task
+    DEBUG::print(__PRETTY_FUNCTION__);
+  }
 
-  /// nb process
-  int m_psize;
+  void waitSends() {
+#pragma omp task
+    DEBUG::print(__PRETTY_FUNCTION__);
+  }
+#endif
 
+private:
+  int m_m, m_n;
+  std::vector<float> m_storage;
+
+#if defined(_MPI)
   /// request for asynchronous communications
-  std::array<MPI_Request, 4> m_requests;
+  std::array<MPI_Request, 8> m_requests;
+
+  /// communication types
+  MPI_Datatype column_t{MPI_DATATYPE_NULL}, line_t{MPI_DATATYPE_NULL};
+#endif
 };
 
 #endif /* GRID_HH */
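Note on the task-based synchronization in grid.hh: startSynchronization(), waitReceives() and waitSends() each spawn an omp task whose depend clause on the request array serializes them against one another while independent compute tasks keep running; the patch chains them through in/out anti-dependences on requests[0:8], requests[0:4] and requests[4:4]. A minimal illustration of the underlying mechanism, written here in the plain producer/consumer form (the buffer and printed messages are invented for the example; compile with -fopenmp):

#include <cstdio>

int main() {
  int buffer[8] = {0};

  #pragma omp parallel
  #pragma omp single
  {
    int * b = buffer;

    // producer: fills both halves (plays the role of startSynchronization)
    #pragma omp task depend(out : b[0:4], b[4:4])
    for (int i = 0; i < 8; ++i)
      b[i] = i * i;

    // consumer of the first half (plays the role of waitReceives)
    #pragma omp task depend(in : b[0:4])
    std::printf("front half: %d %d %d %d\n", b[0], b[1], b[2], b[3]);

    // consumer of the second half (plays the role of waitSends)
    #pragma omp task depend(in : b[4:4])
    std::printf("back half: %d %d %d %d\n", b[4], b[5], b[6], b[7]);
  }
  return 0;
}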
diff --git a/poisson.cc b/poisson.cc
index f8e402d..4edaee0 100644
--- a/poisson.cc
+++ b/poisson.cc
@@ -1,63 +1,91 @@
 /* -------------------------------------------------------------------------- */
 #include "simulation.hh"
 /* -------------------------------------------------------------------------- */
 #include <chrono>
 #include <iostream>
 #include <sstream>
 #include <tuple>
-#include
 /* -------------------------------------------------------------------------- */
-#include
+#if defined(_MPI)
 #include <mpi.h>
+#endif
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
 /* -------------------------------------------------------------------------- */
 
-#define EPSILON 0.005
+#define EPSILON 0.1
 
 typedef std::chrono::high_resolution_clock clk;
 typedef std::chrono::duration<double> second;
 
 static void usage(const std::string & prog_name) {
   std::cerr << prog_name << " " << std::endl;
   exit(0);
 }
 
-
 int main(int argc, char * argv[]) {
-  MPI_Init(&argc, &argv);
-
-  int prank, psize;
+  int prank{0}, psize{1}, nthreads{1};
+#if defined(_MPI)
+  int provided;
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+  // assert(provided == MPI_THREAD_MULTIPLE);
 
   MPI_Comm_rank(MPI_COMM_WORLD, &prank);
   MPI_Comm_size(MPI_COMM_WORLD, &psize);
+#endif
+
+#if defined (_OPENMP)
+  nthreads = omp_get_max_threads();
+#endif
 
-  if (argc != 2) usage(argv[0]);
+  if (argc != 2)
+    usage(argv[0]);
 
   std::stringstream args(argv[1]);
   int N;
   args >> N;
 
-  if(args.fail()) usage(argv[0]);
+  if (args.fail())
+    usage(argv[0]);
 
-  Simulation simu(N, N, MPI_COMM_WORLD);
+  std::unique_ptr<Simulation> simu;
 
-  simu.set_initial_conditions();
+#if defined(_MPI)
+  simu = std::make_unique<Simulation>(N, N, MPI_COMM_WORLD);
+#else
+  simu = std::make_unique<Simulation>(N, N);
+#endif
 
-  simu.set_epsilon(EPSILON);
+  simu->set_initial_conditions();
+
+  simu->set_epsilon(EPSILON);
 
   float l2;
   int k;
 
   auto start = clk::now();
 
-  std::tie(l2, k) = simu.compute();
+  std::tie(l2, k) = simu->compute();
 
   auto end = clk::now();
 
   second time = end - start;
 
-  if(prank == 0)
-    std::cout << psize << " " << N << " "
-              << k << " " << std::scientific << l2 << " "
-              << time.count() << std::endl;
-
+  auto min = simu->min();
+  auto max = simu->max();
+
+  if (prank == 0)
+    std::cout << "MPI: " << psize << " (" << simu->dims(0) << ", "
+              << simu->dims(1) << ")\n"
+              << "OMP: " << nthreads << "\n"
+              << "Grid: " << N << "x" << N << "\n"
+              << "Bound: " << "[ " << min << " : " << max << " ]" << "\n"
+              << "Iter: " << k << "\n"
+              << "Norm: " << std::scientific << l2 << "\n"
+              << "Time: " << time.count() << "s" << "\n"
+              << "Flops: " << simu->complexity() * k / time.count() / 1e9
+              << " GFlops" << std::endl;
+
+#if defined(_MPI)
+  simu.reset(); // force destruction of the MPI-related members before MPI_Finalize
   MPI_Finalize();
+#endif
 
   return 0;
 }
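Note on the MPI initialization in poisson.cc: since MPI calls are issued from OpenMP tasks that may execute on any thread, the program asks for MPI_THREAD_MULTIPLE; the patch keeps the check of the granted level only as a comment. A sketch of what an explicit check could look like (the abort policy is an assumption, not the patch's behaviour):

#include <cstdio>
#include <mpi.h>

int main(int argc, char ** argv) {
  int provided = MPI_THREAD_SINGLE;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

  // Thread-support levels are ordered, so a plain comparison is enough.
  if (provided < MPI_THREAD_MULTIPLE) {
    std::fprintf(stderr, "MPI_THREAD_MULTIPLE not available (got %d)\n",
                 provided);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  // ... the simulation would run here ...

  MPI_Finalize();
  return 0;
}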
diff --git a/simulation.cc b/simulation.cc
index 0278d94..1dd2abb 100644
--- a/simulation.cc
+++ b/simulation.cc
@@ -1,125 +1,310 @@
 /* -------------------------------------------------------------------------- */
 #include "simulation.hh"
 /* -------------------------------------------------------------------------- */
+#include <cassert>
 #include <cmath>
 #include <iostream>
-/* -------------------------------------------------------------------------- */
+#include <memory>
+#if defined(_MPI)
+#include <mpi.h>
+#endif
 /* -------------------------------------------------------------------------- */
 
-Simulation::Simulation(int m, int n, MPI_Comm communicator)
-    : m_global_m(m), m_global_n(n), m_epsilon(1e-7), m_h_m(1. / m),
-      m_h_n(1. / n), m_dumper(new DumperBinary(communicator)),
-      m_communicator(communicator) {
+#if defined(_MPI)
+Simulation::Simulation(int m, int n, MPI_Comm communicator) {
+#else
+Simulation::Simulation(int m, int n) {
+#endif
 
-  // retrieving the number of proc and the rank in the proc pool
+// retrieving the number of proc and the rank in the proc pool
+#if defined(_MPI)
+  MPI_Comm_size(communicator, &m_psize);
+
+  m_dims = {0, 0};
+  std::array<int, 2> periods{0, 0};
+  MPI_Dims_create(m_psize, m_dims.size(), m_dims.data());
+
+  assert(m_dims[0] * m_dims[1] == m_psize &&
+         "MPI_Dims_create returned an inconsistent decomposition");
+
+  MPI_Cart_create(communicator, m_dims.size(), m_dims.data(), periods.data(), 0,
+                  &m_communicator);
+
   MPI_Comm_rank(m_communicator, &m_prank);
-  MPI_Comm_size(m_communicator, &m_psize);
 
-  // computation of the local size of the grid the remainder is spread equally
-  // on the first processors
-  m_local_m = m / m_psize + (m_prank < m % m_psize ? 1 : 0);
-  m_local_n = n;
+  MPI_Cart_coords(m_communicator, m_prank, m_coords.size(), m_coords.data());
 
-  // adding the ghosts lines if needed
-  if (m_psize > 1)
-    m_local_m += (m_prank == 0 || m_prank == m_psize - 1) ? 1 : 2;
+  m_dumper = std::make_unique<DumperBinary>(m_communicator);
+#else
+  m_dims = {1, 1};
+  m_coords = {0, 0};
+  m_dumper = std::make_unique<DumperBinary>();
+#endif
 
-  // computing the offsets of the local grid in the global one
-  m_offset_m =
-      (m / m_psize) * m_prank + (m_prank < m % m_psize ? m_prank : m % m_psize);
-  m_offset_n = 0;
+  m_global_size = {m, n};
 
-  // resizing the different grids
-  m_grids.resize(m_local_m, m_local_n);
-  m_f.resize(m_local_m, m_local_n);
+  std::array<int, 2> ghosts{0, 0};
+
+  for (std::size_t i = 0; i < m_local_size.size(); ++i) {
+    // computation of the local size of the grid; the remainder is spread
+    // equally on the first processors
+    m_local_size[i] = m_global_size[i] / m_dims[i] +
+                      (m_coords[i] < m_global_size[i] % m_dims[i] ? 1 : 0);
+
+    // adding the ghosts lines if needed
+    if (m_dims[i] > 1)
+      ghosts[i] = (m_coords[i] == 0 || m_coords[i] == m_dims[i] - 1) ? 1 : 2;
+
+    m_local_size[i] += ghosts[i];
 
-  m_grids.current().initializeHaloCommunications(communicator);
-  m_grids.old().initializeHaloCommunications(communicator);
+    // computing the offsets of the local grid in the global one
+    m_offset[i] = (m_global_size[i] / m_dims[i]) * m_coords[i] +
+                  (m_coords[i] < m_global_size[i] % m_dims[i]
+                       ? m_coords[i]
+                       : m_global_size[i] % m_dims[i]);
+
+    m_h[i] = 1. / m_global_size[i];
+  }
+
+#if defined(_DEBUG_MPI)
+  for (int i = 0; i < m_psize; ++i) {
+    if (m_prank == i)
+      std::cerr << m_prank << " (" << m_coords[0] << ", " << m_coords[1] << ")"
+                << " - " << m_local_size[0] << "x" << m_local_size[1] << " + "
+                << m_offset[0] << "x" << m_offset[1] << " ++ [" << ghosts[0]
+                << "x" << ghosts[1] << "]\n";
+    MPI_Barrier(m_communicator);
+  }
+#endif
+
+  // resizing the different grids
+  m_grids.resize(m_local_size[0], m_local_size[1]);
+  m_f.resize(m_local_size[0], m_local_size[1]);
+
+#if defined(_MPI)
+  m_grids.current().initializeHaloCommunications(m_communicator, ghosts);
+  m_grids.old().initializeHaloCommunications(m_communicator, ghosts);
+#endif
 }
 
 /* -------------------------------------------------------------------------- */
 void Simulation::set_initial_conditions() {
-  int i_start = (m_prank == 0 ? 0 : 1);
-  int i_end = (m_prank == m_psize - 1 ? m_local_m : m_local_m - 1);
+  std::array<int, 2> start = {0, 0};
+  std::array<int, 2> end = m_local_size;
+
+  for (int i = 0; i < 2; ++i) {
+    start[i] += (m_coords[i] == 0 ? 0 : 1);
+    end[i] -= (m_coords[i] == m_dims[i] - 1 ? 0 : 1);
+  }
+
+  float min = 0., max = 0.;
 
   float a = 5.5 * M_PI;
-  for (int i = i_start; i < i_end; i++) {
-    for (int j = 0; j < m_local_n; j++) {
-      m_f(i, j) = -2. * a * a *
-                  std::sin(a * (m_offset_m + i - i_start) * m_h_m) *
-                  std::sin(a * (m_offset_n + j) * m_h_n);
+  for (int i = start[0]; i < end[0]; i++) {
+    for (int j = start[1]; j < end[1]; j++) {
+      m_f(i, j) =
+          -2. * a * a * std::sin(a * (m_offset[0] + i - start[0]) * m_h[0]) *
+          std::sin(a * (m_offset[1] + j - start[1]) * m_h[1]) * m_h[0] * m_h[1];
+
+      min = std::min(min, m_f(i, j));
+      max = std::max(max, m_f(i, j));
     }
   }
+
+  m_dumper->set_min(min);
+  m_dumper->set_max(max);
+  m_dumper->dump(m_f, 1);
 }
 
 /* -------------------------------------------------------------------------- */
 std::tuple<float, int> Simulation::compute() {
   int s = 0;
-  float l2 = 0;
+
+  m_epsilon = m_epsilon * std::min(m_h[0], m_h[1]);
+
+  // std::cout << m_h_m << " " << m_h_n << " " << m_epsilon << std::endl;
+  float l2 = 0.;
+
+#pragma omp parallel default(shared)
+#pragma omp single
   do {
     l2 = compute_step();
+    DEBUG::print("do_step: " + std::to_string(l2));
 
     m_grids.swap();
 
-    m_dumper->dump(m_grids.old(), s);
+#if defined(_MPI)
+    MPI_Allreduce(MPI_IN_PLACE, &l2, 1, MPI_FLOAT, MPI_SUM, m_communicator);
+    DEBUG::print("MPI_Allreduce: " + std::to_string(l2));
+#endif
+
     ++s;
+
+    if (s == 100)
+      break;
   } while (l2 > m_epsilon);
 
+  m_dumper->set_min(min());
+  m_dumper->set_max(max());
+  m_dumper->dump(m_grids.old(), 0);
+
   return std::make_tuple(l2, s);
 }
 
 /* -------------------------------------------------------------------------- */
 void Simulation::set_epsilon(float epsilon) { m_epsilon = epsilon; }
 
 /* -------------------------------------------------------------------------- */
 float Simulation::epsilon() const { return m_epsilon; }
 
 /* -------------------------------------------------------------------------- */
 inline float Simulation::compute_row(int i) {
-  float l2 = 0;
+  float l2 = 0.;
 
   Grid & u = m_grids.current();
   Grid & uo = m_grids.old();
 
-  for (int j = 1; j < m_local_n - 1; j++) {
+  for (int j = 1; j < m_local_size[1] - 1; j++) {
     // computation of the new step
-    u(i, j) = 0.25 * (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) +
-                      uo(i, j + 1) - m_f(i, j) * m_h_m * m_h_n);
+    u(i, j) = (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) + uo(i, j + 1) -
+               m_f(i, j)) /
+              4.;
 
     // L2 norm
     l2 += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j));
   }
 
+  DEBUG::print(std::string{__PRETTY_FUNCTION__} + " - i=" + std::to_string(i));
+
   return l2;
 }
 
 /* -------------------------------------------------------------------------- */
 float Simulation::compute_step() {
-  float l2 = 0.;
-
-  Grid & uo = m_grids.old();
+  auto nthreads = nb_threads();
+  std::vector<float> l2(nthreads, 0);
+
+  auto & uo = m_grids.old();
+  auto & u = m_grids.current();
+
+  auto _compute = [&](auto && i, auto && j, auto && nthread) {
+    u(i, j) = (uo(i - 1, j) + uo(i + 1, j) + uo(i, j - 1) + uo(i, j + 1) -
+               m_f(i, j)) /
+              4.;
 
-  // start the permanent communications
-  uo.startSynchronization();
+    l2[nthread] += (uo(i, j) - u(i, j)) * (uo(i, j) - u(i, j));
+  };
 
+#pragma omp taskgroup
+  {
+    // start the permanent communications
+    uo.startSynchronization();
+
+#pragma omp taskloop grainsize(50)
   // computing the inner rows that do not depend on the ghosts
-  for (int i = 2; i < m_local_m - 2; i++) {
-    l2 += compute_row(i);
-  }
+    for (int i = 2; i < m_local_size[0] - 2; i++) {
+      auto nthread = thread_id();
+      for (int j = 2; j < m_local_size[1] - 2; j++) {
+        _compute(i, j, nthread);
+      }
+    }
 
-  /// wait to receive the ghosts before using them for them computation
-  uo.waitReceives();
+    /// wait to receive the ghosts before using them for the computation
+    uo.waitReceives();
+  }
 
+#pragma omp taskgroup
+  {
   // computing the line that depends on the ghosts
-  l2 += compute_row(1);
-  l2 += compute_row(m_local_m - 2);
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[1] - 2; i++) {
+        _compute(1, i, nthread);
+      }
+    }
+
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[1] - 2; i++) {
+        _compute(m_local_size[0] - 2, i, nthread);
+      }
+    }
+
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[0] - 2; i++) {
+        _compute(i, 1, nthread);
+      }
+    }
+
+#pragma omp task depend(out : l2)
+    {
+      auto nthread = thread_id();
+      for (int i = 2; i < m_local_size[0] - 2; i++) {
+        _compute(i, m_local_size[1] - 2, nthread);
+      }
+    }
+
+    /// wait to send everything before changing the buffers
+    uo.waitSends();
 
-  /// wait to send everything before changing the buffers
-  uo.waitSends();
+    // Summing the value of all the processors together
+#pragma omp task depend(in : l2)
+    {
+      m_l2_shared = 0.;
+      for (decltype(nthreads) i = 0; i < nthreads; ++i) {
+        m_l2_shared += l2[i];
+      }
+      DEBUG::print("Local reduce: " + std::to_string(m_l2_shared));
+    }
+  }
 
-  // Summing the value of all the processors together
-  MPI_Allreduce(MPI_IN_PLACE, &l2, 1, MPI_FLOAT, MPI_SUM, m_communicator);
+  return m_l2_shared;
+}
 
-  return l2;
+namespace {
+template <typename G, typename Func>
+auto accumulate_grid(const G & grid, float init, Func && func) {
+  auto m = grid.m();
+  auto n = grid.n();
+
+  for (decltype(m) i = 1; i < m - 1; i++) {
+    for (decltype(n) j = 1; j < n - 1; j++) {
+      init = func(init, grid(i, j));
+    }
+  }
+  return init;
+}
+} // namespace
+
+/* -------------------------------------------------------------------------- */
+float Simulation::min() const {
+  const auto & u0 = m_grids.old();
+
+  auto tmp = accumulate_grid(
+      u0, u0(1, 1), [](auto && a, auto && b) { return std::min(a, b); });
+
+#if defined(_MPI)
+  MPI_Allreduce(MPI_IN_PLACE, &tmp, 1, MPI_FLOAT, MPI_MIN, m_communicator);
+#endif
+
+  return tmp;
 }
+
+/* -------------------------------------------------------------------------- */
+float Simulation::max() const {
+  const auto & u0 = m_grids.old();
+
+  auto tmp = accumulate_grid(
+      u0, u0(1, 1), [](auto && a, auto && b) { return std::max(a, b); });
+
+#if defined(_MPI)
+  MPI_Allreduce(MPI_IN_PLACE, &tmp, 1, MPI_FLOAT, MPI_MAX, m_communicator);
+#endif
+
+  return tmp;
+}
+
+/* -------------------------------------------------------------------------- */
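Note on the decomposition arithmetic in the Simulation constructor: along each dimension, N cells are split over P ranks with the remainder handed to the first N % P ranks, and the offset follows the same rule before the ghost layers are added. A standalone helper showing exactly that formula (the struct and function names are illustrative, not the patch's):

#include <cassert>

struct Block {
  int size;   // number of owned cells along this dimension
  int offset; // index of the first owned cell in the global grid
};

inline Block split_1d(int N, int P, int coord) {
  assert(P > 0 && coord >= 0 && coord < P);
  Block b;
  b.size = N / P + (coord < N % P ? 1 : 0);
  b.offset = (N / P) * coord + (coord < N % P ? coord : N % P);
  return b;
}

// Example: N = 10, P = 4 gives sizes {3, 3, 2, 2} and offsets {0, 3, 6, 8};
// the constructor then adds 1 ghost layer on edge ranks and 2 in the middle.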
diff --git a/simulation.hh b/simulation.hh
index 1dfe321..e223819 100644
--- a/simulation.hh
+++ b/simulation.hh
@@ -1,76 +1,94 @@
 #ifndef SIMULATION_HH
 #define SIMULATION_HH
 
 /* -------------------------------------------------------------------------- */
 #include "double_buffer.hh"
 #include "dumpers.hh"
 #include "grid.hh"
 /* -------------------------------------------------------------------------- */
-#include
-#include
 #include <memory>
+#include <array>
+#include <tuple>
+#if defined(_MPI)
 #include <mpi.h>
+#endif
 /* -------------------------------------------------------------------------- */
 
 /* -------------------------------------------------------------------------- */
 class Simulation {
 public:
+#if defined(_MPI)
   Simulation(int m, int n, MPI_Comm communicator);
-
+#else
+  Simulation(int m, int n);
+#endif
 
   /// set the initial conditions, Dirichlet and source term
   virtual void set_initial_conditions();
 
   /// perform the simulation
   std::tuple<float, int> compute();
 
   /// access the precision
   void set_epsilon(float epsilon);
   float epsilon() const;
 
+  std::size_t complexity() { return 8 * m_global_size[0] * m_global_size[1]; }
+
+  float min() const;
+  float max() const;
+
+  float dims(std::size_t i) const { return m_dims[i]; }
+
 protected:
   /// compute one step and return an error
   virtual float compute_step();
 
   /// compute one row
   float compute_row(int i);
 
 private:
   /// Global problem size
-  int m_global_m, m_global_n;
+  std::array<int, 2> m_global_size;
 
   /// Local problem size
-  int m_local_m, m_local_n;
+  std::array<int, 2> m_local_size;
 
   /// local offset
-  int m_offset_m, m_offset_n;
+  std::array<int, 2> m_offset;
 
-  /// local neighbors
-  int m_north_prank, m_south_prank;
+  /// Cartesian dimensions
+  std::array<int, 2> m_dims;
+
+  /// Cartesian coordinates
+  std::array<int, 2> m_coords;
 
   /// proc rank
-  int m_prank;
+  int m_prank{0};
 
   /// communicator size
-  int m_psize;
+  int m_psize{1};
 
   /// Precision to achieve
   float m_epsilon;
 
   /// grid spacing
-  float m_h_m;
-  float m_h_n;
+  std::array<float, 2> m_h;
+
+  float m_l2_shared;
 
   /// Grids storage
   DoubleBuffer m_grids;
 
   /// source term
   Grid m_f;
 
   /// Dumper to use for outputs
   std::unique_ptr<Dumper> m_dumper;
 
+#if defined(_MPI)
   /// communicator
   MPI_Comm m_communicator;
+#endif
 };
 
 #endif /* SIMULATION_HH */
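Note on the update performed by compute_row() and the _compute lambda: because set_initial_conditions() folds the m_h[0] * m_h[1] factor into m_f, one Jacobi relaxation step reduces to a four-point average minus the pre-scaled source, and the squared increment accumulated into l2 is what compute() compares against the (rescaled) epsilon after the MPI_Allreduce. A serial reference version of that kernel, assuming row-major storage (names are illustrative, not the patch's):

#include <vector>

// One Jacobi sweep over the interior of an m x n grid, with the h*h factor
// already folded into f_scaled, as in the patch. Returns the squared L2 norm
// of the change between the old and new iterates.
inline float jacobi_sweep(const std::vector<float> & uo, std::vector<float> & u,
                          const std::vector<float> & f_scaled, int m, int n) {
  float l2 = 0.f;
  for (int i = 1; i < m - 1; ++i) {
    for (int j = 1; j < n - 1; ++j) {
      u[i * n + j] = (uo[(i - 1) * n + j] + uo[(i + 1) * n + j] +
                      uo[i * n + j - 1] + uo[i * n + j + 1] -
                      f_scaled[i * n + j]) /
                     4.f;
      float d = uo[i * n + j] - u[i * n + j];
      l2 += d * d; // squared increment, summed over ranks before the test
    }
  }
  return l2;
}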