diff --git a/CHANGELOG.md b/CHANGELOG.md
index b78892e..b717aa3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,199 +1,201 @@
 # Changelog
 
 All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) for final versions and [PEP440](https://www.python.org/dev/peps/pep-0440/) in case intermediate versions need to be released (e.g. development version `2.2.3.dev1` or release candidates `2.2.3rc1`), or individual commits are packaged.
 
 ## Unreleased
 
 ### Added
 
 - Added `read()` method to dumpers to create a model from a dump file
 - `getClusters()` can be called in MPI contact with partial contact maps
 - Added a JSON encoder class for models and a JSON dumper
 - CUDA compatibility is re-established, but has not been tested
 
 ### Changed
 
 - Tamaas version numbers are now managed by
   [versioneer](https://github.com/python-versioneer/python-versioneer). This
   means that Git tags prefixed with `v` (e.g. `v2.2.3`) carry meaning and
   determine the version. When no tag is set, versioneer uses the last tag,
   specifies the commit short hash and the distance to the last tag (e.g.
   `2.2.2+33.ge314b0e`). This version string is used in the compiled library, the
   `setup.py` script and the `__version__` variable in the python module.
 
 ### Fixed
 
 - Fixed a host of dump read/write issues when model type was not `volume_*d`.
   Dumper tests are now streamlined and systematic.
 - Fixed a bug where `Model::solveDirichlet` would not compute correctly
+- Fixed a bug where `Statistics::contact` would not normalize by the global
+  number of surface points
 
 ## v2.2.2 -- 2021-04-02
 
 ### Added
 
 - Entry-point `tamaas` defines a grouped CLI for `examples/pipe_tools`. Try
   executing `tamaas surface -h` from the command-line!
 
 ### Changed
 
 - `CXXFLAGS` are now passed to the linker
 - Added this changelog
 - Using absolute paths for environmental variables when running `scons test`
 - Reorganized documentation layout
 - Gave the build system a facelift (docs are now generated directly with SCons
   instead of a Makefile)
 
 ### Deprecated
 
 - Python 2 support is discontinued. Version `v2.2.1` is the last PyPi build with
   a Python 2 wheel.
 - The scripts in `examples/pipe_tools` have been replaced by the `tamaas` command
 
 ### Fixed
 
 - `UVWDumper` no longer imports `mpi4py` in sequential
 - Compiling with different Thrust/FFTW backends
 
 
 ## v2.2.1 -- 2021-03-02
 
 ### Added
 
 - Output registered fields and dumpers in `print(model)`
 - Added `operator[]` to the C++ model class (for fields)
 - Added `traction` and `displacement` properties to Python model bindings
 - Added `operators` property to Python model bindings, which provides a
   dict-like access to registered operators
-- Added `shape` and `spectrum` to properties to Python surface generator
+- Added `shape` and `spectrum` properties to Python surface generator
   bindings
 - Surface generator constructor accepts surface global shape as argument
 - Choice of FFTW thread model
 
 ### Changed
 
 - Tests use `/tmp` for temporary files
 - Updated dependency versions (Thrust, Pybind11)
 
 ### Deprecated
 
 - Most `get___()` and `set___()` in Python bindings have been deprecated. They
   will generate a `DeprecationWarning`.
 
 ### Removed
 
 - All legacy code
 
 
 ## v2.2.0 -- 2020-12-31
 
 ### Added
 
 - More accurate function for computation of contact area
 - Function to compute deviatoric of tensor fields
 - MPI implementation
 - Convenience `hdf5toVTK` function
 - Readonly properties `shape`, `global_shape`, `boundary_shape` on model to give
   shape information
 
 ### Changed
 
 - Preprocessor defined macros are prefixed with `TAMAAS_`
 - Moved `tamaas.to_voigt` to `tamaas.compute.to_voigt`
 
 ### Fixed
 
 - Warning about deprecated constructors with recent GCC versions
 - Wrong computation of grid strides
 - Wrong computation of grid sizes in views
 
 
 ## v2.1.4 -- 2020-08-07
 
 ### Added
 
 - Possibility to generate a static `libTamaas`
 - C++ implementation of DFSANE solver
 - Allowing compilation without OpenMP
 
 ### Changed
 
 - NetCDF dumper writes frames to a single file
 
 ### Fixed
 
 - Compatibility with SCons+Python 3
 
 ## v2.1.3 -- 2020-07-27
 
 ### Added
 
 - Version number to `TamaasInfo`
 
 ### Changed
 
 - Prepending root directory when generating archive
 
 
 ## v2.1.2 -- 2020-07-24
 
 This release changes some core internals related to discrete Fourier transforms
 for future MPI support.
 
 ### Added
 
 - Caching `CXXFLAGS` in SCons build
 - SCons shortcut to create code archive
 - Test of the elastic-plastic contact solver
 - Paraview data dumper (`.pvd` files)
 - Compression for UVW dumper
 - `__contains__` and `__iter__` Python bindings of model
 - Warning message of possible overflow in Kelvin
 
 ### Changed
 
 - Simplified `tamaas_info.cpp`, particularly the diff part
 - Using a new class `FFTEngine` to manage discrete Fourier transforms. Plans are
   re-used as much as possible with different data with the same shape. This is
   in view of future MPI developments
 - Redirecting I/O streams in solve functions so they can be used from Python
   (e.g. in Jupyter notebooks)
 - Calling `initialize()` and `finalize()` is no longer necessary
 
 ### Fixed
 
 - Convergence issue with non-linear solvers
 - Memory error in volume potentials
 
 
 ## v2.1.1 -- 2020-04-22
 
 ### Added
 
 - SCons shortcut to run tests
 
 ### Fixed
 
 - Correct `RPATH` for shared libraries
 - Issues with SCons commands introduced in v2.1.0
 - Tests with Python 2.7
 
 
 ## v2.1.0 -- 2020-04-17
 
 ### Added
 
 - SCons shortcuts to build/install Tamaas and its components
 - Selection of integration method for Kelvin operator
 - Compilation option to remove the legacy part of Tamaas
 - NetCDF dumper
 
 ### Fixed
 
 - Link bug with clang
 - NaNs in Kato saturated solver
 
 ## v2.0.0 -- 2019-11-11
 
 First public release. Contains relatively mature elastic-plastic contact code.
diff --git a/tests/SConscript b/tests/SConscript
index 737f1ef..39a9cd4 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -1,215 +1,219 @@
 # -*- mode:python; coding: utf-8 -*-
 # vim: set ft=python:
 
 # @file
 # LICENSE
 #
 # Copyright (©) 2016-2021 EPFL (École Polytechnique Fédérale de Lausanne),
 # Laboratory (LSMS - Laboratoire de Simulation en Mécanique des Solides)
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as published
 # by the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Affero General Public License for more details.
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from __future__ import print_function
 
 from SCons.Script import Split, Copy, Dir, Import
 from detect import FindGTest, FindPybind11
 
 
 # ------------------------------------------------------------------------------
 def copyComStr(env, main):
     if 'SHCXXCOMSTR' in main:
         env['CXXCOMSTR'] = main['SHCXXCOMSTR']
     if 'SHLINKCOMSTR' in main:
         env['LINKCOMSTR'] = main['SHLINKCOMSTR']
 
 
 # ------------------------------------------------------------------------------
 def make_python_tests(env):
     """Copy python tests to build directory"""
     test_env = env.Clone()
 
     test_files = Split("""
     test_hertz.py
     test_westergaard.py
     test_patch_westergaard.py
     test_patch_plasticity.py
     test_surface.py
     test_hertz_disp.py
     test_hertz_kato.py
     test_saturated_pressure.py
     test_flood_fill.py
     test_integral_operators.py
     test_dumper.py
     test_tangential.py
     test_boussinesq_surface.py
     test_voigt.py
     test_memory.py
     test_epic.py
     fftfreq.py
     conftest.py
     pytest.ini
     """)
 
     if env['use_mpi']:
         test_files += ['test_mpi_routines.py', 'mpi_routines.py']
 
     src_dir = "#/tests"
 
     targets = [
         test_env.Command(file, test_env.File(file, src_dir),
                          Copy("$TARGET", "$SOURCE"))
         for file in test_files
     ]
 
     test_env = env.Clone(tools=[pybind11])
     # Helper module for integral operators
     test_env['SHLIBPREFIX'] = ''
     test_env.PrependUnique(LIBS=['Tamaas'])
     register = test_env.Pybind11Module(
         target="register_integral_operators",
         source=["register_integral_operators.cpp"])
 
     Import('libTamaas')
     test_env.Depends(register, libTamaas)
     targets.append(register)
 
     return targets
 
 
 # ------------------------------------------------------------------------------
 def compile_google_test(env, gtest_path):
     gtest_obj = env.Object('gtest.o',
                            [env.File("src/gtest-all.cc", gtest_path)])
     return env.StaticLibrary('gtest', gtest_obj)
 
 
 # ------------------------------------------------------------------------------
 def make_google_tests(env):
     gtest_dir = Dir(env['GTEST_ROOT'])
 
     gtest_env = env.Clone(CPPPATH=[gtest_dir],
                           CXXFLAGS=['-pthread', '-isystem',
                                     env.Dir('include', gtest_dir).path])
 
     FindGTest(gtest_env)
 
     libgtest = None
-    # Hugly hack to detect if we need to compile gtest submodule
+    # Ugly hack to detect if we need to compile gtest submodule
     if env['GTEST_ROOT'] == '#third-party/googletest/googletest':
         gtest_path = str(gtest_dir)
         libgtest = compile_google_test(gtest_env, gtest_path)
 
     env.AppendUnique(CXXFLAGS=gtest_env['CXXFLAGS'])
     env.PrependUnique(LIBS=['Tamaas', env.subst('python${py_version}')])
 
     google_test_files = Split("""
-                              test_fft.cpp
                               test_grid.cpp
                               test_loop.cpp
                               test_model.cpp
                               test_static_types.cpp
                               test_integration.cpp
                               """)
 
+    if env['use_fftw']:
+        google_test_files += ['test_fftw.cpp']
+    if env['use_cuda']:
+        google_test_files += ['test_cufft.cpp']
+
     # Necessary for the tests that use pybind11 calls to python
     uses = []
     if env['build_python']:
         google_test_files.append('test_fftfreq.cpp')
         uses = ['TAMAAS_USE_PYTHON']
 
     if env['use_mpi']:
         google_test_files.append('test_mpi.cpp')
 
     defines = env['CPPDEFINES']
     if type(defines) is not list:
         defines = [defines]
 
     gtest_main = env.Object("tamaas_gtest_main.o", 'tamaas_gtest_main.cc',
                             CPPDEFINES=defines + uses)
     gtest_all = env.Program('test_gtest_all', google_test_files + [gtest_main],
                             LIBS=(env['LIBS'] + ['gtest']))
     Import('libTamaas')
     env.Depends(gtest_all, libTamaas)
     env.Depends(gtest_all, libgtest)
 
     return [gtest_all]
 
 
 # ------------------------------------------------------------------------------
 def make_bare_tests(env):
     rough = env.Program("test_rough_surface.cpp")
     Import('libTamaas')
     env.Depends(rough, libTamaas)
 
     return [rough]
 
 
 # ------------------------------------------------------------------------------
 Import('main_env')
 
 # Setup of test environment
 test_env = main_env.Clone()
 test_env.AppendUnique(
     LIBPATH=['.', '../src'],
     RPATH=["'$$$$ORIGIN/../src'"]
 )
 test_env.PrependUnique(LIBS=['Tamaas'])
 
 # Building tests that do not require any third party
 targets = make_bare_tests(test_env)
 
-# Build tests that required python bindings
+# Build tests that require python bindings
 if test_env['build_python']:
     FindPybind11(test_env)
     test_env.Tool(pybind11)
     test_env.ParseConfig("${py_exec}-config --ldflags")
     test_env['CCFLAGS'] = []
 
     targets += make_python_tests(test_env)
 
 # Building google tests
 if test_env['use_googletest']:
     targets += make_google_tests(test_env)
     targets.append(test_env.Command('test_gtest.py', '#tests/test_gtest.py',
                                     Copy('$TARGET', '$SOURCE')))
 
 # Target alias to build tests
 main_env.Alias('build-tests', targets)
 
 # Check if pytest is installed
 conf = Configure(test_env,
                  custom_tests={'CheckPythonModule': CheckPythonModule})
 
 has_pytest = conf.CheckPythonModule('pytest')
 conf.Finish()
 
 # Define a command to execute tests
 if has_pytest:
     pytest_env = test_env.Clone()
     test_env['pythonpath'] = '${build_dir}/python'
     test_env['ld_library_path'] = '${build_dir}/src'
     pytest_env.PrependENVPath('PYTHONPATH', '${pythonpath.abspath}')
     pytest_env.PrependENVPath('LD_LIBRARY_PATH', '${ld_library_path.abspath}')
     # Setting a moderate thread number
     pytest_env['ENV']['OMP_NUM_THREADS'] = "1"
     pytest_env['PYTESTOPTS'] = ['${"-v" if verbose else "-q"}']
     test_target = pytest_env.Command(
         '.phony_test', targets,
         '${py_exec} -m pytest $PYTESTOPTS ${TARGET.dir}')
     main_env.Alias('test', test_target)
 
 else:
     # We still define a target here so that `scons test` still works
     dummy_command(main_env, 'test',
                   'Cannot run tests: pytest is not installed')
diff --git a/tests/test.hh b/tests/test.hh
index c36cc76..18c83c9 100644
--- a/tests/test.hh
+++ b/tests/test.hh
@@ -1,70 +1,72 @@
 /**
  *
  * @author Lucas Frérot <lucas.frerot@epfl.ch>
  *
  * LICENSE
  *
  * Copyright (©)  2017 EPFL  (Ecole Polytechnique  Fédérale de
  * Lausanne)  Laboratory (LSMS  -  Laboratoire de  Simulation  en Mécanique  des
  * Solides)
  *
  * Tamaas is free  software: you can redistribute it and/or  modify it under the
  * terms  of the  GNU Lesser  General Public  License as  published by  the Free
  * Software Foundation, either version 3 of the License, or (at your option) any
  * later version.
  *
  * Tamaas is  distributed in the  hope that it  will be useful, but  WITHOUT ANY
  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  * A  PARTICULAR PURPOSE. See  the GNU  Lesser General  Public License  for more
  * details.
  *
  * You should  have received  a copy  of the GNU  Lesser General  Public License
  * along with Tamaas. If not, see <http://www.gnu.org/licenses/>.
  *
  */
 /* -------------------------------------------------------------------------- */
 #include "fftw/interface.hh"
 #include "tamaas.hh"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <array>
-#include <numeric>
 #include <limits>
+#include <numeric>
 #include <thrust/complex.h>
 #include <utility>
 
 template <typename T, typename U>
 bool compare(T&& a, U&& b) {
   return std::mismatch(std::begin(a), std::end(a), std::begin(b)) ==
          std::make_pair(std::end(a), std::end(b));
 }
 
 template <typename T, typename U, typename Pred>
 bool compare(T&& a, U&& b, Pred&& pred) {
   return std::mismatch(std::begin(a), std::end(a), std::begin(b), pred) ==
          std::make_pair(std::end(a), std::end(b));
 }
 
 struct AreFloatEqual {
   tamaas::Real tolerance = 4 * std::numeric_limits<tamaas::Real>::epsilon();
   template <typename T, typename U>
   inline bool operator()(T&& x, U&& y) {
     // T y = reinterpret_cast<T>(z);
     tamaas::Real abs_max = std::max<std::decay_t<T>>(std::abs(x), std::abs(y));
     abs_max = std::max(abs_max, tamaas::Real(1.));
     return std::abs(x - y) <= (tolerance * abs_max);
   }
 };
 
 struct AreComplexEqual {
   tamaas::Real tolerance = 4 * std::numeric_limits<tamaas::Real>::epsilon();
+#ifdef TAMAAS_USE_FFTW
   inline bool operator()(const tamaas::Complex& x,
                          const fftw::helper<tamaas::Real>::complex& y) {
     auto equal = AreFloatEqual{tolerance};
     return equal(x.real(), y[0]) && equal(x.imag(), y[1]);
   }
+#endif  // TAMAAS_USE_FFTW
   inline bool operator()(const tamaas::Complex& x, const tamaas::Complex& y) {
     auto equal = AreFloatEqual{tolerance};
     return equal(x.real(), y.real()) && equal(x.imag(), y.imag());
   }
 };
diff --git a/tests/test_cufft.cpp b/tests/test_cufft.cpp
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_fft.cpp b/tests/test_fftw.cpp
similarity index 95%
rename from tests/test_fft.cpp
rename to tests/test_fftw.cpp
index 9333dfa..89f385c 100644
--- a/tests/test_fft.cpp
+++ b/tests/test_fftw.cpp
@@ -1,259 +1,237 @@
 /**
  *  @file
  *  LICENSE
  *
  *  Copyright (©) 2016-2021 EPFL (École Polytechnique Fédérale de Lausanne),
  *  Laboratory (LSMS - Laboratoire de Simulation en Mécanique des Solides)
  *
  *  This program is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU Affero General Public License as published
  *  by the Free Software Foundation, either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU Affero General Public License for more details.
  *
  *  You should have received a copy of the GNU Affero General Public License
  *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
  *
  */
 /* -------------------------------------------------------------------------- */
 #include "fftw/fftw_engine.hh"
 #include "grid.hh"
 #include "grid_hermitian.hh"
 #include "grid_view.hh"
 #include "test.hh"
 
 using namespace tamaas;
 using fft = fftw::helper<Real>;
 
 /* -------------------------------------------------------------------------- */
 
 template <typename T>
 struct span {
   T* ptr;
   std::size_t size;
 
   ~span() { fftw::free(ptr); }
   const T* begin() const { return ptr; }
   const T* end() const { return ptr + size; }
   T* begin() { return ptr; }
   T* end() { return ptr + size; }
 
   operator T*() { return ptr; }
 };
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTEngine, FFT1D) {
   constexpr UInt size = 1000;
   FFTWEngine engine;
 
   span<Real> data{fft::alloc_real(size), size};
   span<fft::complex> solution{fft::alloc_complex(size / 2 + 1), size / 2 + 1};
 
   fftw::plan<Real> solution_plan{
       fftw::plan_1d_forward(size, data, solution, engine.flags())};
 
   std::iota(data.begin(), data.end(), 0);
   fftw::execute(solution_plan);
 
   Grid<Real, 1> grid({size}, 1);
   GridHermitian<Real, 1> result({size / 2 + 1}, 1);
 
   std::iota(grid.begin(), grid.end(), 0);
   engine.forward(grid, result);
 
-#ifdef TAMAAS_USE_CUDA
-  cudaDeviceSynchronize();
-#endif
-
   ASSERT_TRUE(compare(result, solution, AreComplexEqual()))
       << "1D FFTW transform failed";
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFT2D) {
   constexpr UInt size = 100;
   constexpr UInt rsize = size * size;
   constexpr UInt csize = size * (size / 2 + 1);
   FFTWEngine engine;
 
   span<Real> data{fft::alloc_real(rsize), rsize};
   span<fft::complex> solution{fft::alloc_complex(csize), csize};
 
   fftw::plan<Real> solution_plan{
       fftw::plan_2d_forward(size, size, data, solution, engine.flags())};
 
   std::iota(data.begin(), data.end(), 0);
   fftw::execute(solution_plan);
 
   Grid<Real, 2> grid({size, size}, 1);
   GridHermitian<Real, 2> result({size, size / 2 + 1}, 1);
 
   std::iota(grid.begin(), grid.end(), 0);
   engine.forward(grid, result);
 
-#ifdef TAMAAS_USE_CUDA
-  cudaDeviceSynchronize();
-#endif
-
   ASSERT_TRUE(compare(result, solution, AreComplexEqual()))
       << "2D FFTW transform failed";
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFT2DBackwards) {
   const std::ptrdiff_t N0 = 20, N1 = 20;
 
   Grid<Real, 2> real({N0, N1}, 1);
   GridHermitian<Real, 2> spectral({N0, N1 / 2 + 1}, 1);
 
   real = 1.;
 
   FFTWEngine engine;
 
   engine.forward(real, spectral);
   real = 0;
   engine.backward(real, spectral);
 
   Grid<Real, 2> reference({N0, N1}, 1);
 
   reference = 1.;
 
   ASSERT_TRUE(compare(real, reference, AreFloatEqual()));
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFT1D2Comp) {
   constexpr UInt size = 20;
 
   /// 1D single component FFT should be working here
   Grid<Real, 1> grid({size}, 2), data({size}, 1);
   std::iota(grid.begin(), grid.end(), 0);
   std::iota(data.begin(), data.end(), 0);
   GridHermitian<Real, 1> result({size / 2 + 1}, 2), solution({size / 2 + 1}, 1);
 
   FFTWEngine engine;
 
   engine.forward(grid, result);
-#ifdef TAMAAS_USE_CUDA
-  cudaDeviceSynchronize();
-#endif
 
   std::iota(data.begin(), data.end(), 0);
   data *= 2;
   engine.forward(data, solution);
 
   const Real tol = 200 * std::numeric_limits<Real>::epsilon();
 
   ASSERT_TRUE(
       compare(make_component_view(result, 0), solution, AreComplexEqual{tol}))
       << "1D FFTW transform with 2 components failed on 1st component";
 
   data += 1;
   engine.forward(data, solution);
 
   ASSERT_TRUE(
       compare(make_component_view(result, 1), solution, AreComplexEqual{tol}))
       << "1D FFTW transform with 2 components failed on 2nd component";
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFT2D3Comp) {
   constexpr UInt size = 20;
 
   /// 2D single component FFT should be working here
   Grid<Real, 2> grid({size, size}, 3), data({size, size}, 1);
   std::iota(grid.begin(), grid.end(), 0);
   std::iota(data.begin(), data.end(), 0);
   data *= 3;
 
   GridHermitian<Real, 2> result({size, size / 2 + 1}, 3),
       solution({size, size / 2 + 1}, 1);
 
   FFTWEngine engine;
 
   engine.forward(grid, result);
 
-#ifdef TAMAAS_USE_CUDA
-  cudaDeviceSynchronize();
-#endif
-
   constexpr Real tol = 5000 * std::numeric_limits<Real>::epsilon();
 
   for (UInt i = 0; i < 3; ++i) {
     engine.forward(data, solution);
     ASSERT_TRUE(
         compare(make_component_view(result, i), solution, AreComplexEqual{tol}))
         << "2D FFTW transform with 3 components failed on " << i
         << "th component";
     data += 1;
   }
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFT2DViewTransform) {
   constexpr UInt size = 20;
 
   Grid<Real, 2> data({size, size}, 1);
   GridHermitian<Real, 2> solution({size, size / 2 + 1}, 1);
   std::iota(std::begin(data), std::end(data), 0);
 
   FFTWEngine engine;
   engine.forward(data, solution);
 
   Grid<Real, 2> grid({size, size}, 3);
   auto view = make_component_view(grid, 1);
   std::iota(view.begin(), view.end(), 0);
 
   GridHermitian<Real, 2> result({size, size / 2 + 1}, 1);
   engine.forward(view, result);
 
   constexpr Real tol = 5000 * std::numeric_limits<Real>::epsilon();
 
   ASSERT_TRUE(compare(result, solution, AreComplexEqual{tol}))
       << "Fourier transform on component view fail";
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFTI1D2Comp) {
   constexpr UInt size = 20;
 
   Grid<Real, 1> grid({size}, 2);
   std::iota(grid.begin(), grid.end(), 0);
   GridHermitian<Real, 1> grid_hermitian({size / 2 + 1}, 2);
   Grid<Real, 1> result({size}, 2);
 
   FFTWEngine engine;
   engine.forward(grid, grid_hermitian);
   engine.backward(result, grid_hermitian);
 
-#ifdef TAMAAS_USE_CUDA
-  cudaDeviceSynchronize();
-#endif
-
   ASSERT_TRUE(compare(grid, result, AreFloatEqual()))
       << "1D FFTI transform with 2 components failed";
 }
 
 /* -------------------------------------------------------------------------- */
 TEST(TestFFTWEngine, FFTI2D3Comp) {
   constexpr UInt size = 20;
 
   Grid<Real, 2> grid({size, size}, 3);
   std::iota(grid.begin(), grid.end(), 0);
   GridHermitian<Real, 2> grid_hermitian({size, size / 2 + 1}, 3);
   Grid<Real, 2> result({size, size}, 3);
 
   FFTWEngine engine;
   engine.forward(grid, grid_hermitian);
   engine.backward(result, grid_hermitian);
-#ifdef TAMAAS_USE_CUDA
-  cudaDeviceSynchronize();
-#endif
 
   ASSERT_TRUE(compare(grid, result, AreFloatEqual()))
       << "2D FFTI transform with 3 components failed";
 }
diff --git a/tests/test_loop.cpp b/tests/test_loop.cpp
index 3e851ef..e9185ad 100644
--- a/tests/test_loop.cpp
+++ b/tests/test_loop.cpp
@@ -1,374 +1,401 @@
 /**
  *  @file
  *  LICENSE
  *
  *  Copyright (©) 2016-2021 EPFL (École Polytechnique Fédérale de Lausanne),
  *  Laboratory (LSMS - Laboratoire de Simulation en Mécanique des Solides)
  *
  *  This program is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU Affero General Public License as published
  *  by the Free Software Foundation, either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU Affero General Public License for more details.
  *
  *  You should have received a copy of the GNU Affero General Public License
  *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
  *
  */
 /* -------------------------------------------------------------------------- */
 #include "grid.hh"
 #include "grid_view.hh"
 #include "loop.hh"
 #include "mpi_interface.hh"
 #include "static_types.hh"
 #include "test.hh"
 /* -------------------------------------------------------------------------- */
 /* WARNING: here we cannot use lambdas for tests because GoogleTest declares  */
 /* test functions as private members of classes which is incompatible with    */
 /* cuda's extended lambdas. I know... it's f*cking stupid                     */
 /* -------------------------------------------------------------------------- */
 
 using namespace tamaas;
 
 template <typename T>
 struct AddOneInplace {
   CUDA_LAMBDA void operator()(T& x) { x += 1; }
 };
 
 // Testing loops on one grid
 TEST(TestLoops, OneArgument) {
   Grid<Real, 1> grid({20}, 1);
   Grid<Real, 1> solution({20}, 1);
 
-  auto add_one = [](Real& x) { return x + 1; };
+  auto add_one = [](auto x) { return x + 1; };
 
   std::iota(grid.begin(), grid.end(), 1);
 
-  // Makeing solution
+  // Making solution
   std::transform(grid.begin(), grid.end(), solution.begin(), add_one);
 
   auto add_one_inplace = AddOneInplace<Real>();
 
   Loop::loop(add_one_inplace, grid);
 
   ASSERT_TRUE(compare(grid, solution, AreFloatEqual()))
       << "One argument loop failed";
 }
 
 struct PrimalTest {
   CUDA_LAMBDA void operator()(Int& primal, Int& val) {
     val = (primal > 0) ? -1 : 1;
   }
 };
 
 // Testing loops on two grids
 TEST(TestLoops, TwoArguments) {
   // Why no ints?
   Grid<Int, 2> grid({20, 20}, 1);
   Grid<Int, 2> primal({20, 20}, 1);
   Grid<Int, 2> solution({20, 20}, 1);
 
   primal(0, 0) = 1;
   primal(0, 1) = 1;
   primal(1, 0) = 1;
   primal(1, 1) = 1;
 
   std::transform(primal.begin(), primal.end(), solution.begin(),
                  [](Int& primal) { return (primal > 0) ? -1 : 1; });
 
   auto primal_test = PrimalTest();
 
   Loop::loop(primal_test, primal, grid);
 
   ASSERT_TRUE(compare(solution, grid)) << "Two argument loop failed";
 }
 
 struct AssignUInt {
   CUDA_LAMBDA void operator()(UInt& x, UInt i) { x = i; }
 };
 
 // Testing an enumeration
 TEST(TestLoops, Enumeration) {
   Grid<UInt, 1> grid({100}, 1);
   Grid<UInt, 1> solution({100}, 1);
   std::iota(solution.begin(), solution.end(), 0);
 
   auto assign_uint = AssignUInt();
 
   Loop::loop(assign_uint, grid, Loop::range(100));
 
   ASSERT_TRUE(compare(solution, grid)) << "Enumeration loop failed";
 }
 
 /* -------------------------------------------------------------------------- */
 
 struct Identity {
   CUDA_LAMBDA UInt operator()(UInt& x) const { return x; }
 };
 
 // Testing one grid reductions
 TEST(TestReductions, OneArgument) {
   Grid<UInt, 1> grid({6}, 1);
   std::iota(grid.begin(), grid.end(), 1);
 
   const auto id = Identity();
 
   // Sum reduction
   UInt sol = mpi::allreduce<operation::plus>(
-      std::accumulate(grid.begin(), grid.end(), 0, std::plus<>()));
+      std::accumulate(grid.begin(), grid.end(), UInt{0}, std::plus<>()));
   UInt red = Loop::reduce<operation::plus>(id, grid);
   ASSERT_TRUE(sol == red) << "Addition reduction failed on one argument";
 
   // Product reduction
   sol = mpi::allreduce<operation::times>(
-      std::accumulate(grid.begin(), grid.end(), 1, std::multiplies<>()));
+      std::accumulate(grid.begin(), grid.end(), UInt{1}, std::multiplies<>()));
   red = Loop::reduce<operation::times>(id, grid);
   ASSERT_TRUE(sol == red) << "Multiplication reduction failed on one argument";
 
   // Min reduction
   sol = mpi::allreduce<operation::min>(
       *std::min_element(grid.begin(), grid.end()));
   red = Loop::reduce<operation::min>(id, grid);
   ASSERT_TRUE(sol == red) << "Min reduction failed on one argument";
 
   // Max reduction
   sol = mpi::allreduce<operation::max>(
       *std::max_element(grid.begin(), grid.end()));
   red = Loop::reduce<operation::max>(id, grid);
   ASSERT_TRUE(sol == red) << "Max reduction failed on one argument";
 }
 
+struct AssignReduce {
+  CUDA_LAMBDA UInt operator()(UInt& x, UInt i) {
+    x = i;     // store the second argument into the referenced entry
+    return x;  // hand the stored value back as the reduction term
+  }
+};
+
 TEST(TestReductions, ReduceAndTransform) {
   UInt n = 20;
   Grid<UInt, 1> grid({n}, 1), solution({n}, 1);
   std::iota(solution.begin(), solution.end(), 0);
   UInt sum_value = mpi::allreduce<operation::plus>((n - 1) * n / 2);
 
-  UInt res = Loop::reduce<operation::plus>(
-      [](UInt& x, UInt i) {
-        x = i;
-        return x;
-      },
-      grid, Loop::range(n));
+  auto assign_reduce = AssignReduce{};
+  UInt res =
+      Loop::reduce<operation::plus>(assign_reduce, grid, Loop::range(n));
 
   EXPECT_EQ(res, sum_value) << "Reduction failed";
   EXPECT_TRUE(compare(grid, solution)) << "Assign failed";
 }
 
 struct PrimalReduce {
   CUDA_LAMBDA UInt operator()(UInt& p, UInt& val) { return (p > 0) ? val : 0; }
 };
 
 TEST(TestReductions, TwoArguments) {
   Grid<UInt, 1> grid({20}, 1);
   Grid<UInt, 1> primal({20}, 1);
   grid = 1;
   primal(0) = 1;
   primal(1) = 1;
 
   auto primal_reduce = PrimalReduce();
 
   // Reduce on values where primal > 0
   UInt red = Loop::reduce<operation::plus>(primal_reduce, primal, grid);
   ASSERT_TRUE(red == mpi::allreduce<operation::plus>(UInt{2}))
       << "Two args reduction failed";
 }
 
 /* -------------------------------------------------------------------------- */
 
 TEST(TestRange, type_trait) {
   Grid<UInt, 1> grid({1}, 1);
   auto gridrange = range<VectorProxy<UInt, 1>>(grid);
 
   static_assert(decltype(gridrange)::is_valid_container<Grid<UInt, 1>>::value,
                 "is_valid_container Type trait is wrong");
 
   static_assert(
       not decltype(gridrange)::is_valid_container<Grid<Real, 1>&>::value,
       "is_valid_container Type trait is wrong");
 
   static_assert(not Range<VectorProxy<Real, 1>, Real,
                           1>::is_valid_container<decltype(grid)>::value,
                 "is_valid_container Type trait is wrong");
 }
 
+struct AssignOne {
+  CUDA_LAMBDA void operator()(VectorProxy<UInt, 1> x) { x = 1; }
+  CUDA_LAMBDA void operator()(UInt& x) { x = 1; }
+  CUDA_LAMBDA void operator()(VectorProxy<UInt, 3> v) { v(2) = 1; }
+  CUDA_LAMBDA void operator()(VectorProxy<UInt, 2> v) { v = 1; }
+};
+
 TEST(TestRange, headless) {
   if (mpi::rank() != 0)
     GTEST_SKIP() << "Skipping because not root process";
 
   Grid<UInt, 1> grid({10}, 1), solution({10}, 1);
   std::fill(++solution.begin(), solution.end(), 1);
   auto gridrange = range<VectorProxy<UInt, 1>>(grid).headless();
-  Loop::loop([](auto x) { x = 1; }, gridrange);
+  auto assign_one = AssignOne{};  // named functor in place of the generic lambda
+  Loop::loop(assign_one, gridrange);  // expected: all entries but the first become 1
   ASSERT_TRUE(compare(grid, solution)) << "Headless fail";
 }
 
 template <typename T>
 using WrapVector = VectorProxy<T, 2>;
 
 struct AddOneVector {
   CUDA_LAMBDA void operator()(WrapVector<UInt> x) { x(0) += 1; }
 };
 
 TEST(TestStridedLoops, VectorStride) {
   Grid<UInt, 2> grid({10, 10}, 2);
   std::iota(grid.begin(), grid.end(), 1);
 
   Grid<UInt, 2> solution({10, 10}, 2);
   solution = grid;
 
   std::for_each(solution.begin(), solution.end(), [](UInt& x) {
     if (x % 2 == 1)
       x += 1;
   });
 
   auto add_one_inplace = AddOneVector();
 
   Loop::loop(add_one_inplace, range<WrapVector<UInt>>(grid));
 
   ASSERT_TRUE(compare(solution, grid)) << "Static vector strided loop failed";
 }
 
 template <typename T>
 using WrapMatrix = MatrixProxy<T, 2, 2>;
 
 struct SetOneMatrix {
   CUDA_LAMBDA void operator()(WrapMatrix<UInt> x) {
     x(0, 0) = 1;
     x(1, 1) = 1;
   }
 };
 
 TEST(TestStridedLoops, MatrixStride) {
   Grid<UInt, 2> grid({10, 10}, 4);
   Grid<UInt, 2> solution({10, 10}, 4);
   std::iota(solution.begin(), solution.end(), 0);
 
   std::for_each(solution.begin(), solution.end(), [](UInt& x) {
     if (x % 4 == 0 || x % 4 == 3)
       x = 1;
     else
       x = 0;
   });
 
   auto set_one = SetOneMatrix();
 
   Loop::loop(set_one, range<WrapMatrix<UInt>>(grid));
 
   ASSERT_TRUE(compare(solution, grid)) << "Static matrix strided loop failed";
 }
 
 struct VectorReduction {
   CUDA_LAMBDA Vector<UInt, 3> operator()(VectorProxy<UInt, 3> v) const {
     return v;
   }
 };
 
 struct BroadcastSet123 {
   CUDA_LAMBDA inline void operator()(VectorProxy<UInt, 3> v) const {
     v(0) = 1;
     v(1) = 2;
     v(2) = 3;
   }
 };
 
 TEST(TestStridedReduction, VectorReduce) {
   Grid<UInt, 2> grid({10, 10}, 3);
 
   Loop::loop(BroadcastSet123(), range<VectorProxy<UInt, 3>>(grid));
 
   auto res = Loop::reduce<operation::plus>(VectorReduction(),
                                            range<VectorProxy<UInt, 3>>(grid));
 
   auto reduce = [](UInt x) { return mpi::allreduce<operation::plus>(x); };
   ASSERT_EQ(res(0), reduce(100));
   ASSERT_EQ(res(1), reduce(200));
   ASSERT_EQ(res(2), reduce(300));
 }
 
+struct ScalarReduce {  // identity functor: each entry contributes its own value
+  CUDA_LAMBDA UInt operator()(UInt& x) { return x; }
+};
+
 TEST(TestViewReduction, ScalarReduce) {
   Grid<UInt, 2> grid({10, 10}, 3);
   Loop::loop(BroadcastSet123(), range<VectorProxy<UInt, 3>>(grid));
 
   auto view = make_component_view(grid, 2);
 
-  UInt res = Loop::reduce<operation::plus>([](UInt& x) { return x; }, view);
+  auto scalar_reduce = ScalarReduce{};  // named functor in place of the lambda
+  UInt res = Loop::reduce<operation::plus>(scalar_reduce, view);  // sum over the view
   EXPECT_EQ(res, mpi::allreduce<operation::plus>(UInt{300}))
       << "Reduce on component view fail";
 }
 
 TEST(TestViewReduction, VectorReduce) {
   Grid<UInt, 2> grid({10, 10}, 3);
   auto view2 = make_view(grid, 0);
 
   Loop::loop(BroadcastSet123(), range<VectorProxy<UInt, 3>>(view2));
 
   auto res2 = Loop::reduce<operation::plus>(VectorReduction(),
                                             range<VectorProxy<UInt, 3>>(view2));
   auto reduce = [](UInt x) { return mpi::allreduce<operation::plus>(x); };
   EXPECT_EQ(res2(0), reduce(10));
   EXPECT_EQ(res2(1), reduce(20));
   EXPECT_EQ(res2(2), reduce(30));
 }
 
 TEST(TestViewLoop, ScalarLoop) {
   Grid<UInt, 2> grid({10, 10}, 3), solution({10, 10}, 3);
   auto view = make_component_view(grid, 2);
 
-  Loop::loop([](auto& x) { x = 1; }, view);
-
-  Loop::loop([](auto v) { v(2) = 1; }, range<VectorProxy<UInt, 3>>(solution));
+  auto assign_one = AssignOne{};  // one functor drives both loops
+  Loop::loop(assign_one, view);  // stands in for the `x = 1` lambda
+  Loop::loop(assign_one, range<VectorProxy<UInt, 3>>(solution));  // `v(2) = 1`
 
   ASSERT_TRUE(compare(grid, solution)) << "View loop fail";
 }
 
 TEST(TestLoopChecks, Components) {
   Grid<UInt, 2> grid({10, 10}, 3);
 
-  EXPECT_THROW(
-      Loop::loop([](auto v) { v = 0; }, range<VectorProxy<UInt, 2>>(grid)),
-      Exception)
+  auto assign_one = AssignOne();  // named functor in place of the generic lambda
+  EXPECT_THROW(Loop::loop(assign_one, range<VectorProxy<UInt, 2>>(grid)),
+               Exception)  // 2-component range over a grid built with 3
       << "Broken check on number of components";
 }
 
+struct CopyValues {  // copies the second argument into the first
+  CUDA_LAMBDA void operator()(UInt& x, UInt& y) { x = y; }
+  CUDA_LAMBDA void operator()(VectorProxy<UInt, 2> x, VectorProxy<UInt, 1> y) {
+    x(0) = y(0);  // only the first component is copied
+  }
+};
+
 TEST(TestLoopChecks, LoopSize) {
   Grid<UInt, 1> grid({10}, 2), other({10}, 1);
 
-  EXPECT_THROW(Loop::loop([](auto& x, auto& y) { x = y; }, grid, other),
-               Exception)
+  CopyValues func;  // one functor replaces the three generic lambdas
+
+  EXPECT_THROW(Loop::loop(func, grid, other), Exception)  // was `x = y` lambda
       << "Check on loop size without ranges fail";
   other.resize({11});
-  EXPECT_THROW(Loop::loop([](auto x, auto y) { x(0) = y(0); },
-                          range<VectorProxy<UInt, 2>>(grid),
+  EXPECT_THROW(Loop::loop(func, range<VectorProxy<UInt, 2>>(grid),  // proxy overload
                           range<VectorProxy<UInt, 1>>(other)),
                Exception)
       << "Check on loop size with ranges fail";
 
   Grid<UInt, 2> twod({10, 11}, 2);
   auto view = make_view(twod, 0);
-  EXPECT_THROW(Loop::loop([](auto& x, auto& y) { x = y; }, grid, view),
-               Exception)
+  EXPECT_THROW(Loop::loop(func, grid, view), Exception)  // was `x = y` lambda
       << "Check on loop size with view fail";
 }
 
+struct ReduceAndTransform {  // increments an entry, then returns its component sum
+  CUDA_LAMBDA UInt operator()(VectorProxy<UInt, 2> x, UInt) {  // 2nd argument unused
+    x += 1;  // written back to the grid; the test's "Assign failed" check relies on it
+    return x(0) + x(1);  // this entry's contribution to the reduction
+  }
+};
+
 TEST(TestReductions, ReduceAndTransformVector) {
   UInt n = 20;
   Grid<UInt, 1> grid({n}, 2), solution({n}, 2);
   std::iota(solution.begin(), solution.end(), 1);
   std::iota(grid.begin(), grid.end(), 0);
   UInt sum_value = mpi::allreduce<operation::plus>((2 * n + 1) * 2 * n / 2);
 
+  auto reduce_transform = ReduceAndTransform();  // replaces the inline generic lambda
   UInt res = Loop::reduce<operation::plus>(
-      [](auto x, UInt) {
-        x += 1;
-        return x(0) + x(1);
-      },
-      range<VectorProxy<UInt, 2>>(grid), Loop::range(n));
+      reduce_transform, range<VectorProxy<UInt, 2>>(grid), Loop::range(n));
 
   EXPECT_EQ(res, sum_value) << "Reduction failed";
   EXPECT_TRUE(compare(grid, solution)) << "Assign failed";
 }