diff --git a/develop/ElementQuad4.cpp b/develop/ElementQuad4.cpp
index a5bd0bf..1d997e8 100644
--- a/develop/ElementQuad4.cpp
+++ b/develop/ElementQuad4.cpp
@@ -1,180 +1,171 @@
 
-#include <catch/catch.hpp>
-
-#define CPPMAT_NOCONVERT
-#include <cppmat/cppmat.h>
-
-#include <Eigen/Eigen>
-
-#include <GooseFEM/GooseFEM.h>
-
-#define EQ(a,b) REQUIRE_THAT( (a), Catch::WithinAbs((b), 1.e-12) );
+#include "support.h"
 
 using T2  = cppmat::tiny::cartesian::tensor2 <double,2>;
 using T2s = cppmat::tiny::cartesian::tensor2s<double,2>;
 
 // =================================================================================================
 
 TEST_CASE("GooseFEM::ElementQuad4", "ElementQuad4.h")
 {
 
 // =================================================================================================
 
 SECTION( "int_N_scalar_NT_dV" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(3,3);
 
   // vector-definition, and a diagonal matrix
   GooseFEM::Vector         vec(mesh.conn(), mesh.dofsPeriodic());
   GooseFEM::MatrixDiagonal mat(mesh.conn(), mesh.dofsPeriodic());
 
   // element definition, with nodal quadrature
   GooseFEM::Element::Quad4::Quadrature quad(
     vec.asElement(mesh.coor()),
     GooseFEM::Element::Quad4::Nodal::xi(),
     GooseFEM::Element::Quad4::Nodal::w()
   );
 
   // scalar per quadrature point (e.g. mass-density "rho")
   GooseFEM::ArrD rho({mesh.nelem(), quad.nip()});
   rho.setConstant(1.);
 
   // evaluate integral and assemble diagonal matrix (e.g. mass matrix)
   mat.assemble(quad.int_N_scalar_NT_dV(rho));
 
   // check matrix
   // - get the matrix
   GooseFEM::ColD M = mat.asDiagonal();
   // - check the size
   REQUIRE( M.size() == vec.ndof() );
   // - check each component
   for ( auto i = 0 ; i < M.size() ; ++i )
     EQ( M(i), 1 );
 }
 
 // =================================================================================================
 
 SECTION( "symGradN_vector" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::FineLayer mesh(27,27);
 
   // vector-definition
   GooseFEM::Vector vec(mesh.conn(), mesh.dofs());
 
   // element definition, with Gauss quadrature
   GooseFEM::Element::Quad4::Quadrature quad( vec.asElement(mesh.coor()) );
 
   // macroscopic deformation gradient
   // - allocate
   T2 F;
   // - zero-initialize
   F.setZero();
   // - set non-zero components
   F(0,1) = 0.1;
 
   // convert the macroscopic strain tensor
   T2 EPS = .5 * ( F + F.T() );
 
   // nodal coordinates
   GooseFEM::MatD coor = mesh.coor();;
 
   // nodal displacement
   // - allocate
   GooseFEM::MatD disp(mesh.nnode(), mesh.ndim());
   // - zero-initialize
   disp.setZero();
 
   // apply macroscopic deformation gradient
   for ( size_t n = 0 ; n < mesh.nnode() ; ++n )
     for ( size_t i = 0 ; i < F.ndim() ; ++i )
       for ( size_t j = 0 ; j < F.ndim() ; ++j )
         disp(n,i) += F(i,j) * coor(n,j);
 
   // compute quadrature point tensors
   GooseFEM::ArrD eps = quad.symGradN_vector(vec.asElement(disp));
 
   // compute volume averaged tensor
   GooseFEM::ArrD epsbar = eps.average(quad.dV(eps.shape(-1)), {0,1});
 
   // check
   // - temporary tensor, to view the tensors
   cppmat::view::cartesian::tensor2s<double,2> Eps;
   // - check sizes
   REQUIRE( eps.shape(0) == mesh.nelem() );
   REQUIRE( eps.shape(1) == quad.nip()   );
   REQUIRE( eps.shape(2) == Eps.size()   );
   // - check all components
   for ( size_t e = 0 ; e < mesh.nelem() ; ++e ) {
     for ( size_t k = 0 ; k < quad.nip() ; ++k ) {
       Eps.setMap(&eps(e,k));
       for ( size_t i = 0 ; i < Eps.ndim() ; ++i )
         for ( size_t j = 0 ; j < Eps.ndim() ; ++j )
           EQ( Eps(i,j), EPS(i,j) );
     }
   }
 
   // check macroscopic tensor
   // - convert to tensor object
   T2s Epsbar = T2s::Copy(epsbar.begin(), epsbar.end());
   // - check all components
   for ( size_t i = 0 ; i < Epsbar.ndim() ; ++i )
     for ( size_t j = 0 ; j < Epsbar.ndim() ; ++j )
       EQ( Epsbar(i,j), EPS(i,j) );
 }
 
 // =================================================================================================
 
 SECTION( "symGradN_vector, int_gradN_dot_tensor2s_dV" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::FineLayer mesh(27,27);
 
   // vector-definition
   GooseFEM::Vector vec(mesh.conn(), mesh.dofsPeriodic());
 
   // element definition, with Gauss quadrature
   GooseFEM::Element::Quad4::Quadrature quad( vec.asElement(mesh.coor()) );
 
   // macroscopic deformation gradient
   // - allocate
   T2 F;
   // - zero-initialize
   F.setZero();
   // - set non-zero components
   F(0,1) = 0.1;
 
   // nodal coordinates
   GooseFEM::MatD coor = mesh.coor();;
 
   // nodal displacement
   // - allocate
   GooseFEM::MatD disp(mesh.nnode(), mesh.ndim());
   // - zero-initialize
   disp.setZero();
 
   // apply macroscopic deformation gradient
   for ( size_t n = 0 ; n < mesh.nnode() ; ++n )
     for ( size_t i = 0 ; i < F.ndim() ; ++i )
       for ( size_t j = 0 ; j < F.ndim() ; ++j )
         disp(n,i) += F(i,j) * coor(n,j);
 
   // compute quadrature point tensors
   GooseFEM::ArrD eps = quad.symGradN_vector(vec.asElement(disp));
 
   // nodal force vector (should be zero, as it is only sensitive to periodic fluctuations)
   GooseFEM::ColD Fi = vec.assembleDofs(quad.int_gradN_dot_tensor2s_dV(eps));
 
   // check
   // - size
   REQUIRE( Fi.size() == vec.ndof() );
   // - check all components
   for ( size_t i = 0 ; i < vec.ndof() ; ++i )
     EQ( Fi(i), 0 );
 
 }
 
 // =================================================================================================
 
 }
diff --git a/develop/Iterate.cpp b/develop/Iterate.cpp
index 9ebbba9..06e2229 100644
--- a/develop/Iterate.cpp
+++ b/develop/Iterate.cpp
@@ -1,38 +1,29 @@
 
-#include <catch/catch.hpp>
-
-#define CPPMAT_NOCONVERT
-#include <cppmat/cppmat.h>
-
-#include <Eigen/Eigen>
-
-#include <GooseFEM/GooseFEM.h>
-
-#define EQ(a,b) REQUIRE_THAT( (a), Catch::WithinAbs((b), 1.e-12) );
+#include "support.h"
 
 // =================================================================================================
 
 TEST_CASE("GooseFEM::Iterate", "Iterate.h")
 {
 
 // =================================================================================================
 
 SECTION( "StopList" )
 {
   GooseFEM::Iterate::StopList stop(5);
 
   REQUIRE( stop.stop(5.e+0, 1.e-3) == false );
   REQUIRE( stop.stop(5.e+1, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-1, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-2, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-3, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-4, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-4, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-4, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-4, 1.e-3) == false );
   REQUIRE( stop.stop(5.e-4, 1.e-3) == true  );
 }
 
 // =================================================================================================
 
 }
diff --git a/develop/MatrixDiagonal.cpp b/develop/MatrixDiagonal.cpp
index 355d23e..d877738 100644
--- a/develop/MatrixDiagonal.cpp
+++ b/develop/MatrixDiagonal.cpp
@@ -1,107 +1,98 @@
 
-#include <catch/catch.hpp>
-
-#define CPPMAT_NOCONVERT
-#include <cppmat/cppmat.h>
-
-#include <Eigen/Eigen>
-
-#include <GooseFEM/GooseFEM.h>
-
-#define EQ(a,b) REQUIRE_THAT( (a), Catch::WithinAbs((b), 1.e-12) );
+#include "support.h"
 
 // =================================================================================================
 
 TEST_CASE("GooseFEM::MatrixDiagonal", "MatrixDiagonal.h")
 {
 
 // =================================================================================================
 
 SECTION( "dot" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(2,2);
 
   // random matrix and column
   GooseFEM::MatD a = GooseFEM::MatD::Random(mesh.nnode()*mesh.ndim(),mesh.nnode()*mesh.ndim());
   GooseFEM::ColD b = GooseFEM::ColD::Random(mesh.nnode()*mesh.ndim());
   GooseFEM::ColD c;
 
   // set diagonal
   for ( auto i = 0 ; i < a.rows() ; ++i ) {
     for ( auto j = 0 ; j < a.cols() ; ++j ) {
       if ( i != j ) a(i,j)  = 0.0;
       else          a(i,j) += 1.0;
     }
   }
 
   // compute product using Eigen
   c = a * b;
 
   // convert to GooseFEM
   // - allocate
   GooseFEM::MatrixDiagonal A(mesh.conn(), mesh.dofs());
   GooseFEM::ColD C;
   // - set
   for ( auto i = 0 ; i < a.rows() ; ++i )
     A(i,i) = a(i,i);
 
   // compute product
   C = A * b;
 
   // check
   // - size
   REQUIRE( C.size() == c.size() );
   // - components
   for ( auto i = 0 ; i < c.rows() ; ++i )
     EQ( C(i), c(i) );
 }
 
 // -------------------------------------------------------------------------------------------------
 
 SECTION( "solve" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(2,2);
 
   // random matrix and column
   GooseFEM::MatD a = GooseFEM::MatD::Random(mesh.nnode()*mesh.ndim(),mesh.nnode()*mesh.ndim());
   GooseFEM::ColD b = GooseFEM::ColD::Random(mesh.nnode()*mesh.ndim());
   GooseFEM::ColD c;
 
   // set diagonal
   for ( auto i = 0 ; i < a.rows() ; ++i ) {
     for ( auto j = 0 ; j < a.cols() ; ++j ) {
       if ( i != j ) a(i,j)  = 0.0;
       else          a(i,j) += 1.0;
     }
   }
 
   // compute product using Eigen
   c = a * b;
 
   // convert to GooseFEM
   // - allocate
   GooseFEM::MatrixDiagonal A(mesh.conn(), mesh.dofs());
   GooseFEM::ColD B, C;
   // - set
   for ( auto i = 0 ; i < a.rows() ; ++i )
     A(i,i) = a(i,i);
 
   // compute product
   C = A * b;
 
   // solve
   B = A.solve(C);
 
   // check
   // - size
   REQUIRE( B.size() == b.size() );
   // - components
   for ( auto i = 0 ; i < b.rows() ; ++i )
     EQ( B(i), b(i) );
 }
 
 // =================================================================================================
 
 }
diff --git a/develop/Vector.cpp b/develop/Vector.cpp
index 582ac8b..7079dc7 100644
--- a/develop/Vector.cpp
+++ b/develop/Vector.cpp
@@ -1,187 +1,178 @@
 
-#include <catch/catch.hpp>
-
-#define CPPMAT_NOCONVERT
-#include <cppmat/cppmat.h>
-
-#include <Eigen/Eigen>
-
-#include <GooseFEM/GooseFEM.h>
-
-#define EQ(a,b) REQUIRE_THAT( (a), Catch::WithinAbs((b), 1.e-12) );
+#include "support.h"
 
 // =================================================================================================
 
 TEST_CASE("GooseFEM::Vector", "Vector.h")
 {
 
 // =================================================================================================
 
 SECTION( "asDofs - nodevec" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(2,2);
 
   // vector-definition
   GooseFEM::Vector vector(mesh.conn(), mesh.dofsPeriodic());
 
   // velocity field
   // - allocate
   GooseFEM::MatD v(mesh.nnode(),2);
   // - set periodic
   v(0,0) = 1.0;  v(0,1) = 0.0;
   v(1,0) = 1.0;  v(1,1) = 0.0;
   v(2,0) = 1.0;  v(2,1) = 0.0;
   v(3,0) = 1.5;  v(3,1) = 0.0;
   v(4,0) = 1.5;  v(4,1) = 0.0;
   v(5,0) = 1.5;  v(5,1) = 0.0;
   v(6,0) = 1.0;  v(6,1) = 0.0;
   v(7,0) = 1.0;  v(7,1) = 0.0;
   v(8,0) = 1.0;  v(8,1) = 0.0;
 
   // convert to DOFs
   GooseFEM::ColD V = vector.asDofs(v);
 
   // check
   // - size
   REQUIRE( V.size() == mesh.nnodePeriodic() * mesh.ndim() );
   // - individual entries
   EQ( V(0), v(0,0) );
   EQ( V(1), v(0,1) );
   EQ( V(2), v(1,0) );
   EQ( V(3), v(1,1) );
   EQ( V(4), v(3,0) );
   EQ( V(5), v(3,1) );
   EQ( V(6), v(4,0) );
   EQ( V(7), v(4,1) );
 }
 
 // -------------------------------------------------------------------------------------------------
 
 SECTION( "asDofs - elemvec" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(2,2);
 
   // vector-definition
   GooseFEM::Vector vector(mesh.conn(), mesh.dofsPeriodic());
 
   // velocity field
   // - allocate
   GooseFEM::MatD v(mesh.nnode(),2);
   // - set periodic
   v(0,0) = 1.0;  v(0,1) = 0.0;
   v(1,0) = 1.0;  v(1,1) = 0.0;
   v(2,0) = 1.0;  v(2,1) = 0.0;
   v(3,0) = 1.5;  v(3,1) = 0.0;
   v(4,0) = 1.5;  v(4,1) = 0.0;
   v(5,0) = 1.5;  v(5,1) = 0.0;
   v(6,0) = 1.0;  v(6,1) = 0.0;
   v(7,0) = 1.0;  v(7,1) = 0.0;
   v(8,0) = 1.0;  v(8,1) = 0.0;
 
 
   // convert to DOFs - element - DOFs
   GooseFEM::ColD V = vector.asDofs(vector.asElement(vector.asDofs(v)));
 
   // check
   // - size
   REQUIRE( V.size() == mesh.nnodePeriodic() * mesh.ndim() );
   // - individual entries
   EQ( V(0), v(0,0) );
   EQ( V(1), v(0,1) );
   EQ( V(2), v(1,0) );
   EQ( V(3), v(1,1) );
   EQ( V(4), v(3,0) );
   EQ( V(5), v(3,1) );
   EQ( V(6), v(4,0) );
   EQ( V(7), v(4,1) );
 }
 
 // -------------------------------------------------------------------------------------------------
 
 SECTION( "asDofs - assembleDofs" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(2,2);
 
   // vector-definition
   GooseFEM::Vector vector(mesh.conn(), mesh.dofsPeriodic());
 
   // force field
   // - allocate
   GooseFEM::MatD f(mesh.nnode(),2);
   // - set periodic
   f(0,0) = -1.0;  f(0,1) = -1.0;
   f(1,0) =  0.0;  f(1,1) = -1.0;
   f(2,0) =  1.0;  f(2,1) = -1.0;
   f(3,0) = -1.0;  f(3,1) =  0.0;
   f(4,0) =  0.0;  f(4,1) =  0.0;
   f(5,0) =  1.0;  f(5,1) =  0.0;
   f(6,0) = -1.0;  f(6,1) =  1.0;
   f(7,0) =  0.0;  f(7,1) =  1.0;
   f(8,0) =  1.0;  f(8,1) =  1.0;
 
   // assemble as DOFs
   GooseFEM::ColD F = vector.assembleDofs(f);
 
   // check
   // - size
   REQUIRE( F.size() == mesh.nnodePeriodic() * mesh.ndim() );
   // - 'analytical' result
   EQ( F(0), 0 );
   EQ( F(1), 0 );
   EQ( F(2), 0 );
   EQ( F(3), 0 );
   EQ( F(4), 0 );
   EQ( F(5), 0 );
   EQ( F(6), 0 );
   EQ( F(7), 0 );
 }
 
 // -------------------------------------------------------------------------------------------------
 
 SECTION( "asDofs - assembleNode" )
 {
   // mesh
   GooseFEM::Mesh::Quad4::Regular mesh(2,2);
 
   // vector-definition
   GooseFEM::Vector vector(mesh.conn(), mesh.dofsPeriodic());
 
   // force field
   // - allocate
   GooseFEM::MatD f(mesh.nnode(),2);
   // - set periodic
   f(0,0) = -1.0;  f(0,1) = -1.0;
   f(1,0) =  0.0;  f(1,1) = -1.0;
   f(2,0) =  1.0;  f(2,1) = -1.0;
   f(3,0) = -1.0;  f(3,1) =  0.0;
   f(4,0) =  0.0;  f(4,1) =  0.0;
   f(5,0) =  1.0;  f(5,1) =  0.0;
   f(6,0) = -1.0;  f(6,1) =  1.0;
   f(7,0) =  0.0;  f(7,1) =  1.0;
   f(8,0) =  1.0;  f(8,1) =  1.0;
 
   // convert to element, assemble as DOFs
   GooseFEM::ColD F = vector.assembleDofs( vector.asElement(f) );
 
   // check
   // - size
   REQUIRE( F.size() == mesh.nnodePeriodic() * mesh.ndim() );
   // - 'analytical' result
   EQ( F(0), 0 );
   EQ( F(1), 0 );
   EQ( F(2), 0 );
   EQ( F(3), 0 );
   EQ( F(4), 0 );
   EQ( F(5), 0 );
   EQ( F(6), 0 );
   EQ( F(7), 0 );
 }
 
 // -------------------------------------------------------------------------------------------------
 
 // =================================================================================================
 
 }
diff --git a/develop/support.h b/develop/support.h
new file mode 100644
index 0000000..4a28019
--- /dev/null
+++ b/develop/support.h
@@ -0,0 +1,13 @@
+
+#ifndef SUPPORT_H
+#define SUPPORT_H
+
+#include <catch/catch.hpp>
+#include <cppmat/cppmat.h>
+#include <Eigen/Eigen>
+
+#include "../src/GooseFEM/GooseFEM.h"
+
+#define EQ(a,b) REQUIRE_THAT( (a), Catch::WithinAbs((b), 1.e-12) );
+
+#endif
diff --git a/src/GooseFEM/DynamicsDiagonal.cpp b/src/GooseFEM/DynamicsDiagonal.cpp
deleted file mode 100644
index 5fcb719..0000000
--- a/src/GooseFEM/DynamicsDiagonal.cpp
+++ /dev/null
@@ -1,1134 +0,0 @@
-/* =================================================================================================
-
-(c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
-
-================================================================================================= */
-
-#ifndef GOOSEFEM_DYNAMICS_DIAGONAL_CPP
-#define GOOSEFEM_DYNAMICS_DIAGONAL_CPP
-
-// -------------------------------------------------------------------------------------------------
-
-#include "DynamicsDiagonal.h"
-
-// ================================= GooseFEM::Dynamics::Diagonal ==================================
-
-namespace GooseFEM {
-namespace Dynamics {
-namespace Diagonal {
-
-// ===================================== Simulation - Periodic =====================================
-
-template<class Element>
-inline Periodic<Element>::Periodic(
-  std::unique_ptr<Element> _elem, const MatD &_x, const MatS &_conn, const MatS &_dofs, double _dt
-)
-: elem(std::move(_elem)), conn(_conn), dofs(_dofs), x(_x), dt(_dt)
-{
-  // compute sizes
-  nnode = static_cast<size_t>(x.rows());
-  ndim  = static_cast<size_t>(x.cols());
-  nelem = static_cast<size_t>(conn.rows());
-  nne   = static_cast<size_t>(conn.cols());
-  ndof  = dofs.maxCoeff()+1;
-
-  // basic checks (mostly the user is 'trusted')
-  assert( static_cast<size_t>(dofs.size()) == nnode * ndim );
-  assert(                     ndof         <  nnode * ndim );
-
-  // allocate and zero-initialize nodal quantities
-  u.conservativeResize(nnode,ndim); u.setZero();
-  v.conservativeResize(nnode,ndim); v.setZero();
-  a.conservativeResize(nnode,ndim); a.setZero();
-
-  // allocate and zero-initialize linear system (DOFs)
-  Minv.conservativeResize(ndof);
-  M   .conservativeResize(ndof);
-  D   .conservativeResize(ndof);
-  F   .conservativeResize(ndof);
-  V   .conservativeResize(ndof);
-  V_n .conservativeResize(ndof); V_n .setZero();
-  A   .conservativeResize(ndof);
-  A_n .conservativeResize(ndof); A_n .setZero();
-
-  // initialize all fields
-  updated_x();
-  updated_u(true);
-  updated_v(true);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::velocityVerlet()
-{
-  // (1) new positions (displacements)
-  // - apply update (nodal) : x_{n+1} = x_n + dt * v_n + .5 * dt^2 * a_n"
-  u += dt * v + ( .5 * std::pow(dt,2.) ) * a;
-  // - process update in displacements
-  updated_u();
-
-  // (2a) estimate new velocities
-  // - update velocities (DOFs)
-  V.noalias() = V_n + dt * A;
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - process update in velocities
-  updated_v();
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F - D.cwiseProduct(V) );
-  // - update velocities (DOFs)
-  V.noalias() = V_n + ( .5 * dt ) * ( A_n + A );
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - process update in velocities
-  updated_v();
-
-  // (2b) new velocities
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F - D.cwiseProduct(V) );
-  // - update velocities (DOFs)
-  V.noalias() = V_n + ( .5 * dt ) * ( A_n + A );
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - process update in velocities
-  updated_v();
-
-  // (3) new accelerations
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F - D.cwiseProduct(V) );
-  // - convert to nodal acceleration (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) a(i) = A(dofs(i));
-
-  // store history
-  A_n = A;  // accelerations (DOFs)
-  V_n = V;  // velocities    (DOFs)
-  t  += dt; // current time
-
-  // N.B. at this point:
-  // "a" == "A" == "A_n"  ->  new nodal accelerations, their DOF equivalents, and a 'back-up'
-  // "v" ==        "V_n"  ->  new nodal velocities,                           and a 'back-up'
-  // "u"                  ->  new nodal displacements
-  // The forces "F" correspond to this state of the system
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::Verlet()
-{
-  // (1) new nodal positions (displacements)
-  // - apply update (nodal) : x_{n+1} = x_n + dt * v_n + .5 * dt^2 * a_n"
-  u += dt * v + ( .5 * std::pow(dt,2.) ) * a;
-  // - process update in displacements
-  updated_u();
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F );
-  // - convert to nodal acceleration (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) a(i) = A(dofs(i));
-
-  // (2) propagate velocities
-  // - update velocities (DOFs)
-  V.noalias() = V_n + ( .5 * dt ) * ( A_n + A );
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-
-  // store history
-  A_n = A;  // accelerations (DOFs)
-  V_n = V;  // velocities    (DOFs)
-  t  += dt; // current time
-
-  // N.B. at this point:
-  // "a" == "A" == "A_n"  ->  new nodal accelerations, their DOF equivalents, and a 'back-up'
-  // "v" ==        "V_n"  ->  new nodal velocities,                           and a 'back-up'
-  // "u"                  ->  new nodal displacements
-  // The forces "F" correspond to this state of the system
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::ground()
-{
-  // set accelerations and velocities zero (DOFs)
-  A.setZero();
-  V.setZero();
-
-  // convert to nodal velocity/acceleration (several nodes may depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) a(i) = A(dofs(i));
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-
-  // store history
-  A_n = A;  // accelerations (DOFs)
-  V_n = V;  // velocities    (DOFs)
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::updated_x()
-{
-  // set the nodal positions of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->x(e,m,i) = x(conn(e,m),i);
-
-  // signal update
-  elem->updated_x();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::updated_u(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->u(e,m,i) = u(conn(e,m),i);
-
-  // signal update
-  elem->updated_u();
-
-  // update
-  if ( elem->changed_M or init ) assemble_M();
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::updated_v(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->v(e,m,i) = v(conn(e,m),i);
-
-  // signal update
-  elem->updated_v();
-
-  // update
-  if ( elem->changed_M or init ) assemble_M();
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::assemble_M()
-{
-  // zero-initialize
-  M.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - mass matrix, per thread
-    ColD M_(ndof);
-    M_.setZero();
-
-    // - assemble diagonal mass matrix, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          M_(dofs(conn(e,m),i)) += elem->M(e,m*ndim+i,m*ndim+i);
-
-    // - reduce "M_" per thread to total "M"
-    #pragma omp critical
-      M += M_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-
-  // compute inverse of the mass matrix
-  Minv = M.cwiseInverse();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::assemble_D()
-{
-  // zero-initialize
-  D.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - damping matrix, per thread
-    ColD D_(ndof);
-    D_.setZero();
-
-    // - assemble diagonal damping matrix, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          D_(dofs(conn(e,m),i)) += elem->D(e,m*ndim+i,m*ndim+i);
-
-    // - reduce "D_" per thread to total "D"
-    #pragma omp critical
-      D += D_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::assemble_F()
-{
-  // zero-initialize
-  F.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - force, per thread
-    ColD F_(ndof);
-    F_.setZero();
-
-    // - assemble force, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          F_(dofs(conn(e,m),i)) += elem->f(e,m,i);
-
-    // - reduce "F_" per thread to total "F"
-    #pragma omp critical
-      F += F_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-}
-
-// =================================== Simulation - SemiPeriodic ===================================
-
-template<class Element>
-inline SemiPeriodic<Element>::SemiPeriodic(
-  std::unique_ptr<Element> _elem, const MatD &_x, const MatS &_conn, const MatS &_dofs,
-  const ColS &_fixedDofs, double _dt
-)
-: elem(std::move(_elem)), conn(_conn), dofs(_dofs), x(_x), fixedDofs(_fixedDofs), dt(_dt)
-{
-  // compute sizes
-  nfixed    = static_cast<size_t>(fixedDofs.size());
-  nnode     = static_cast<size_t>(x.rows());
-  ndim      = static_cast<size_t>(x.cols());
-  nelem     = static_cast<size_t>(conn.rows());
-  nne       = static_cast<size_t>(conn.cols());
-  ndof      = dofs.maxCoeff()+1;
-
-  // basic checks (mostly the user is 'trusted')
-  assert( static_cast<size_t>(dofs.size()) == nnode * ndim );
-  assert(                     ndof         <  nnode * ndim );
-
-  #ifndef NDEBUG
-  for ( size_t i = 0 ; i < nfixed ; ++i ) assert( fixedDofs(i) < ndof );
-  #endif
-
-  // allocate and zero-initialize nodal quantities
-  u.conservativeResize(nnode,ndim); u.setZero();
-  v.conservativeResize(nnode,ndim); v.setZero();
-  a.conservativeResize(nnode,ndim); a.setZero();
-
-  // allocate and zero-initialize linear system (DOFs)
-  Minv.conservativeResize(ndof);
-  M   .conservativeResize(ndof);
-  D   .conservativeResize(ndof);
-  F   .conservativeResize(ndof);
-  V   .conservativeResize(ndof);
-  V_n .conservativeResize(ndof); V_n .setZero();
-  A   .conservativeResize(ndof);
-  A_n .conservativeResize(ndof); A_n .setZero();
-
-  // fixed DOFs : default zero velocity and acceleration
-  fixedV.conservativeResize(nfixed); fixedV.setZero();
-  fixedA.conservativeResize(nfixed); fixedA.setZero();
-
-  // initialize all fields
-  updated_x();
-  updated_u(true);
-  updated_v(true);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::velocityVerlet()
-{
-  // (1) new positions (displacements)
-  // - apply update (nodal) : x_{n+1} = x_n + dt * v_n + .5 * dt^2 * a_n"
-  u += dt * v + ( .5 * std::pow(dt,2.) ) * a;
-  // - process update in displacements
-  updated_u();
-
-  // (2a) estimate new velocities
-  // - update velocities (DOFs)
-  V.noalias() = V_n + dt * A;
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - process update in velocities
-  updated_v();
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F - D.cwiseProduct(V) );
-  // - update velocities (DOFs)
-  V.noalias() = V_n + ( .5 * dt ) * ( A_n + A );
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - process update in velocities
-  updated_v();
-
-  // (2b) new velocities
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F - D.cwiseProduct(V) );
-  // - update velocities (DOFs)
-  V.noalias() = V_n + ( .5 * dt ) * ( A_n + A );
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - process update in velocities
-  updated_v();
-
-  // (3) new accelerations
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F - D.cwiseProduct(V) );
-  // - apply the fixed accelerations
-  for ( size_t i=0; i<nfixed; ++i ) A(fixedDofs(i)) = fixedA(i);
-  // - convert to nodal acceleration (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) a(i) = A(dofs(i));
-
-  // store history
-  A_n = A;  // accelerations (DOFs)
-  V_n = V;  // velocities    (DOFs)
-  t  += dt; // current time
-
-  // N.B. at this point:
-  // "a" == "A" == "A_n"  ->  new nodal accelerations, their DOF equivalents, and a 'back-up'
-  // "v" ==        "V_n"  ->  new nodal velocities,                           and a 'back-up'
-  // "u"                  ->  new nodal displacements
-  // The forces "F" correspond to this state of the system
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::Verlet()
-{
-  // (1) new nodal positions (displacements)
-  // - apply update (nodal) : x_{n+1} = x_n + dt * v_n + .5 * dt^2 * a_n"
-  u += dt * v + ( .5 * std::pow(dt,2.) ) * a;
-  // - process update in displacements
-  updated_u();
-  // - solve for accelerations (DOFs)
-  A.noalias() = Minv.cwiseProduct( - F );
-  // - apply the fixed accelerations
-  for ( size_t i=0; i<nfixed; ++i ) A(fixedDofs(i)) = fixedA(i);
-  // - convert to nodal acceleration (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) a(i) = A(dofs(i));
-
-  // (2) propagate velocities
-  // - update velocities (DOFs)
-  V.noalias() = V_n + ( .5 * dt ) * ( A_n + A );
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-
-  // store history
-  A_n = A;  // accelerations (DOFs)
-  V_n = V;  // velocities    (DOFs)
-  t  += dt; // current time
-
-  // N.B. at this point:
-  // "a" == "A" == "A_n"  ->  new nodal accelerations, their DOF equivalents, and a 'back-up'
-  // "v" ==        "V_n"  ->  new nodal velocities,                           and a 'back-up'
-  // "u"                  ->  new nodal displacements
-  // The forces "F" correspond to this state of the system
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::ground()
-{
-  // set accelerations and velocities zero (DOFs)
-  A.setZero();
-  V.setZero();
-
-  // convert to nodal velocity/acceleration (several nodes may depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) a(i) = A(dofs(i));
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-
-  // store history
-  A_n = A;  // accelerations (DOFs)
-  V_n = V;  // velocities    (DOFs)
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::updated_x()
-{
-  // set the nodal positions of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->x(e,m,i) = x(conn(e,m),i);
-
-  // signal update
-  elem->updated_x();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::updated_u(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->u(e,m,i) = u(conn(e,m),i);
-
-  // signal update
-  elem->updated_u();
-
-  // update
-  if ( elem->changed_M or init ) assemble_M();
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::updated_v(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->v(e,m,i) = v(conn(e,m),i);
-
-  // signal update
-  elem->updated_v();
-
-  // update
-  if ( elem->changed_M or init ) assemble_M();
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::assemble_M()
-{
-  // zero-initialize
-  M.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - mass matrix, per thread
-    ColD M_(ndof);
-    M_.setZero();
-
-    // - assemble diagonal mass matrix, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          M_(dofs(conn(e,m),i)) += elem->M(e,m*ndim+i,m*ndim+i);
-
-    // - reduce "M_" per thread to total "M"
-    #pragma omp critical
-      M += M_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-
-  // compute inverse of the mass matrix
-  Minv = M.cwiseInverse();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::assemble_D()
-{
-  // zero-initialize
-  D.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - damping matrix, per thread
-    ColD D_(ndof);
-    D_.setZero();
-
-    // - assemble diagonal damping matrix, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          D_(dofs(conn(e,m),i)) += elem->D(e,m*ndim+i,m*ndim+i);
-
-    // - reduce "D_" per thread to total "D"
-    #pragma omp critical
-      D += D_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::assemble_F()
-{
-  // zero-initialize
-  F.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - force, per thread
-    ColD F_(ndof);
-    F_.setZero();
-
-    // - assemble force, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          F_(dofs(conn(e,m),i)) += elem->f(e,m,i);
-
-    // - reduce "F_" per thread to total "F"
-    #pragma omp critical
-      F += F_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-}
-
-// ======================================== Element - Quad4 ========================================
-
-namespace SmallStrain {
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Material>
-inline Quad4<Material>::Quad4(std::unique_ptr<Material> _mat, size_t _nelem)
-{
-  // copy from input
-  nelem = _nelem;
-  mat   = std::move(_mat);
-
-  // allocate matrices
-  // -
-  x     .resize({nelem,nne,ndim});
-  u     .resize({nelem,nne,ndim});
-  v     .resize({nelem,nne,ndim});
-  f     .resize({nelem,nne,ndim});
-  // -
-  M     .resize({nelem,nne*ndim,nne*ndim});
-  D     .resize({nelem,nne*ndim,nne*ndim});
-  // -
-  dNx   .resize({nelem,nip,nne,ndim});
-  // -
-  vol   .resize({nelem,nip});
-  vol_n .resize({nelem,nne});
-  // -
-  dNxi  .resize({nip,nne,ndim});
-  dNxi_n.resize({nne,nne,ndim});
-  // -
-  w     .resize({nip});
-  w_n   .resize({nne});
-
-  // zero initialize matrices (only diagonal is written, no new zero-initialization necessary)
-  M.setZero();
-  D.setZero();
-
-  // shape function gradient at all Gauss points, in local coordinates
-  // - k == 0
-  dNxi(0,0,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(0,0,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(0,1,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(0,1,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(0,2,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(0,2,1) = +.25*(1.-1./std::sqrt(3.));
-  dNxi(0,3,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(0,3,1) = +.25*(1.+1./std::sqrt(3.));
-  // - k == 1
-  dNxi(1,0,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(1,0,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(1,1,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(1,1,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(1,2,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(1,2,1) = +.25*(1.+1./std::sqrt(3.));
-  dNxi(1,3,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(1,3,1) = +.25*(1.-1./std::sqrt(3.));
-  // - k == 2
-  dNxi(2,0,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(2,0,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(2,1,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(2,1,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(2,2,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(2,2,1) = +.25*(1.+1./std::sqrt(3.));
-  dNxi(2,3,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(2,3,1) = +.25*(1.-1./std::sqrt(3.));
-  // - k == 3
-  dNxi(3,0,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(3,0,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(3,1,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(3,1,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(3,2,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(3,2,1) = +.25*(1.-1./std::sqrt(3.));
-  dNxi(3,3,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(3,3,1) = +.25*(1.+1./std::sqrt(3.));
-
-  // integration point weight at all Gauss points
-  w(0) = 1.;
-  w(1) = 1.;
-  w(2) = 1.;
-  w(3) = 1.;
-
-  // shape function gradient at all nodes, in local coordinates
-  // - k == 0
-  dNxi_n(0,0,0) = -0.5;     dNxi_n(0,0,1) = -0.5;
-  dNxi_n(0,1,0) = +0.5;     dNxi_n(0,1,1) =  0.0;
-  dNxi_n(0,2,0) =  0.0;     dNxi_n(0,2,1) =  0.0;
-  dNxi_n(0,3,0) =  0.0;     dNxi_n(0,3,1) = +0.5;
-  // - k == 1
-  dNxi_n(1,0,0) = -0.5;     dNxi_n(1,0,1) =  0.0;
-  dNxi_n(1,1,0) = +0.5;     dNxi_n(1,1,1) = -0.5;
-  dNxi_n(1,2,0) =  0.0;     dNxi_n(1,2,1) = +0.5;
-  dNxi_n(1,3,0) =  0.0;     dNxi_n(1,3,1) =  0.0;
-  // - k == 2
-  dNxi_n(2,0,0) =  0.0;     dNxi_n(2,0,1) =  0.0;
-  dNxi_n(2,1,0) =  0.0;     dNxi_n(2,1,1) = -0.5;
-  dNxi_n(2,2,0) = +0.5;     dNxi_n(2,2,1) = +0.5;
-  dNxi_n(2,3,0) = -0.5;     dNxi_n(2,3,1) =  0.0;
-  // - k == 3
-  dNxi_n(3,0,0) =  0.0;     dNxi_n(3,0,1) = -0.5;
-  dNxi_n(3,1,0) =  0.0;     dNxi_n(3,1,1) =  0.0;
-  dNxi_n(3,2,0) = +0.5;     dNxi_n(3,2,1) =  0.0;
-  dNxi_n(3,3,0) = -0.5;     dNxi_n(3,3,1) = +0.5;
-
-  // integration point weight at all nodes
-  w_n(0) = 1.;
-  w_n(1) = 1.;
-  w_n(2) = 1.;
-  w_n(3) = 1.;
-
-  // Note, the above is a specialization of the following:
-  //
-  // - Shape function gradients
-  //
-  //    dNxi(0,0) = -.25*(1.-xi(k,1)); dNxi(0,1) = -.25*(1.-xi(k,0));
-  //    dNxi(1,0) = +.25*(1.-xi(k,1)); dNxi(1,1) = -.25*(1.+xi(k,0));
-  //    dNxi(2,0) = +.25*(1.+xi(k,1)); dNxi(2,1) = +.25*(1.+xi(k,0));
-  //    dNxi(3,0) = -.25*(1.+xi(k,1)); dNxi(3,1) = +.25*(1.-xi(k,0));
-  //
-  // - Gauss point coordinates and weights
-  //
-  //    xi(0,0) = -1./std::sqrt(3.); xi(0,1) = -1./std::sqrt(3.); w(0) = 1.;
-  //    xi(1,0) = +1./std::sqrt(3.); xi(1,1) = -1./std::sqrt(3.); w(1) = 1.;
-  //    xi(2,0) = +1./std::sqrt(3.); xi(2,1) = +1./std::sqrt(3.); w(2) = 1.;
-  //    xi(3,0) = -1./std::sqrt(3.); xi(3,1) = +1./std::sqrt(3.); w(3) = 1.;
-  //
-  // - Nodal coordinates and weights
-  //
-  //    xi(0,0) = -1.; xi(0,1) = -1.; w(0) = 1.;
-  //    xi(1,0) = +1.; xi(1,1) = -1.; w(1) = 1.;
-  //    xi(2,0) = +1.; xi(2,1) = +1.; w(2) = 1.;
-  //    xi(3,0) = -1.; xi(3,1) = +1.; w(3) = 1.;
-
-}
-
-// =================================================================================================
-
-template<class Material>
-inline void Quad4<Material>::updated_x()
-{
-#pragma omp parallel
-{
-  // intermediate quantities
-  cppmat::cartesian2d::tensor2<double> J_, Jinv_;
-  double Jdet_;
-  // local views of the global arrays (speeds up indexing, and increases readability)
-  cppmat::tiny::matrix2<double,8,8> M_, D_;
-  cppmat::tiny::matrix2<double,4,2> dNxi_, dNx_, x_;
-  cppmat::tiny::vector <double,4>   w_, vol_;
-
-  // loop over all elements (in parallel)
-  #pragma omp for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to element positions
-    x_.map(&x(e));
-
-    // nodal quadrature
-    // ----------------
-
-    // pointer to element mass/damping matrix, nodal volume, and integration weight
-    M_  .map(&M    (e));
-    D_  .map(&D    (e));
-    vol_.map(&vol_n(e));
-    w_  .map(&w_n  (0));
-
-    // loop over nodes, i.e. the integration points
-    for ( size_t k = 0 ; k < nne ; ++k )
-    {
-      // - pointer to the shape function gradients (local coordinates)
-      dNxi_.map(&dNxi_n(k));
-
-      // - Jacobian
-      //   J(i,j) += dNxi(m,i) * xe(m,j)
-      J_(0,0) = dNxi_(0,0)*x_(0,0) + dNxi_(1,0)*x_(1,0) + dNxi_(2,0)*x_(2,0) + dNxi_(3,0)*x_(3,0);
-      J_(0,1) = dNxi_(0,0)*x_(0,1) + dNxi_(1,0)*x_(1,1) + dNxi_(2,0)*x_(2,1) + dNxi_(3,0)*x_(3,1);
-      J_(1,0) = dNxi_(0,1)*x_(0,0) + dNxi_(1,1)*x_(1,0) + dNxi_(2,1)*x_(2,0) + dNxi_(3,1)*x_(3,0);
-      J_(1,1) = dNxi_(0,1)*x_(0,1) + dNxi_(1,1)*x_(1,1) + dNxi_(2,1)*x_(2,1) + dNxi_(3,1)*x_(3,1);
-
-      // - determinant of the Jacobian
-      Jdet_ = J_.det();
-
-      // - integration point volume
-      vol_(k) = w_(k) * Jdet_;
-
-      // - assemble element mass matrix
-      //   M(m+i,n+i) += N(m) * rho * vol * N(n);
-      M_(k*2  ,k*2  ) = mat->rho(e,k) * vol_(k);
-      M_(k*2+1,k*2+1) = mat->rho(e,k) * vol_(k);
-
-      // - assemble element non-Galilean damping matrix
-      //   D(m+i,n+i) += N(m) * alpha * vol * N(n);
-      D_(k*2  ,k*2  ) = mat->alpha(e,k) * vol_(k);
-      D_(k*2+1,k*2+1) = mat->alpha(e,k) * vol_(k);
-    }
-
-    // Gaussian quadrature
-    // -------------------
-
-    // pointer to element integration volume and weight
-    vol_.map(&vol(e));
-    w_  .map(&w  (0));
-
-    // loop over Gauss points
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to the shape function gradients (local/global coordinates)
-      dNxi_.map(&dNxi(  k));
-      dNx_ .map(&dNx (e,k));
-
-      // - Jacobian
-      //   J(i,j) += dNxi(m,i) * xe(m,j)
-      J_(0,0) = dNxi_(0,0)*x_(0,0) + dNxi_(1,0)*x_(1,0) + dNxi_(2,0)*x_(2,0) + dNxi_(3,0)*x_(3,0);
-      J_(0,1) = dNxi_(0,0)*x_(0,1) + dNxi_(1,0)*x_(1,1) + dNxi_(2,0)*x_(2,1) + dNxi_(3,0)*x_(3,1);
-      J_(1,0) = dNxi_(0,1)*x_(0,0) + dNxi_(1,1)*x_(1,0) + dNxi_(2,1)*x_(2,0) + dNxi_(3,1)*x_(3,0);
-      J_(1,1) = dNxi_(0,1)*x_(0,1) + dNxi_(1,1)*x_(1,1) + dNxi_(2,1)*x_(2,1) + dNxi_(3,1)*x_(3,1);
-
-      // - determinant and inverse of the Jacobian
-      Jdet_ = J_.det();
-      Jinv_ = J_.inv();
-
-      // - integration point volume
-      vol_(k) = w_(k) * Jdet_;
-
-      // - shape function gradients (global coordinates)
-      //   dNx(m,i) += Jinv(i,j) * dNxi(m,j)
-      for ( size_t m = 0 ; m < nne ; ++m )
-      {
-        dNx_(m,0) = Jinv_(0,0) * dNxi_(m,0) + Jinv_(0,1) * dNxi_(m,1);
-        dNx_(m,1) = Jinv_(1,0) * dNxi_(m,0) + Jinv_(1,1) * dNxi_(m,1);
-      }
-    }
-  }
-
-  // set signals
-  changed_f = false;
-  changed_M = true;
-  changed_D = true;
-
-} // #pragma omp parallel
-}
-
-// =================================================================================================
-
-template<class Material>
-inline void Quad4<Material>::updated_u()
-{
-#pragma omp parallel
-{
-  // intermediate quantities
-  cppmat::cartesian2d::tensor2 <double> gradu_;
-  cppmat::cartesian2d::tensor2s<double> eps_, sig_;
-  // local views of the global arrays (speeds up indexing, and increases readability)
-  cppmat::tiny::matrix2<double,4,2> dNx_, u_, f_;
-  cppmat::tiny::vector <double,4>   vol_;
-
-  // loop over all elements (in parallel)
-  #pragma omp for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to element forces, displacements, and integration volume
-    f_  .map(&f  (e));
-    u_  .map(&u  (e));
-    vol_.map(&vol(e));
-
-    // zero initialize forces
-    f_.setZero();
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to the shape function gradients, strain and stress tensor (stored symmetric)
-      dNx_.map(&dNx     (e,k));
-      eps_.map(&mat->eps(e,k));
-      sig_.map(&mat->sig(e,k));
-
-      // - displacement gradient
-      //   gradu_(i,j) += dNx(m,i) * ue(m,j)
-      gradu_(0,0) = dNx_(0,0)*u_(0,0) + dNx_(1,0)*u_(1,0) + dNx_(2,0)*u_(2,0) + dNx_(3,0)*u_(3,0);
-      gradu_(0,1) = dNx_(0,0)*u_(0,1) + dNx_(1,0)*u_(1,1) + dNx_(2,0)*u_(2,1) + dNx_(3,0)*u_(3,1);
-      gradu_(1,0) = dNx_(0,1)*u_(0,0) + dNx_(1,1)*u_(1,0) + dNx_(2,1)*u_(2,0) + dNx_(3,1)*u_(3,0);
-      gradu_(1,1) = dNx_(0,1)*u_(0,1) + dNx_(1,1)*u_(1,1) + dNx_(2,1)*u_(2,1) + dNx_(3,1)*u_(3,1);
-
-      // - strain (stored symmetric)
-      //   eps(i,j) = .5 * ( gradu_(i,j) + gradu_(j,i) )
-      eps_(0,0) =        gradu_(0,0);
-      eps_(0,1) = .5 * ( gradu_(0,1) + gradu_(1,0) );
-      eps_(1,1) =        gradu_(1,1);
-
-      // - constitutive response
-      mat->updated_eps(e,k);
-
-      // - assemble to element force
-      //   f(m,j) += dNx(m,i) * sig(i,j) * vol;
-      for ( size_t m = 0 ; m < nne ; ++m )
-      {
-        f_(m,0) += dNx_(m,0) * sig_(0,0) * vol_(k) + dNx_(m,1) * sig_(1,0) * vol_(k);
-        f_(m,1) += dNx_(m,0) * sig_(0,1) * vol_(k) + dNx_(m,1) * sig_(1,1) * vol_(k);
-      }
-    }
-  }
-
-  // set signals
-  changed_f = true;
-  changed_M = false;
-  changed_D = false;
-}
-}
-
-// =================================================================================================
-
-template<class Material>
-inline void Quad4<Material>::updated_v()
-{
-#pragma omp parallel
-{
-  // intermediate quantities
-  cppmat::cartesian2d::tensor2 <double> gradv_;
-  cppmat::cartesian2d::tensor2s<double> epsdot_, sig_;
-  // local views of the global arrays (speeds up indexing, and increases readability)
-  cppmat::tiny::matrix2<double,4,2> dNx_, v_, f_;
-  cppmat::tiny::vector <double,4>   vol_;
-
-  // loop over all elements (in parallel)
-  #pragma omp for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to element forces, displacements, and integration volume
-    f_  .map(&f  (e));
-    v_  .map(&v  (e));
-    vol_.map(&vol(e));
-
-    // zero initialize forces
-    f_.setZero();
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to the shape function gradients, strain-rate and stress tensor (stored symmetric)
-      dNx_   .map(&dNx        (e,k));
-      epsdot_.map(&mat->epsdot(e,k));
-      sig_   .map(&mat->sig   (e,k));
-
-      // - displacement gradient
-      //   gradv_(i,j) += dNx(m,i) * ue(m,j)
-      gradv_(0,0) = dNx_(0,0)*v_(0,0) + dNx_(1,0)*v_(1,0) + dNx_(2,0)*v_(2,0) + dNx_(3,0)*v_(3,0);
-      gradv_(0,1) = dNx_(0,0)*v_(0,1) + dNx_(1,0)*v_(1,1) + dNx_(2,0)*v_(2,1) + dNx_(3,0)*v_(3,1);
-      gradv_(1,0) = dNx_(0,1)*v_(0,0) + dNx_(1,1)*v_(1,0) + dNx_(2,1)*v_(2,0) + dNx_(3,1)*v_(3,0);
-      gradv_(1,1) = dNx_(0,1)*v_(0,1) + dNx_(1,1)*v_(1,1) + dNx_(2,1)*v_(2,1) + dNx_(3,1)*v_(3,1);
-
-      // - strain (stored symmetric)
-      //   epsdot(i,j) = .5 * ( gradv_(i,j) + gradv_(j,i) )
-      epsdot_(0,0) =        gradv_(0,0);
-      epsdot_(0,1) = .5 * ( gradv_(0,1) + gradv_(1,0) );
-      epsdot_(1,1) =        gradv_(1,1);
-
-      // - constitutive response
-      mat->updated_epsdot(e,k);
-
-      // - assemble to element force
-      //   f(m,j) += dNdx(m,i) * sig(i,j) * vol;
-      for ( size_t m = 0 ; m < nne ; ++m )
-      {
-        f_(m,0) += dNx_(m,0) * sig_(0,0) * vol_(k) + dNx_(m,1) * sig_(1,0) * vol_(k);
-        f_(m,1) += dNx_(m,0) * sig_(0,1) * vol_(k) + dNx_(m,1) * sig_(1,1) * vol_(k);
-      }
-    }
-  }
-
-  // set signals
-  changed_f = true;
-  changed_M = false;
-  changed_D = false;
-}
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_eps(size_t e)
-{
-  cppmat::cartesian2d::tensor2s<double> eps_, tot_eps_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // pointer to integration volume
-  vol_.map(&vol(e));
-
-  // loop over all integration points in element "e"
-  for ( size_t k = 0 ; k < nip ; ++k )
-  {
-    // - pointer to strain tensor (stored symmetric)
-    eps_.map(&mat->eps(e,k));
-
-    // - add to average
-    tot_eps_ += vol_(k) * eps_;
-    tot_vol_ += vol_(k);
-  }
-
-  // return volume average
-  return ( tot_eps_ / tot_vol_ );
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_sig(size_t e)
-{
-  cppmat::cartesian2d::tensor2s<double> sig_, tot_sig_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // pointer to integration volume
-  vol_.map(&vol(e));
-
-  // loop over all integration points in element "e"
-  for ( size_t k = 0 ; k < nip ; ++k )
-  {
-    // - pointer to strain tensor (stored symmetric)
-    sig_.map(&mat->sig(e,k));
-
-    // - add to average
-    tot_sig_ += vol_(k) * sig_;
-    tot_vol_ += vol_(k);
-  }
-
-  // return volume average
-  return ( tot_sig_ / tot_vol_ );
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_eps()
-{
-  cppmat::cartesian2d::tensor2s<double> eps_, tot_eps_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // loop over all elements (in parallel)
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to integration volume
-    vol_.map(&vol(e));
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to strain tensor (stored symmetric)
-      eps_.map(&mat->eps(e,k));
-
-      // - add to average
-      tot_eps_ += vol_(k) * eps_;
-      tot_vol_ += vol_(k);
-    }
-  }
-
-  // return volume average
-  return ( tot_eps_ / tot_vol_ );
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_sig()
-{
-  cppmat::cartesian2d::tensor2s<double> sig_, tot_sig_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // loop over all elements (in parallel)
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to integration volume
-    vol_.map(&vol(e));
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to strain tensor (stored symmetric)
-      sig_.map(&mat->sig(e,k));
-
-      // - add to average
-      tot_sig_ += vol_(k) * sig_;
-      tot_vol_ += vol_(k);
-    }
-  }
-
-  // return volume average
-  return ( tot_sig_ / tot_vol_ );
-}
-
-// -------------------------------------------------------------------------------------------------
-
-} // namespace ...
-
-// =================================================================================================
-
-}}} // namespace ...
-
-// =================================================================================================
-
-#endif
diff --git a/src/GooseFEM/DynamicsDiagonal.h b/src/GooseFEM/DynamicsDiagonal.h
deleted file mode 100644
index 0e0fa5b..0000000
--- a/src/GooseFEM/DynamicsDiagonal.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/* =================================================================================================
-
-(c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
-
-================================================================================================= */
-
-#ifndef GOOSEFEM_DYNAMICS_DIAGONAL_H
-#define GOOSEFEM_DYNAMICS_DIAGONAL_H
-
-// -------------------------------------------------------------------------------------------------
-
-#include "GooseFEM.h"
-
-// ================================= GooseFEM::Dynamics::Diagonal ==================================
-
-namespace GooseFEM {
-namespace Dynamics {
-namespace Diagonal {
-
-// ===================================== Simulation - Periodic =====================================
-
-template<class Element>
-class Periodic
-{
-public:
-
-  // variables
-  // ---------
-
-  // element/quadrature/material definition
-  std::unique_ptr<Element> elem;
-
-  // mesh: nodal position/displacement/velocity/acceleration, DOF-numbers, connectivity, dimensions
-  size_t nnode, nelem, nne, ndim, ndof;
-  MatS   conn, dofs;
-  MatD   x, u, v, a;
-
-  // linear system: columns (also "M" and "D" which are diagonal)
-  ColD   M, Minv, D, F, V, V_n, A, A_n;
-
-  // time integration
-  double dt, t=0.0;
-
-  // constructor
-  // -----------
-
-  Periodic(){};
-
-  Periodic(
-    std::unique_ptr<Element> elem, const MatD &x, const MatS &conn, const MatS &dofs, double dt
-  );
-
-  // functions
-  // ---------
-
-  void velocityVerlet();           // one time step of the time integrator
-  void Verlet();                   // one time step of the time integrator (no velocity dependence)
-  void ground();                   // ground the system: set A = V = 0 (also for their history)
-  void updated_x();                // process update in "x"
-  void updated_u(bool init=false); // process update in "u", if init all possible updates are made
-  void updated_v(bool init=false); // process update in "v", if init all possible updates are made
-  void assemble_M();               // assemble the mass matrix from the element mass matrices
-  void assemble_D();               // assemble the damping matrix from the element damping matrices
-  void assemble_F();               // assemble the force from the element forces
-};
-
-// =================================== Simulation - SemiPeriodic ===================================
-
-template<class Element>
-class SemiPeriodic
-{
-public:
-
-  // variables
-  // ---------
-
-  // element/quadrature/material definition
-  std::unique_ptr<Element> elem;
-
-  // mesh: nodal position/displacement/velocity/acceleration, DOF-numbers, connectivity, dimensions
-  size_t nnode, nelem, nne, ndim, ndof;
-  MatS   conn, dofs;
-  MatD   x, u, v, a;
-
-  // fixed DOFs: prescribed velocity/acceleration, DOF-numbers, dimensions
-  size_t nfixed;
-  ColS   fixedDofs;
-  ColD   fixedV, fixedA;
-
-  // linear system: columns (also "M" and "D" which are diagonal)
-  ColD   M, Minv, D, F, V, V_n, A, A_n;
-
-  // time integration
-  double dt, t=0.0;
-
-  // constructor
-  // -----------
-
-  SemiPeriodic(){};
-
-  SemiPeriodic(
-    std::unique_ptr<Element> elem, const MatD &x, const MatS &conn, const MatS &dofs,
-    const ColS &fixedDofs, double dt
-  );
-
-  // functions
-  // ---------
-
-  void velocityVerlet();           // one time step of the time integrator
-  void Verlet();                   // one time step of the time integrator (no velocity dependence)
-  void ground();                   // ground the system: set A = V = 0 (also for their history)
-  void updated_x();                // process update in "x"
-  void updated_u(bool init=false); // process update in "u", if init all possible updates are made
-  void updated_v(bool init=false); // process update in "v", if init all possible updates are made
-  void assemble_M();               // assemble the mass matrix from the element mass matrices
-  void assemble_D();               // assemble the damping matrix from the element damping matrices
-  void assemble_F();               // assemble the force from the element forces
-};
-
-// ======================================== Element - Quad4 ========================================
-
-namespace SmallStrain {
-
-template<class Material>
-class Quad4
-{
-public:
-
-  // class variables
-  // ---------------
-
-  // constitutive response per integration point, per element
-  std::unique_ptr<Material> mat;
-
-  // matrices to store the element data
-  cppmat::matrix<double> x, u, v, f, M, D, dNx, dNxi, w, vol, dNxi_n, w_n, vol_n;
-
-  // dimensions
-  size_t nelem, nip=4, nne=4, ndim=2;
-
-  // signals for solvers
-  bool changed_f=false, changed_M=true, changed_D=true;
-
-  // class functions
-  // ---------------
-
-  // constructor
-  Quad4(std::unique_ptr<Material> mat, size_t nelem);
-
-  // recompute relevant quantities after "x", "u", or "v" have been externally updated
-  void updated_x();
-  void updated_u();
-  void updated_v();
-
-  // return volume average stress and strain
-  cppmat::cartesian2d::tensor2s<double> mean_eps(size_t e);   // of element "e"
-  cppmat::cartesian2d::tensor2s<double> mean_sig(size_t e);   // of element "e"
-  cppmat::cartesian2d::tensor2s<double> mean_eps();           // of all elements
-  cppmat::cartesian2d::tensor2s<double> mean_sig();           // of all elements
-};
-
-} // namespace ...
-
-// =================================================================================================
-
-}}} // namespace ...
-
-// =================================================================================================
-
-#endif
diff --git a/src/GooseFEM/ElementQuad4.cpp b/src/GooseFEM/ElementQuad4.cpp
index 1b39bc1..011f596 100644
--- a/src/GooseFEM/ElementQuad4.cpp
+++ b/src/GooseFEM/ElementQuad4.cpp
@@ -1,639 +1,622 @@
 /* =================================================================================================
 
 (c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
 
 ================================================================================================= */
 
 #ifndef GOOSEFEM_ELEMENTQUAD4_CPP
 #define GOOSEFEM_ELEMENTQUAD4_CPP
 
 // -------------------------------------------------------------------------------------------------
 
 #include "ElementQuad4.h"
 
 // =================================== GooseFEM::Element::Quad4 ====================================
 
 namespace GooseFEM {
 namespace Element {
 namespace Quad4 {
 
 // ================================ GooseFEM::Element::Quad4::Gauss ================================
 
 namespace Gauss {
 
 // --------------------------------- number of integration points ----------------------------------
 
 inline size_t nip()
 {
   return 4;
 }
 
 // ----------------------- integration point coordinates (local coordinates) -----------------------
 
 inline ArrD xi()
 {
   size_t nip  = 4;
   size_t ndim = 2;
 
   ArrD xi({nip,ndim});
 
   xi(0,0) = -1./std::sqrt(3.);    xi(0,1) = -1./std::sqrt(3.);
   xi(1,0) = +1./std::sqrt(3.);    xi(1,1) = -1./std::sqrt(3.);
   xi(2,0) = +1./std::sqrt(3.);    xi(2,1) = +1./std::sqrt(3.);
   xi(3,0) = -1./std::sqrt(3.);    xi(3,1) = +1./std::sqrt(3.);
 
   return xi;
 }
 
 // ----------------------------------- integration point weights -----------------------------------
 
 inline ArrD w()
 {
   size_t nip = 4;
 
   ArrD w({nip});
 
   w(0) = 1.;
   w(1) = 1.;
   w(2) = 1.;
   w(3) = 1.;
 
   return w;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }
 
 // ================================ GooseFEM::Element::Quad4::Nodal ================================
 
 namespace Nodal {
 
 // --------------------------------- number of integration points ----------------------------------
 
 inline size_t nip()
 {
   return 4;
 }
 
 // ----------------------- integration point coordinates (local coordinates) -----------------------
 
 inline ArrD xi()
 {
   size_t nip  = 4;
   size_t ndim = 2;
 
   ArrD xi({nip,ndim});
 
   xi(0,0) = -1.;    xi(0,1) = -1.;
   xi(1,0) = +1.;    xi(1,1) = -1.;
   xi(2,0) = +1.;    xi(2,1) = +1.;
   xi(3,0) = -1.;    xi(3,1) = +1.;
 
   return xi;
 }
 
 // ----------------------------------- integration point weights -----------------------------------
 
 inline ArrD w()
 {
   size_t nip = 4;
 
   ArrD w({nip});
 
   w(0) = 1.;
   w(1) = 1.;
   w(2) = 1.;
   w(3) = 1.;
 
   return w;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }
 
 // =================================================================================================
 
 // ------------------------------------------ constructor ------------------------------------------
 
 inline Quadrature::Quadrature(const ArrD &x, const ArrD &xi, const ArrD &w)
 : m_x(x), m_w(w), m_xi(xi)
 {
   // check input
   assert( m_x.rank()   == 3      ); // shape: [nelem, nne, ndim]
   assert( m_x.shape(1) == m_nne  ); // number of nodes per element
   assert( m_x.shape(2) == m_ndim ); // number of dimensions
 
   // extract number of elements
   m_nelem = m_x.shape(0);
 
   // integration scheme
   // - default
   if ( m_w.size() == 0 and m_xi.size() == 0 )
   {
     m_nip = Gauss::nip();
     m_xi  = Gauss::xi();
     m_w   = Gauss::w();
   }
   // - input
   else if ( m_w.size() > 0 and m_xi.size() > 0 )
   {
     m_nip = m_w.size();
   }
   // - unknown
   else
   {
     throw std::runtime_error("Input integration point coordinates and weights");
   }
 
   // check input
   assert( m_xi.rank()   == 2      ); // shape: [nip, ndim]
   assert( m_xi.shape(0) == m_nip  ); // number of integration points
   assert( m_xi.shape(1) == m_ndim ); // number of dimensions
   assert( m_w .rank()   == 1      ); // shape: [nip]
   assert( m_w .size()   == m_nip  ); // number of integration points
 
   // allocate arrays
   // - shape functions
   m_N.resize({m_nip,m_nne});
   // - shape function gradients in local coordinates
   m_dNxi.resize({m_nip,m_nne,m_ndim});
   // - shape function gradients in global coordinates
   m_dNx.resize({m_nelem,m_nip,m_nne,m_ndim});
   // - integration point volume
   m_vol.resize({m_nelem,m_nip});
 
   // shape functions
   for ( size_t k = 0 ; k < m_nip ; ++k )
   {
     m_N(k,0) = .25 * (1.-m_xi(k,0)) * (1.-m_xi(k,1));
     m_N(k,1) = .25 * (1.+m_xi(k,0)) * (1.-m_xi(k,1));
     m_N(k,2) = .25 * (1.+m_xi(k,0)) * (1.+m_xi(k,1));
     m_N(k,3) = .25 * (1.-m_xi(k,0)) * (1.+m_xi(k,1));
   }
 
   // shape function gradients in local coordinates
   for ( size_t k = 0 ; k < m_nip ; ++k )
   {
     m_dNxi(k,0,0) = -.25*(1.-m_xi(k,1));    m_dNxi(k,0,1) = -.25*(1.-m_xi(k,0));
     m_dNxi(k,1,0) = +.25*(1.-m_xi(k,1));    m_dNxi(k,1,1) = -.25*(1.+m_xi(k,0));
     m_dNxi(k,2,0) = +.25*(1.+m_xi(k,1));    m_dNxi(k,2,1) = +.25*(1.+m_xi(k,0));
     m_dNxi(k,3,0) = -.25*(1.+m_xi(k,1));    m_dNxi(k,3,1) = +.25*(1.-m_xi(k,0));
   }
 
   // compute the shape function gradients, based on "x"
   compute_dN();
 }
 
 // --------------------------- integration volume (per tensor-component) ---------------------------
 
 inline ArrD Quadrature::dV(size_t ncomp) const
 {
   if ( ncomp == 0 ) return m_vol;
 
   ArrD out = ArrD::Zero({m_nelem, m_nip, ncomp});
 
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
     for ( size_t k = 0 ; k < m_nip ; ++k )
       for ( size_t i = 0 ; i < ncomp ; ++i )
         out(e,k,i) = m_vol(e,k);
 
   return out;
 }
 
 // -------------------------------------- number of elements ---------------------------------------
 
 inline size_t Quadrature::nelem() const
 {
   return m_nelem;
 }
 
 // ---------------------------------- number of nodes per element ----------------------------------
 
 inline size_t Quadrature::nne() const
 {
   return m_nne;
 }
 
 // ------------------------------------- number of dimensions --------------------------------------
 
 inline size_t Quadrature::ndim() const
 {
   return m_ndim;
 }
 
 // --------------------------------- number of integration points ----------------------------------
 
 inline size_t Quadrature::nip() const
 {
   return m_nip;
 }
 
 // --------------------------------------- update positions ----------------------------------------
 
 inline void Quadrature::update_x(const ArrD &x)
 {
   // check input
   assert( x.rank()   == 3          ); // shape: [nelem, nne, ndim]
   assert( x.shape(0) == m_nelem    ); // number of elements
   assert( x.shape(1) == m_nne      ); // number of nodes per element
   assert( x.shape(2) == m_ndim     ); // number of dimensions
   assert( x.size()   == m_x.size() ); // total number of components (redundant)
 
   // update positions
   m_x.setCopy(x.begin(), x.end());
 
   // update the shape function gradients for the new "x"
   compute_dN();
 }
 
 // ------------------------ shape function gradients in global coordinates -------------------------
 
 inline void Quadrature::compute_dN()
 {
   #pragma omp parallel
   {
     // intermediate quantities and local views
-    double Jdet, w, vol;
+    double Jdet;
     cppmat::tiny::matrix<double,m_nne,m_ndim> dNx;
     cppmat::view::matrix<double,m_nne,m_ndim> dNxi, x;
     cppmat::tiny::cartesian::tensor2<double,2> J, Jinv;
 
     // loop over all elements (in parallel)
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
-      // alias
-      x.setMap(&m_x(e)); // nodal positions
+      // alias nodal positions
+      x.setMap(&m_x(e));
 
       // loop over integration points
       for ( size_t k = 0 ; k < m_nip ; ++k )
       {
-        // - alias
-        dNxi.setMap(&m_dNxi(  k)); // shape function gradients (local coordinates)
-        vol =        m_vol (e,k);  // volume
-        w   =        m_w   (  k);  // weight factor
+        // - alias shape function gradients (local coordinates)
+        dNxi.setMap(&m_dNxi(k));
 
         // - Jacobian (loops unrolled for efficiency)
         //   J(i,j) += dNxi(m,i) * xe(m,j)
         J(0,0) = dNxi(0,0)*x(0,0) + dNxi(1,0)*x(1,0) + dNxi(2,0)*x(2,0) + dNxi(3,0)*x(3,0);
         J(0,1) = dNxi(0,0)*x(0,1) + dNxi(1,0)*x(1,1) + dNxi(2,0)*x(2,1) + dNxi(3,0)*x(3,1);
         J(1,0) = dNxi(0,1)*x(0,0) + dNxi(1,1)*x(1,0) + dNxi(2,1)*x(2,0) + dNxi(3,1)*x(3,0);
         J(1,1) = dNxi(0,1)*x(0,1) + dNxi(1,1)*x(1,1) + dNxi(2,1)*x(2,1) + dNxi(3,1)*x(3,1);
 
         // - determinant and inverse of the Jacobian
         Jdet = J.det();
         Jinv = J.inv();
 
-        // - integration point volume
-        vol = w * Jdet;
-
         // - shape function gradients wrt global coordinates (loops partly unrolled for efficiency)
         //   dNx(m,i) += Jinv(i,j) * dNxi(m,j)
         for ( size_t m = 0 ; m < m_nne ; ++m )
         {
           dNx(m,0) = Jinv(0,0) * dNxi(m,0) + Jinv(0,1) * dNxi(m,1);
           dNx(m,1) = Jinv(1,0) * dNxi(m,0) + Jinv(1,1) * dNxi(m,1);
         }
 
         // - copy to matrix: shape function gradients (global coordinates)
-        std::copy(dNx.begin(), dNx.end(), m_dNx.item(e,k));
+        dNx.copyTo(m_dNx.item(e,k));
 
         // - copy to matrix: integration point volume
-        m_vol(e,k) = vol;
+        m_vol(e,k) = m_w(k) * Jdet;
       }
     }
   } // #pragma omp parallel
 }
 
 // ------------------- dyadic product "qtensor(i,j) = dNdx(m,i) * elemvec(m,j)" --------------------
 
 template<class T>
 inline ArrD Quadrature::gradN_vector(const ArrD &elemvec) const
 {
   // check input
   assert( elemvec.rank()   == 3       ); // shape: [nelem, nne, ndim]
   assert( elemvec.shape(0) == m_nelem ); // number of elements
   assert( elemvec.shape(1) == m_nne   ); // number of nodes per element
   assert( elemvec.shape(2) == m_ndim  ); // number of dimensions
 
-  // temporary tensor, to deduce size of the output
-  T dummytensor;
-  size_t tdim = dummytensor.size();
-
   // zero-initialize output: matrix of tensors
-  ArrD qtensor = ArrD::Zero({m_nelem, m_nip, tdim});
+  ArrD qtensor = ArrD::Zero({m_nelem, m_nip, T::Size()});
 
   #pragma omp parallel
   {
     // intermediate quantities and local views
     T gradu;
     cppmat::view::matrix<double,m_nne,m_ndim> dNx, u;
 
     // loop over all elements (in parallel)
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
-      // alias
-      u.setMap(&elemvec(e)); // element vector (e.g. nodal displacements)
+      // alias element vector (e.g. nodal displacements)
+      u.setMap(&elemvec(e));
 
       // loop over all integration points in element "e"
       for ( size_t k = 0 ; k < m_nip ; ++k )
       {
-        // - alias
-        dNx.setMap(&m_dNx(e,k)); // shape function gradients (global coordinates)
+        // - alias shape function gradients (local coordinates)
+        dNx.setMap(&m_dNx(e,k));
 
         // - evaluate dyadic product (loops unrolled for efficiency)
         //   gradu(i,j) += dNx(m,i) * ue(m,j)
         gradu(0,0) = dNx(0,0)*u(0,0) + dNx(1,0)*u(1,0) + dNx(2,0)*u(2,0) + dNx(3,0)*u(3,0);
         gradu(0,1) = dNx(0,0)*u(0,1) + dNx(1,0)*u(1,1) + dNx(2,0)*u(2,1) + dNx(3,0)*u(3,1);
         gradu(1,0) = dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0);
         gradu(1,1) = dNx(0,1)*u(0,1) + dNx(1,1)*u(1,1) + dNx(2,1)*u(2,1) + dNx(3,1)*u(3,1);
 
         // - copy resulting integration point tensor
         std::copy(gradu.begin(), gradu.end(), qtensor.item(e,k));
       }
     }
   } // #pragma omp parallel
 
   return qtensor;
 }
 
 // ---------------------------------- transpose of "GradN_vector" ----------------------------------
 
 template<class T>
 inline ArrD Quadrature::gradN_vector_T(const ArrD &elemvec) const
 {
   // check input
   assert( elemvec.rank()   == 3       ); // shape: [nelem, nne, ndim]
   assert( elemvec.shape(0) == m_nelem ); // number of elements
   assert( elemvec.shape(1) == m_nne   ); // number of nodes per element
   assert( elemvec.shape(2) == m_ndim  ); // number of dimensions
 
-  // temporary tensor, to deduce size of the output
-  T dummytensor;
-  size_t tdim = dummytensor.size();
-
   // zero-initialize output: matrix of tensors
-  ArrD qtensor = ArrD::Zero({m_nelem, m_nip, tdim});
+  ArrD qtensor = ArrD::Zero({m_nelem, m_nip, T::Size()});
 
   #pragma omp parallel
   {
     // intermediate quantities and local views
     T gradu;
     cppmat::view::matrix<double,m_nne,m_ndim> dNx, u;
 
     // loop over all elements (in parallel)
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
-      // alias
-      u.setMap(&elemvec(e)); // element vector (e.g. nodal displacements)
+      // alias element vector (e.g. nodal displacements)
+      u.setMap(&elemvec(e));
 
       // loop over all integration points in element "e"
       for ( size_t k = 0 ; k < m_nip ; ++k )
       {
-        // - alias
-        dNx.setMap(&m_dNx(e,k)); // shape function gradients (global coordinates)
+        // - alias shape function gradients (global coordinates)
+        dNx.setMap(&m_dNx(e,k));
 
         // - evaluate dyadic product (loops unrolled for efficiency)
         //   gradu(j,i) += dNx(m,i) * ue(m,j)
         gradu(0,0) = dNx(0,0)*u(0,0) + dNx(1,0)*u(1,0) + dNx(2,0)*u(2,0) + dNx(3,0)*u(3,0);
         gradu(1,0) = dNx(0,0)*u(0,1) + dNx(1,0)*u(1,1) + dNx(2,0)*u(2,1) + dNx(3,0)*u(3,1);
         gradu(0,1) = dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0);
         gradu(1,1) = dNx(0,1)*u(0,1) + dNx(1,1)*u(1,1) + dNx(2,1)*u(2,1) + dNx(3,1)*u(3,1);
 
         // - copy resulting integration point tensor
         std::copy(gradu.begin(), gradu.end(), qtensor.item(e,k));
       }
     }
   } // #pragma omp parallel
 
   return qtensor;
 }
 
 // ------------------------------- symmetric part of "GradN_vector" --------------------------------
 
 template<class T>
 inline ArrD Quadrature::symGradN_vector(const ArrD &elemvec) const
 {
   // check input
   assert( elemvec.rank()   == 3       ); // shape: [nelem, nne, ndim]
   assert( elemvec.shape(0) == m_nelem ); // number of elements
   assert( elemvec.shape(1) == m_nne   ); // number of nodes per element
   assert( elemvec.shape(2) == m_ndim  ); // number of dimensions
 
-  // temporary tensor, to deduce size of the output
-  T dummytensor;
-  size_t tdim = dummytensor.size();
-
   // zero-initialize output: matrix of tensors
-  ArrD qtensor = ArrD::Zero({m_nelem, m_nip, tdim});
+  ArrD qtensor = ArrD::Zero({m_nelem, m_nip, T::Size()});
 
   #pragma omp parallel
   {
     // intermediate quantities and local views
     T eps;
     cppmat::tiny::cartesian::tensor2<double,m_ndim> gradu;
     cppmat::view::matrix<double,m_nne,m_ndim> dNx, u;
 
     // loop over all elements (in parallel)
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
-      // alias
-      u.setMap(&elemvec(e)); // element vector (e.g. nodal displacements)
+      // alias element vector (e.g. nodal displacements)
+      u.setMap(&elemvec(e));
 
       // loop over all integration points in element "e"
       for ( size_t k = 0 ; k < m_nip ; ++k )
       {
-        // - alias
-        dNx.setMap(&m_dNx(e,k)); // shape function gradients (global coordinates)
+        // - alias shape function gradients (global coordinates)
+        dNx.setMap(&m_dNx(e,k));
 
         // - evaluate dyadic product (loops unrolled for efficiency)
         //   gradu(i,j) += dNx(m,i) * ue(m,j)
         gradu(0,0) = dNx(0,0)*u(0,0) + dNx(1,0)*u(1,0) + dNx(2,0)*u(2,0) + dNx(3,0)*u(3,0);
         gradu(0,1) = dNx(0,0)*u(0,1) + dNx(1,0)*u(1,1) + dNx(2,0)*u(2,1) + dNx(3,0)*u(3,1);
         gradu(1,0) = dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0);
         gradu(1,1) = dNx(0,1)*u(0,1) + dNx(1,1)*u(1,1) + dNx(2,1)*u(2,1) + dNx(3,1)*u(3,1);
 
         // - symmetrize (loops unrolled for efficiency)
         //   eps(i,j) = .5 * ( gradu(i,j) + gradu(j,i) )
         eps(0,0) =        gradu(0,0);
         eps(0,1) = .5 * ( gradu(0,1) + gradu(1,0) );
         eps(1,0) =        eps  (0,1);
         eps(1,1) =        gradu(1,1);
 
         // - copy resulting integration point tensor
         std::copy(eps.begin(), eps.end(), qtensor.item(e,k));
       }
     }
   } // #pragma omp parallel
 
   return qtensor;
 }
 
 // ------- scalar product "elemmat(m*ndim+i,n*ndim+i) = N(m) * qscalar * N(n)"; for all "i" --------
 
 inline ArrD Quadrature::int_N_scalar_NT_dV(const ArrD &qscalar) const
 {
   // check input
   assert( qscalar.rank()   == 2          ); // shape: [nelem, nip]
   assert( qscalar.shape(0) == m_nelem    ); // number of elements
   assert( qscalar.shape(1) == m_nip      ); // number of integration points
 
   // zero-initialize: matrix of matrices
   ArrD elemmat = ArrD::Zero({m_nelem, m_nne*m_ndim, m_nne*m_ndim});
 
   #pragma omp parallel
   {
     // intermediate quantities and local views
     cppmat::tiny::matrix<double,m_nne*m_ndim,m_nne*m_ndim> M;
     cppmat::view::vector<double,m_nne> N;
     double rho, vol;
 
     // loop over all elements (in parallel)
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
       // zero-initialize (e.g. mass matrix)
       M.setZero();
 
       // loop over all integration points in element "e"
       for ( size_t k = 0 ; k < m_nip ; ++k )
       {
+        // - alias shape functions
+        N.setMap(&m_N(k));
+
         // - alias
-        N.setMap(&m_N(  k)); // shape functions
         vol = m_vol  (e,k);  // integration point volume
         rho = qscalar(e,k);  // integration point scalar (e.g. density)
 
         // - evaluate scalar product, for all dimensions, and assemble
         //   M(m*ndim+i,n*ndim+i) += N(m) * scalar * N(n) * dV
         for ( size_t m = 0 ; m < m_nne ; ++m ) {
           for ( size_t n = 0 ; n < m_nne ; ++n ) {
             M(m*m_ndim+0, n*m_ndim+0) += N(m) * rho * N(n) * vol;
             M(m*m_ndim+1, n*m_ndim+1) += N(m) * rho * N(n) * vol;
           }
         }
       }
 
       // copy result to element matrix
       std::copy(M.begin(), M.end(), elemmat.item(e));
     }
   } // #pragma omp parallel
 
   return elemmat;
 }
 
 // ------------ integral of dot product "elemvec(m,j) += dNdx(m,i) * qtensor(i,j) * dV" ------------
 
 template<class T>
 inline ArrD Quadrature::int_gradN_dot_tensor2_dV(const ArrD &qtensor) const
 {
-  #ifndef NDEBUG
-
-    // dummy variable
-    T tmp;
-
-    // check input
-    assert( qtensor.rank()   == 3          ); // shape: [nelem, nip, #tensor-components]
-    assert( qtensor.shape(0) == m_nelem    ); // number of elements
-    assert( qtensor.shape(1) == m_nip      ); // number of integration points
-    assert( qtensor.shape(2) == tmp.size() ); // tensor dimensions
-
-  #endif
+  // check input
+  assert( qtensor.rank()   == 3         ); // shape: [nelem, nip, #tensor-components]
+  assert( qtensor.shape(0) == m_nelem   ); // number of elements
+  assert( qtensor.shape(1) == m_nip     ); // number of integration points
+  assert( qtensor.shape(2) == T::Size() ); // tensor dimensions
 
   // zero-initialize output: matrix of vectors
   ArrD elemvec = ArrD::Zero({m_nelem, m_nne, m_ndim});
 
   #pragma omp parallel
   {
     // intermediate quantities and local views
     cppmat::view::matrix<double,m_nne,m_ndim> dNx;
     cppmat::tiny::matrix<double,m_nne,m_ndim> f;
     double vol;
     T sig;
 
     // loop over all elements (in parallel)
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
       // zero-initialize (e.g. nodal force)
       f.setZero();
 
       // loop over all integration points in element "e"
       for ( size_t k = 0 ; k < m_nip ; ++k )
       {
         // - alias
         dNx.setMap (&m_dNx  (e,k)); // shape function gradients (global coordinates)
         sig.setCopy(&qtensor(e,k)); // integration point tensor (e.g. stress)
         vol = m_vol         (e,k);  // integration point volume
 
         // - evaluate dot product, and assemble (loops partly unrolled for efficiency)
         //   f(m,j) += dNdx(m,i) * sig(i,j) * dV;
         for ( size_t m = 0 ; m < m_nne ; ++m )
         {
           f(m,0) += ( dNx(m,0) * sig(0,0) + dNx(m,1) * sig(1,0) ) * vol;
           f(m,1) += ( dNx(m,0) * sig(0,1) + dNx(m,1) * sig(1,1) ) * vol;
         }
       }
 
       // copy result to element vector
       std::copy(f.begin(), f.end(), elemvec.item(e));
     }
   } // #pragma omp parallel
 
   return elemvec;
 }
 
 // ---------------------- wrappers with default storage (no template needed) -----------------------
 
 inline ArrD Quadrature::gradN_vector(const ArrD &elemvec) const
 {
   return gradN_vector<cppmat::tiny::cartesian::tensor2<double,2>>(elemvec);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline ArrD Quadrature::gradN_vector_T(const ArrD &elemvec) const
 {
   return gradN_vector_T<cppmat::tiny::cartesian::tensor2<double,2>>(elemvec);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline ArrD Quadrature::symGradN_vector(const ArrD &elemvec) const
 {
   return symGradN_vector<cppmat::tiny::cartesian::tensor2s<double,2>>(elemvec);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline ArrD Quadrature::int_gradN_dot_tensor2_dV(const ArrD &qtensor) const
 {
   assert( qtensor.rank() == 3 ); // shape: [nelem, nip, #tensor-components]
 
   if ( qtensor.shape(2) == m_ndim*m_ndim )
+
     return int_gradN_dot_tensor2_dV<cppmat::tiny::cartesian::tensor2<double,2>>(qtensor);
+
   else if ( qtensor.shape(2) == (m_ndim+1)*m_ndim/2 )
+
     return int_gradN_dot_tensor2_dV<cppmat::tiny::cartesian::tensor2s<double,2>>(qtensor);
+
   else
+
     throw std::runtime_error("assert: qtensor.shape(2) == 4 or qtensor.shape(2) == 3");
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline ArrD Quadrature::int_gradN_dot_tensor2s_dV(const ArrD &qtensor) const
 {
   return int_gradN_dot_tensor2_dV<cppmat::tiny::cartesian::tensor2s<double,2>>(qtensor);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }}} // namespace ...
 
 // =================================================================================================
 
 #endif
diff --git a/src/GooseFEM/OverdampedDynamicsDiagonal.cpp b/src/GooseFEM/OverdampedDynamicsDiagonal.cpp
deleted file mode 100644
index 86589c1..0000000
--- a/src/GooseFEM/OverdampedDynamicsDiagonal.cpp
+++ /dev/null
@@ -1,883 +0,0 @@
-/* =================================================================================================
-
-(c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
-
-================================================================================================= */
-
-#ifndef GOOSEFEM_OVERDAMPEDDYNAMICS_DIAGONAL_CPP
-#define GOOSEFEM_OVERDAMPEDDYNAMICS_DIAGONAL_CPP
-
-// -------------------------------------------------------------------------------------------------
-
-#include "OverdampedDynamicsDiagonal.h"
-
-// ============================ GooseFEM::OverdampedDynamics::Diagonal =============================
-
-namespace GooseFEM {
-namespace OverdampedDynamics {
-namespace Diagonal {
-
-// ===================================== Simulation - Periodic =====================================
-
-template<class Element>
-inline Periodic<Element>::Periodic(
-  std::unique_ptr<Element> _elem, const MatD &_x, const MatS &_conn, const MatS &_dofs, double _dt
-)
-: elem(std::move(_elem)), conn(_conn), dofs(_dofs), x(_x), dt(_dt)
-{
-  // compute sizes
-  nnode = static_cast<size_t>(x.rows());
-  ndim  = static_cast<size_t>(x.cols());
-  nelem = static_cast<size_t>(conn.rows());
-  nne   = static_cast<size_t>(conn.cols());
-  ndof  = dofs.maxCoeff()+1;
-
-  // basic checks (mostly the user is 'trusted')
-  assert( static_cast<size_t>(dofs.size()) == nnode * ndim );
-  assert(                     ndof         <  nnode * ndim );
-
-  // allocate and zero-initialize nodal quantities
-  u  .conservativeResize(nnode,ndim); u  .setZero();
-  u_n.conservativeResize(nnode,ndim); u_n.setZero();
-  v  .conservativeResize(nnode,ndim); v  .setZero();
-
-  // allocate and zero-initialize linear system (DOFs)
-  D   .conservativeResize(ndof);
-  Dinv.conservativeResize(ndof);
-  F   .conservativeResize(ndof);
-  V   .conservativeResize(ndof);
-
-  // initialize all fields
-  updated_x();
-  updated_u(true);
-  updated_v(true);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::forwardEuler()
-{
-  // (1) compute the velocities
-  // - solve for velocities (DOFs)
-  V.noalias() = Dinv.cwiseProduct( - F );
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-
-  // (2) update positions
-  u += dt * v;
-
-  // process update in displacements and velocities
-  updated_u();
-  updated_v();
-
-  // update time
-  t += dt;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::midpoint()
-{
-  // back-up history
-  u_n = u;
-
-  // (1) compute trial state
-  // - solve for trial velocities (DOFs)
-  V.noalias() = Dinv.cwiseProduct( - F );
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - update to trial positions
-  u = u_n + 0.5 * dt * v;
-  // - process update in displacements and velocities
-  updated_u();
-  updated_v();
-
-  // (2) compute actual state
-  // - solve for velocities (DOFs)
-  V.noalias() = Dinv.cwiseProduct( - F );
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - update to trial positions
-  u = u_n + dt * v;
-  // - process update in displacements and velocities
-  updated_u();
-  updated_v();
-
-  // update time
-  t += dt;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::updated_x()
-{
-  // set the nodal positions of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->x(e,m,i) = x(conn(e,m),i);
-
-  // signal update
-  elem->updated_x();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::updated_u(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->u(e,m,i) = u(conn(e,m),i);
-
-  // signal update
-  elem->updated_u();
-
-  // update
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::updated_v(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->v(e,m,i) = v(conn(e,m),i);
-
-  // signal update
-  elem->updated_v();
-
-  // update
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::assemble_D()
-{
-  // zero-initialize
-  D.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - damping matrix, per thread
-    ColD D_(ndof);
-    D_.setZero();
-
-    // - assemble diagonal damping matrix, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          D_(dofs(conn(e,m),i)) += elem->D(e,m*ndim+i,m*ndim+i);
-
-    // - reduce "D_" per thread to total "D"
-    #pragma omp critical
-      D += D_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-
-  // compute inverse
-  Dinv = D.cwiseInverse();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void Periodic<Element>::assemble_F()
-{
-  // zero-initialize
-  F.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - force, per thread
-    ColD F_(ndof);
-    F_.setZero();
-
-    // - assemble force, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          F_(dofs(conn(e,m),i)) += elem->f(e,m,i);
-
-    // - reduce "F_" per thread to total "F"
-    #pragma omp critical
-      F += F_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-}
-
-// =================================== Simulation - SemiPeriodic ===================================
-
-template<class Element>
-inline SemiPeriodic<Element>::SemiPeriodic(
-  std::unique_ptr<Element> _elem, const MatD &_x, const MatS &_conn, const MatS &_dofs,
-  const ColS &_fixedDofs, double _dt
-)
-: elem(std::move(_elem)), conn(_conn), dofs(_dofs), x(_x), fixedDofs(_fixedDofs), dt(_dt)
-{
-  // compute sizes
-  nfixed    = static_cast<size_t>(fixedDofs.size());
-  nnode     = static_cast<size_t>(x.rows());
-  ndim      = static_cast<size_t>(x.cols());
-  nelem     = static_cast<size_t>(conn.rows());
-  nne       = static_cast<size_t>(conn.cols());
-  ndof      = dofs.maxCoeff()+1;
-
-  // basic checks (mostly the user is 'trusted')
-  assert( static_cast<size_t>(dofs.size()) == nnode * ndim );
-  assert(                     ndof         <  nnode * ndim );
-
-  #ifndef NDEBUG
-  for ( size_t i = 0 ; i < nfixed ; ++i ) assert( fixedDofs(i) < ndof );
-  #endif
-
-  // allocate and zero-initialize nodal quantities
-  u  .conservativeResize(nnode,ndim); u  .setZero();
-  u_n.conservativeResize(nnode,ndim); u_n.setZero();
-  v  .conservativeResize(nnode,ndim); v  .setZero();
-
-  // allocate and zero-initialize linear system (DOFs)
-  D   .conservativeResize(ndof);
-  Dinv.conservativeResize(ndof);
-  F   .conservativeResize(ndof);
-  V   .conservativeResize(ndof);
-
-  // fixed DOFs : default zero velocity
-  fixedV.conservativeResize(nfixed); fixedV.setZero();
-
-  // initialize all fields
-  updated_x();
-  updated_u(true);
-  updated_v(true);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::forwardEuler()
-{
-  // (1) compute the velocities
-  // - solve for velocities (DOFs)
-  V.noalias() = Dinv.cwiseProduct( - F );
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-
-  // (2) update positions
-  u += dt * v;
-
-  // process update in displacements and velocities
-  updated_u();
-  updated_v();
-
-  // update time
-  t += dt;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::midpoint()
-{
-  // back-up history
-  u_n = u;
-
-  // (1) compute trial state
-  // - solve for trial velocities (DOFs)
-  V.noalias() = Dinv.cwiseProduct( - F );
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - update to trial positions
-  u = u_n + 0.5 * dt * v;
-  // - process update in displacements and velocities
-  updated_u();
-  updated_v();
-
-  // (2) compute actual state
-  // - solve for velocities (DOFs)
-  V.noalias() = Dinv.cwiseProduct( - F );
-  // - apply the fixed velocities
-  for ( size_t i=0; i<nfixed; ++i ) V(fixedDofs(i)) = fixedV(i);
-  // - convert to nodal velocities (periodicity implies that several nodes depend on the same DOF)
-  for ( size_t i=0; i<nnode*ndim; ++i ) v(i) = V(dofs(i));
-  // - update to trial positions
-  u = u_n + dt * v;
-  // - process update in displacements and velocities
-  updated_u();
-  updated_v();
-
-  // update time
-  t += dt;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::updated_x()
-{
-  // set the nodal positions of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->x(e,m,i) = x(conn(e,m),i);
-
-  // signal update
-  elem->updated_x();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::updated_u(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->u(e,m,i) = u(conn(e,m),i);
-
-  // signal update
-  elem->updated_u();
-
-  // update
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::updated_v(bool init)
-{
-  // set the nodal displacements of all elements (in parallel)
-  #pragma omp parallel for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-    for ( size_t m = 0 ; m < nne ; ++m )
-      for ( size_t i = 0 ; i < ndim ; ++i )
-        elem->v(e,m,i) = v(conn(e,m),i);
-
-  // signal update
-  elem->updated_v();
-
-  // update
-  if ( elem->changed_D or init ) assemble_D();
-  if ( elem->changed_f or init ) assemble_F();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::assemble_D()
-{
-  // zero-initialize
-  D.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - damping matrix, per thread
-    ColD D_(ndof);
-    D_.setZero();
-
-    // - assemble diagonal damping matrix, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          D_(dofs(conn(e,m),i)) += elem->D(e,m*ndim+i,m*ndim+i);
-
-    // - reduce "D_" per thread to total "D"
-    #pragma omp critical
-      D += D_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-
-  // compute inverse
-  Dinv = D.cwiseInverse();
-}
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Element>
-inline void SemiPeriodic<Element>::assemble_F()
-{
-  // zero-initialize
-  F.setZero();
-
-  // temporarily disable parallelization by Eigen
-  Eigen::setNbThreads(1);
-  // assemble
-  #pragma omp parallel
-  {
-    // - force, per thread
-    ColD F_(ndof);
-    F_.setZero();
-
-    // - assemble force, per thread
-    #pragma omp for
-    for ( size_t e = 0 ; e < nelem ; ++e )
-      for ( size_t m = 0 ; m < nne ; ++m )
-        for ( size_t i = 0 ; i < ndim ; ++i )
-          F_(dofs(conn(e,m),i)) += elem->f(e,m,i);
-
-    // - reduce "F_" per thread to total "F"
-    #pragma omp critical
-      F += F_;
-  }
-  // automatic parallelization by Eigen
-  Eigen::setNbThreads(0);
-}
-
-// ======================================== Element - Quad4 ========================================
-
-namespace SmallStrain {
-
-// -------------------------------------------------------------------------------------------------
-
-template<class Material>
-inline Quad4<Material>::Quad4(std::unique_ptr<Material> _mat, size_t _nelem)
-{
-  // copy from input
-  nelem = _nelem;
-  mat   = std::move(_mat);
-
-  // allocate matrices
-  // -
-  x     .resize({nelem,nne,ndim});
-  u     .resize({nelem,nne,ndim});
-  v     .resize({nelem,nne,ndim});
-  f     .resize({nelem,nne,ndim});
-  // -
-  D     .resize({nelem,nne*ndim,nne*ndim});
-  // -
-  dNx   .resize({nelem,nip,nne,ndim});
-  // -
-  vol   .resize({nelem,nip});
-  vol_n .resize({nelem,nne});
-  // -
-  dNxi  .resize({nip,nne,ndim});
-  dNxi_n.resize({nne,nne,ndim});
-  // -
-  w     .resize({nip});
-  w_n   .resize({nne});
-
-  // zero initialize matrices (only diagonal is written, no new zero-initialization necessary)
-  D.setZero();
-
-  // shape function gradient at all Gauss points, in local coordinates
-  // - k == 0
-  dNxi(0,0,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(0,0,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(0,1,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(0,1,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(0,2,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(0,2,1) = +.25*(1.-1./std::sqrt(3.));
-  dNxi(0,3,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(0,3,1) = +.25*(1.+1./std::sqrt(3.));
-  // - k == 1
-  dNxi(1,0,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(1,0,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(1,1,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(1,1,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(1,2,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(1,2,1) = +.25*(1.+1./std::sqrt(3.));
-  dNxi(1,3,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(1,3,1) = +.25*(1.-1./std::sqrt(3.));
-  // - k == 2
-  dNxi(2,0,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(2,0,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(2,1,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(2,1,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(2,2,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(2,2,1) = +.25*(1.+1./std::sqrt(3.));
-  dNxi(2,3,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(2,3,1) = +.25*(1.-1./std::sqrt(3.));
-  // - k == 3
-  dNxi(3,0,0) = -.25*(1.-1./std::sqrt(3.));     dNxi(3,0,1) = -.25*(1.+1./std::sqrt(3.));
-  dNxi(3,1,0) = +.25*(1.-1./std::sqrt(3.));     dNxi(3,1,1) = -.25*(1.-1./std::sqrt(3.));
-  dNxi(3,2,0) = +.25*(1.+1./std::sqrt(3.));     dNxi(3,2,1) = +.25*(1.-1./std::sqrt(3.));
-  dNxi(3,3,0) = -.25*(1.+1./std::sqrt(3.));     dNxi(3,3,1) = +.25*(1.+1./std::sqrt(3.));
-
-  // integration point weight at all Gauss points
-  w(0) = 1.;
-  w(1) = 1.;
-  w(2) = 1.;
-  w(3) = 1.;
-
-  // shape function gradient at all nodes, in local coordinates
-  // - k == 0
-  dNxi_n(0,0,0) = -0.5;     dNxi_n(0,0,1) = -0.5;
-  dNxi_n(0,1,0) = +0.5;     dNxi_n(0,1,1) =  0.0;
-  dNxi_n(0,2,0) =  0.0;     dNxi_n(0,2,1) =  0.0;
-  dNxi_n(0,3,0) =  0.0;     dNxi_n(0,3,1) = +0.5;
-  // - k == 1
-  dNxi_n(1,0,0) = -0.5;     dNxi_n(1,0,1) =  0.0;
-  dNxi_n(1,1,0) = +0.5;     dNxi_n(1,1,1) = -0.5;
-  dNxi_n(1,2,0) =  0.0;     dNxi_n(1,2,1) = +0.5;
-  dNxi_n(1,3,0) =  0.0;     dNxi_n(1,3,1) =  0.0;
-  // - k == 2
-  dNxi_n(2,0,0) =  0.0;     dNxi_n(2,0,1) =  0.0;
-  dNxi_n(2,1,0) =  0.0;     dNxi_n(2,1,1) = -0.5;
-  dNxi_n(2,2,0) = +0.5;     dNxi_n(2,2,1) = +0.5;
-  dNxi_n(2,3,0) = -0.5;     dNxi_n(2,3,1) =  0.0;
-  // - k == 3
-  dNxi_n(3,0,0) =  0.0;     dNxi_n(3,0,1) = -0.5;
-  dNxi_n(3,1,0) =  0.0;     dNxi_n(3,1,1) =  0.0;
-  dNxi_n(3,2,0) = +0.5;     dNxi_n(3,2,1) =  0.0;
-  dNxi_n(3,3,0) = -0.5;     dNxi_n(3,3,1) = +0.5;
-
-  // integration point weight at all nodes
-  w_n(0) = 1.;
-  w_n(1) = 1.;
-  w_n(2) = 1.;
-  w_n(3) = 1.;
-
-  // Note, the above is a specialization of the following:
-  //
-  // - Shape function gradients
-  //
-  //    dNxi(0,0) = -.25*(1.-xi(k,1)); dNxi(0,1) = -.25*(1.-xi(k,0));
-  //    dNxi(1,0) = +.25*(1.-xi(k,1)); dNxi(1,1) = -.25*(1.+xi(k,0));
-  //    dNxi(2,0) = +.25*(1.+xi(k,1)); dNxi(2,1) = +.25*(1.+xi(k,0));
-  //    dNxi(3,0) = -.25*(1.+xi(k,1)); dNxi(3,1) = +.25*(1.-xi(k,0));
-  //
-  // - Gauss point coordinates and weights
-  //
-  //    xi(0,0) = -1./std::sqrt(3.); xi(0,1) = -1./std::sqrt(3.); w(0) = 1.;
-  //    xi(1,0) = +1./std::sqrt(3.); xi(1,1) = -1./std::sqrt(3.); w(1) = 1.;
-  //    xi(2,0) = +1./std::sqrt(3.); xi(2,1) = +1./std::sqrt(3.); w(2) = 1.;
-  //    xi(3,0) = -1./std::sqrt(3.); xi(3,1) = +1./std::sqrt(3.); w(3) = 1.;
-  //
-  // - Nodal coordinates and weights
-  //
-  //    xi(0,0) = -1.; xi(0,1) = -1.; w(0) = 1.;
-  //    xi(1,0) = +1.; xi(1,1) = -1.; w(1) = 1.;
-  //    xi(2,0) = +1.; xi(2,1) = +1.; w(2) = 1.;
-  //    xi(3,0) = -1.; xi(3,1) = +1.; w(3) = 1.;
-
-}
-
-// =================================================================================================
-
-template<class Material>
-inline void Quad4<Material>::updated_x()
-{
-#pragma omp parallel
-{
-  // intermediate quantities
-  cppmat::cartesian2d::tensor2<double> J_, Jinv_;
-  double Jdet_;
-  // local views of the global arrays (speeds up indexing, and increases readability)
-  cppmat::tiny::matrix2<double,8,8> M_, D_;
-  cppmat::tiny::matrix2<double,4,2> dNxi_, dNx_, x_;
-  cppmat::tiny::vector <double,4>   w_, vol_;
-
-  // loop over all elements (in parallel)
-  #pragma omp for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to element positions
-    x_.map(&x(e));
-
-    // nodal quadrature
-    // ----------------
-
-    // pointer to element damping matrix, nodal volume, and integration weight
-    D_  .map(&D    (e));
-    vol_.map(&vol_n(e));
-    w_  .map(&w_n  (0));
-
-    // loop over nodes, i.e. the integration points
-    for ( size_t k = 0 ; k < nne ; ++k )
-    {
-      // - pointer to the shape function gradients (local coordinates)
-      dNxi_.map(&dNxi_n(k));
-
-      // - Jacobian
-      //   J(i,j) += dNxi(m,i) * xe(m,j)
-      J_(0,0) = dNxi_(0,0)*x_(0,0) + dNxi_(1,0)*x_(1,0) + dNxi_(2,0)*x_(2,0) + dNxi_(3,0)*x_(3,0);
-      J_(0,1) = dNxi_(0,0)*x_(0,1) + dNxi_(1,0)*x_(1,1) + dNxi_(2,0)*x_(2,1) + dNxi_(3,0)*x_(3,1);
-      J_(1,0) = dNxi_(0,1)*x_(0,0) + dNxi_(1,1)*x_(1,0) + dNxi_(2,1)*x_(2,0) + dNxi_(3,1)*x_(3,0);
-      J_(1,1) = dNxi_(0,1)*x_(0,1) + dNxi_(1,1)*x_(1,1) + dNxi_(2,1)*x_(2,1) + dNxi_(3,1)*x_(3,1);
-
-      // - determinant of the Jacobian
-      Jdet_ = J_.det();
-
-      // - integration point volume
-      vol_(k) = w_(k) * Jdet_;
-
-      // - assemble element non-Galilean damping matrix
-      //   D(m+i,n+i) += N(m) * alpha * vol * N(n);
-      D_(k*2  ,k*2  ) = mat->alpha(e,k) * vol_(k);
-      D_(k*2+1,k*2+1) = mat->alpha(e,k) * vol_(k);
-    }
-
-    // Gaussian quadrature
-    // -------------------
-
-    // pointer to element integration volume and weight
-    vol_.map(&vol(e));
-    w_  .map(&w  (0));
-
-    // loop over Gauss points
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to the shape function gradients (local/global coordinates)
-      dNxi_.map(&dNxi(  k));
-      dNx_ .map(&dNx (e,k));
-
-      // - Jacobian
-      //   J(i,j) += dNxi(m,i) * xe(m,j)
-      J_(0,0) = dNxi_(0,0)*x_(0,0) + dNxi_(1,0)*x_(1,0) + dNxi_(2,0)*x_(2,0) + dNxi_(3,0)*x_(3,0);
-      J_(0,1) = dNxi_(0,0)*x_(0,1) + dNxi_(1,0)*x_(1,1) + dNxi_(2,0)*x_(2,1) + dNxi_(3,0)*x_(3,1);
-      J_(1,0) = dNxi_(0,1)*x_(0,0) + dNxi_(1,1)*x_(1,0) + dNxi_(2,1)*x_(2,0) + dNxi_(3,1)*x_(3,0);
-      J_(1,1) = dNxi_(0,1)*x_(0,1) + dNxi_(1,1)*x_(1,1) + dNxi_(2,1)*x_(2,1) + dNxi_(3,1)*x_(3,1);
-
-      // - determinant and inverse of the Jacobian
-      Jdet_ = J_.det();
-      Jinv_ = J_.inv();
-
-      // - integration point volume
-      vol_(k) = w_(k) * Jdet_;
-
-      // - shape function gradients (global coordinates)
-      //   dNx(m,i) += Jinv(i,j) * dNxi(m,j)
-      for ( size_t m = 0 ; m < nne ; ++m )
-      {
-        dNx_(m,0) = Jinv_(0,0) * dNxi_(m,0) + Jinv_(0,1) * dNxi_(m,1);
-        dNx_(m,1) = Jinv_(1,0) * dNxi_(m,0) + Jinv_(1,1) * dNxi_(m,1);
-      }
-    }
-  }
-
-  // set signals
-  changed_f = false;
-  changed_D = true;
-
-} // #pragma omp parallel
-}
-
-// =================================================================================================
-
-template<class Material>
-inline void Quad4<Material>::updated_u()
-{
-#pragma omp parallel
-{
-  // intermediate quantities
-  cppmat::cartesian2d::tensor2 <double> gradu_;
-  cppmat::cartesian2d::tensor2s<double> eps_, sig_;
-  // local views of the global arrays (speeds up indexing, and increases readability)
-  cppmat::tiny::matrix2<double,4,2> dNx_, u_, f_;
-  cppmat::tiny::vector <double,4>   vol_;
-
-  // loop over all elements (in parallel)
-  #pragma omp for
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to element forces, displacements, and integration volume
-    f_  .map(&f  (e));
-    u_  .map(&u  (e));
-    vol_.map(&vol(e));
-
-    // zero initialize forces
-    f_.setZero();
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to the shape function gradients, strain and stress tensor (stored symmetric)
-      dNx_.map(&dNx     (e,k));
-      eps_.map(&mat->eps(e,k));
-      sig_.map(&mat->sig(e,k));
-
-      // - displacement gradient
-      //   gradu_(i,j) += dNx(m,i) * ue(m,j)
-      gradu_(0,0) = dNx_(0,0)*u_(0,0) + dNx_(1,0)*u_(1,0) + dNx_(2,0)*u_(2,0) + dNx_(3,0)*u_(3,0);
-      gradu_(0,1) = dNx_(0,0)*u_(0,1) + dNx_(1,0)*u_(1,1) + dNx_(2,0)*u_(2,1) + dNx_(3,0)*u_(3,1);
-      gradu_(1,0) = dNx_(0,1)*u_(0,0) + dNx_(1,1)*u_(1,0) + dNx_(2,1)*u_(2,0) + dNx_(3,1)*u_(3,0);
-      gradu_(1,1) = dNx_(0,1)*u_(0,1) + dNx_(1,1)*u_(1,1) + dNx_(2,1)*u_(2,1) + dNx_(3,1)*u_(3,1);
-
-      // - strain (stored symmetric)
-      //   eps(i,j) = .5 * ( gradu_(i,j) + gradu_(j,i) )
-      eps_(0,0) =        gradu_(0,0);
-      eps_(0,1) = .5 * ( gradu_(0,1) + gradu_(1,0) );
-      eps_(1,1) =        gradu_(1,1);
-
-      // - constitutive response
-      mat->updated_eps(e,k);
-
-      // - assemble to element force
-      //   f(m,j) += dNx(m,i) * sig(i,j) * vol;
-      for ( size_t m = 0 ; m < nne ; ++m )
-      {
-        f_(m,0) += dNx_(m,0) * sig_(0,0) * vol_(k) + dNx_(m,1) * sig_(1,0) * vol_(k);
-        f_(m,1) += dNx_(m,0) * sig_(0,1) * vol_(k) + dNx_(m,1) * sig_(1,1) * vol_(k);
-      }
-    }
-  }
-
-  // set signals
-  changed_f = true;
-  changed_D = false;
-}
-}
-
-// =================================================================================================
-
-template<class Material>
-inline void Quad4<Material>::updated_v()
-{
-  changed_f = true;
-  changed_D = false;
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_eps(size_t e)
-{
-  cppmat::cartesian2d::tensor2s<double> eps_, tot_eps_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // pointer to integration volume
-  vol_.map(&vol(e));
-
-  // loop over all integration points in element "e"
-  for ( size_t k = 0 ; k < nip ; ++k )
-  {
-    // - pointer to strain tensor (stored symmetric)
-    eps_.map(&mat->eps(e,k));
-
-    // - add to average
-    tot_eps_ += vol_(k) * eps_;
-    tot_vol_ += vol_(k);
-  }
-
-  // return volume average
-  return ( tot_eps_ / tot_vol_ );
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_sig(size_t e)
-{
-  cppmat::cartesian2d::tensor2s<double> sig_, tot_sig_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // pointer to integration volume
-  vol_.map(&vol(e));
-
-  // loop over all integration points in element "e"
-  for ( size_t k = 0 ; k < nip ; ++k )
-  {
-    // - pointer to strain tensor (stored symmetric)
-    sig_.map(&mat->sig(e,k));
-
-    // - add to average
-    tot_sig_ += vol_(k) * sig_;
-    tot_vol_ += vol_(k);
-  }
-
-  // return volume average
-  return ( tot_sig_ / tot_vol_ );
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_eps()
-{
-  cppmat::cartesian2d::tensor2s<double> eps_, tot_eps_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // loop over all elements (in parallel)
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to integration volume
-    vol_.map(&vol(e));
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to strain tensor (stored symmetric)
-      eps_.map(&mat->eps(e,k));
-
-      // - add to average
-      tot_eps_ += vol_(k) * eps_;
-      tot_vol_ += vol_(k);
-    }
-  }
-
-  // return volume average
-  return ( tot_eps_ / tot_vol_ );
-}
-
-// =================================================================================================
-
-template<class Material>
-inline cppmat::cartesian2d::tensor2s<double> Quad4<Material>::mean_sig()
-{
-  cppmat::cartesian2d::tensor2s<double> sig_, tot_sig_(0.0);
-  cppmat::tiny::vector <double,4> vol_;
-  double tot_vol_ = 0.0;
-
-  // loop over all elements (in parallel)
-  for ( size_t e = 0 ; e < nelem ; ++e )
-  {
-    // pointer to integration volume
-    vol_.map(&vol(e));
-
-    // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < nip ; ++k )
-    {
-      // - pointer to strain tensor (stored symmetric)
-      sig_.map(&mat->sig(e,k));
-
-      // - add to average
-      tot_sig_ += vol_(k) * sig_;
-      tot_vol_ += vol_(k);
-    }
-  }
-
-  // return volume average
-  return ( tot_sig_ / tot_vol_ );
-}
-
-// -------------------------------------------------------------------------------------------------
-
-} // namespace ...
-
-// =================================================================================================
-
-}}} // namespace ...
-
-// =================================================================================================
-
-#endif
diff --git a/src/GooseFEM/OverdampedDynamicsDiagonal.h b/src/GooseFEM/OverdampedDynamicsDiagonal.h
deleted file mode 100644
index 0f624d7..0000000
--- a/src/GooseFEM/OverdampedDynamicsDiagonal.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/* =================================================================================================
-
-(c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
-
-================================================================================================= */
-
-#ifndef GOOSEFEM_OVERDAMPEDDYNAMICS_DIAGONAL_H
-#define GOOSEFEM_OVERDAMPEDDYNAMICS_DIAGONAL_H
-
-// -------------------------------------------------------------------------------------------------
-
-#include "GooseFEM.h"
-
-// ============================ GooseFEM::OverdampedDynamics::Diagonal =============================
-
-namespace GooseFEM {
-namespace OverdampedDynamics {
-namespace Diagonal {
-
-// ===================================== Simulation - Periodic =====================================
-
-template<class Element>
-class Periodic
-{
-public:
-
-  // variables
-  // ---------
-
-  // element/quadrature/material definition
-  std::unique_ptr<Element> elem;
-
-  // mesh: nodal position/displacement/velocity, DOF-numbers, connectivity, dimensions
-  size_t nnode, nelem, nne, ndim, ndof;
-  MatS   conn, dofs;
-  MatD   x, u, u_n, v;
-
-  // linear system: columns (also "D" which is diagonal)
-  ColD   D, Dinv, F, V;
-
-  // time integration
-  double dt, t=0.0;
-
-  // constructor
-  // -----------
-
-  Periodic(){};
-
-  Periodic(
-    std::unique_ptr<Element> elem, const MatD &x, const MatS &conn, const MatS &dofs, double dt
-  );
-
-  // functions
-  // ---------
-
-  void forwardEuler();             // one time step of the time integrator
-  void midpoint();                 // one time step of the time integrator
-  void updated_x();                // process update in "x"
-  void updated_u(bool init=false); // process update in "u", if init all possible updates are made
-  void updated_v(bool init=false); // process update in "v", if init all possible updates are made
-  void assemble_D();               // assemble the damping matrix from the element damping matrices
-  void assemble_F();               // assemble the force from the element forces
-};
-
-// =================================== Simulation - SemiPeriodic ===================================
-
-template<class Element>
-class SemiPeriodic
-{
-public:
-
-  // variables
-  // ---------
-
-  // element/quadrature/material definition
-  std::unique_ptr<Element> elem;
-
-  // mesh: nodal position/displacement/velocity, DOF-numbers, connectivity, dimensions
-  size_t nnode, nelem, nne, ndim, ndof;
-  MatS   conn, dofs;
-  MatD   x, u, u_n, v;
-
-  // fixed DOFs: prescribed velocity, DOF-numbers, dimensions
-  size_t nfixed;
-  ColS   fixedDofs;
-  ColD   fixedV;
-
-  // linear system: columns (also "D" which is diagonal)
-  ColD   D, Dinv, F, V;
-
-  // time integration
-  double dt, t=0.0;
-
-  // constructor
-  // -----------
-
-  SemiPeriodic(){};
-
-  SemiPeriodic(
-    std::unique_ptr<Element> elem, const MatD &x, const MatS &conn, const MatS &dofs,
-    const ColS &fixedDofs, double dt
-  );
-
-  // functions
-  // ---------
-
-  void forwardEuler();             // one time step of the time integrator
-  void midpoint();                 // one time step of the time integrator
-  void updated_x();                // process update in "x"
-  void updated_u(bool init=false); // process update in "u", if init all possible updates are made
-  void updated_v(bool init=false); // process update in "v", if init all possible updates are made
-  void assemble_D();               // assemble the damping matrix from the element damping matrices
-  void assemble_F();               // assemble the force from the element forces
-};
-
-// ======================================== Element - Quad4 ========================================
-
-namespace SmallStrain {
-
-template<class Material>
-class Quad4
-{
-public:
-
-  // class variables
-  // ---------------
-
-  // constitutive response per integration point, per element
-  std::unique_ptr<Material> mat;
-
-  // matrices to store the element data
-  cppmat::matrix<double> x, u, v, f, D, dNx, dNxi, w, vol, dNxi_n, w_n, vol_n;
-
-  // dimensions
-  size_t nelem, nip=4, nne=4, ndim=2;
-
-  // signals for solvers
-  bool changed_f=false, changed_D=true;
-
-  // class functions
-  // ---------------
-
-  // constructor
-  Quad4(std::unique_ptr<Material> mat, size_t nelem);
-
-  // recompute relevant quantities after "x", "u", or "v" have been externally updated
-  void updated_x();
-  void updated_u();
-  void updated_v();
-
-  // return volume average stress and strain
-  cppmat::cartesian2d::tensor2s<double> mean_eps(size_t e);   // of element "e"
-  cppmat::cartesian2d::tensor2s<double> mean_sig(size_t e);   // of element "e"
-  cppmat::cartesian2d::tensor2s<double> mean_eps();           // of all elements
-  cppmat::cartesian2d::tensor2s<double> mean_sig();           // of all elements
-};
-
-} // namespace ...
-
-// =================================================================================================
-
-}}} // namespace ...
-
-// =================================================================================================
-
-#endif