diff --git a/include/xGooseFEM/ElementHex8.h b/include/xGooseFEM/ElementHex8.h
index 4c4b25b..f65efbe 100644
--- a/include/xGooseFEM/ElementHex8.h
+++ b/include/xGooseFEM/ElementHex8.h
@@ -1,132 +1,161 @@
 /* =================================================================================================
 
 (c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
 
 ================================================================================================= */
 
 #ifndef XGOOSEFEM_ELEMENTHEX8_H
 #define XGOOSEFEM_ELEMENTHEX8_H
 
 // -------------------------------------------------------------------------------------------------
 
 #include "GooseFEM.h"
 
-// ==================================== GooseFEM::Element::Hex8 ====================================
+// =================================================================================================
 
 namespace xGooseFEM {
 namespace Element {
 namespace Hex8 {
 
-// ======================================== tensor algebra =========================================
+// =================================================================================================
 
 using T2 = xt::xtensor_fixed<double, xt::xshape<3,3>>;
 
 inline double inv(const T2 &A, T2 &Ainv);
 
-// ================================ GooseFEM::Element::Hex8::Gauss =================================
+// =================================================================================================
 
 namespace Gauss {
 inline size_t                nip(); // number of integration points
 inline xt::xtensor<double,2> xi();  // integration point coordinates (local coordinates)
 inline xt::xtensor<double,1> w();   // integration point weights
 }
 
-// ================================ GooseFEM::Element::Hex8::Nodal =================================
+// =================================================================================================
 
 namespace Nodal {
 inline size_t                nip(); // number of integration points
 inline xt::xtensor<double,2> xi();  // integration point coordinates (local coordinates)
 inline xt::xtensor<double,1> w();   // integration point weights
 }
 
 // =================================================================================================
 
 // ------------------------------------------ quadrature -------------------------------------------
 
 class Quadrature
 {
-private:
-
-  // dimensions (flexible)
-  size_t m_nelem; // number of elements
-  size_t m_nip;   // number of integration points
-
-  // dimensions (fixed for this element type)
-  static const size_t m_nne=8;  // number of nodes per element
-  static const size_t m_ndim=3; // number of dimensions
-
-  // data arrays
-  xt::xtensor<double,3> m_x;    // nodal positions stored per element [nelem, nne, ndim]
-  xt::xtensor<double,1> m_w;    // weight of each integration point [nip]
-  xt::xtensor<double,2> m_xi;   // local coordinate of each integration point [nip, ndim]
-  xt::xtensor<double,2> m_N;    // shape functions [nip, nne]
-  xt::xtensor<double,3> m_dNxi; // shape function gradients w.r.t. local  coordinate [nip, nne, ndim]
-  xt::xtensor<double,4> m_dNx;  // shape function gradients w.r.t. global coordinate [nelem, nip, nne, ndim]
-  xt::xtensor<double,2> m_vol;  // integration point volume [nelem, nip]
-
-private:
-
-  // compute "vol" and "dNdx" based on current "x"
-  void compute_dN();
-
 public:
 
-  // convention:
+  // fixed dimensions:
+  //    ndim = 3   -  number of dimensions
+  //    nne  = 8   -  number of nodes per element
+  //
+  // naming convention:
   //    "elemmat"  -  matrices stored per element       -  [nelem, nne*ndim, nne*ndim]
   //    "elemvec"  -  nodal vectors stored per element  -  [nelem, nne, ndim]
   //    "qtensor"  -  integration point tensor          -  [nelem, nip, ndim, ndim]
   //    "qscalar"  -  integration point scalar          -  [nelem, nip]
 
   // constructor: integration point coordinates and weights are optional (default: Gauss)
+
   Quadrature() = default;
-  Quadrature(const xt::xtensor<double,3> &x);
+
+  Quadrature(const xt::xtensor<double,3> &x) : Quadrature(x, Gauss::xi(), Gauss::w()){};
+
   Quadrature(const xt::xtensor<double,3> &x, const xt::xtensor<double,2> &xi, const xt::xtensor<double,1> &w);
 
   // update the nodal positions (shape of "x" should match the earlier definition)
+
   void update_x(const xt::xtensor<double,3> &x);
 
   // return dimensions
-  size_t nelem() const; // number of elements
-  size_t nne()   const; // number of nodes per element
-  size_t ndim()  const; // number of dimension
-  size_t nip()   const; // number of integration points
+
+  size_t nelem() const { return m_nelem; }; // number of elements
+  size_t nne()   const { return m_nne;   }; // number of nodes per element
+  size_t ndim()  const { return m_ndim;  }; // number of dimension
+  size_t nip()   const { return m_nip;   }; // number of integration points
 
   // return integration volume
-  // - in-place
+
   void dV(xt::xtensor<double,2> &qscalar) const;
-  void dV(xt::xtensor<double,4> &qtensor) const;
-  // - return qscalar/qtensor
+
+  void dV(xt::xtensor<double,4> &qtensor) const; // same volume for all tensor components
+
+  // dyadic product "qtensor(i,j) += dNdx(m,i) * elemvec(m,j)", its transpose and its symmetric part
+
+  void gradN_vector(const xt::xtensor<double,3> &elemvec,
+    xt::xtensor<double,4> &qtensor) const;
+
+  void gradN_vector_T(const xt::xtensor<double,3> &elemvec,
+    xt::xtensor<double,4> &qtensor) const;
+
+  void symGradN_vector(const xt::xtensor<double,3> &elemvec,
+    xt::xtensor<double,4> &qtensor) const;
+
+  // integral of the scalar product "elemmat(m*ndim+i,n*ndim+i) += N(m) * qscalar * N(n) * dV"
+
+  void int_N_scalar_NT_dV(const xt::xtensor<double,2> &qscalar,
+    xt::xtensor<double,3> &elemmat) const;
+
+  // integral of the dot product "elemvec(m,j) += dNdx(m,i) * qtensor(i,j) * dV"
+
+  void int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor,
+    xt::xtensor<double,3> &elemvec) const;
+
+  // integral of the dot product "elemmat(m*2+j, n*2+k) += dNdx(m,i) * qtensor(i,j,k,l) * dNdx(n,l) * dV"
+
+  void int_gradN_dot_tensor4_dot_gradNT_dV(const xt::xtensor<double,6> &qtensor,
+    xt::xtensor<double,3> &elemmat) const;
+
+  // auto allocation of the functions above
+
   xt::xtensor<double,2> dV() const;
+
   xt::xtensor<double,4> dVtensor() const;
 
-  // dyadic product "qtensor(i,j) += dNdx(m,i) * elemvec(m,j)", its transpose and its symmetric part
-  // - in-place
-  void gradN_vector   (const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const;
-  void gradN_vector_T (const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const;
-  void symGradN_vector(const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const;
-  // - return qtensor
-  xt::xtensor<double,4> gradN_vector   (const xt::xtensor<double,3> &elemvec) const;
-  xt::xtensor<double,4> gradN_vector_T (const xt::xtensor<double,3> &elemvec) const;
+  xt::xtensor<double,4> gradN_vector(const xt::xtensor<double,3> &elemvec) const;
+
+  xt::xtensor<double,4> gradN_vector_T(const xt::xtensor<double,3> &elemvec) const;
+
   xt::xtensor<double,4> symGradN_vector(const xt::xtensor<double,3> &elemvec) const;
 
-  // integral of the scalar product "elemmat(m*ndim+i,n*ndim+i) += N(m) * qscalar * N(n) * dV"
-  // - in-place
-  void int_N_scalar_NT_dV(const xt::xtensor<double,2> &qscalar, xt::xtensor<double,3> &elemmat) const;
-  // - return elemmat
   xt::xtensor<double,3> int_N_scalar_NT_dV(const xt::xtensor<double,2> &qscalar) const;
 
-  // integral of the dot product "elemvec(m,j) += dNdx(m,i) * qtensor(i,j) * dV"
-  // - in-place
-  void int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor, xt::xtensor<double,3> &elemvec) const;
-  // - return elemvec
   xt::xtensor<double,3> int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor) const;
 
+  xt::xtensor<double,3> int_gradN_dot_tensor4_dot_gradNT_dV(const xt::xtensor<double,6> &qtensor) const;
+
+private:
+
+  // compute "vol" and "dNdx" based on current "x"
+  void compute_dN();
+
+private:
+
+  // dimensions (flexible)
+  size_t m_nelem; // number of elements
+  size_t m_nip;   // number of integration points
+
+  // dimensions (fixed for this element type)
+  static const size_t m_nne=8;  // number of nodes per element
+  static const size_t m_ndim=3; // number of dimensions
+
+  // data arrays
+  xt::xtensor<double,3> m_x;    // nodal positions stored per element [nelem, nne, ndim]
+  xt::xtensor<double,1> m_w;    // weight of each integration point [nip]
+  xt::xtensor<double,2> m_xi;   // local coordinate of each integration point [nip, ndim]
+  xt::xtensor<double,2> m_N;    // shape functions [nip, nne]
+  xt::xtensor<double,3> m_dNxi; // shape function gradients w.r.t. local  coordinate [nip, nne, ndim]
+  xt::xtensor<double,4> m_dNx;  // shape function gradients w.r.t. global coordinate [nelem, nip, nne, ndim]
+  xt::xtensor<double,2> m_vol;  // integration point volume [nelem, nip]
+
 };
 
 // -------------------------------------------------------------------------------------------------
 
 }}} // namespace ...
 
 // =================================================================================================
 
 #endif
diff --git a/include/xGooseFEM/ElementHex8.hpp b/include/xGooseFEM/ElementHex8.hpp
index 1abddbc..f3b4e2a 100644
--- a/include/xGooseFEM/ElementHex8.hpp
+++ b/include/xGooseFEM/ElementHex8.hpp
@@ -1,714 +1,676 @@
 /* =================================================================================================
 
 (c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
 
 ================================================================================================= */
 
 #ifndef XGOOSEFEM_ELEMENTHEX8_CPP
 #define XGOOSEFEM_ELEMENTHEX8_CPP
 
 // -------------------------------------------------------------------------------------------------
 
 #include "ElementHex8.h"
 
-// ==================================== GooseFEM::Element::Hex8 ====================================
+// =================================================================================================
 
 namespace xGooseFEM {
 namespace Element {
 namespace Hex8 {
 
-// ======================================== tensor algebra =========================================
+// =================================================================================================
 
 inline double inv(const T2 &A, T2 &Ainv)
 {
   // compute determinant
-  double det = ( A[0] * A[4] * A[8] +
-                 A[1] * A[5] * A[6] +
-                 A[2] * A[3] * A[7] ) -
-               ( A[2] * A[4] * A[6] +
-                 A[1] * A[3] * A[8] +
-                 A[0] * A[5] * A[7] );
+  double det = ( A(0,0) * A(1,1) * A(2,2) +
+                 A(0,1) * A(1,2) * A(2,0) +
+                 A(0,2) * A(1,0) * A(2,1) ) -
+               ( A(0,2) * A(1,1) * A(2,0) +
+                 A(0,1) * A(1,0) * A(2,2) +
+                 A(0,0) * A(1,2) * A(2,1) );
 
   // compute inverse
-  Ainv[0] = (A[4]*A[8]-A[5]*A[7]) / det;
-  Ainv[1] = (A[2]*A[7]-A[1]*A[8]) / det;
-  Ainv[2] = (A[1]*A[5]-A[2]*A[4]) / det;
-  Ainv[3] = (A[5]*A[6]-A[3]*A[8]) / det;
-  Ainv[4] = (A[0]*A[8]-A[2]*A[6]) / det;
-  Ainv[5] = (A[2]*A[3]-A[0]*A[5]) / det;
-  Ainv[6] = (A[3]*A[7]-A[4]*A[6]) / det;
-  Ainv[7] = (A[1]*A[6]-A[0]*A[7]) / det;
-  Ainv[8] = (A[0]*A[4]-A[1]*A[3]) / det;
+  Ainv(0,0) = (A(1,1)*A(2,2)-A(1,2)*A(2,1)) / det;
+  Ainv(0,1) = (A(0,2)*A(2,1)-A(0,1)*A(2,2)) / det;
+  Ainv(0,2) = (A(0,1)*A(1,2)-A(0,2)*A(1,1)) / det;
+  Ainv(1,0) = (A(1,2)*A(2,0)-A(1,0)*A(2,2)) / det;
+  Ainv(1,1) = (A(0,0)*A(2,2)-A(0,2)*A(2,0)) / det;
+  Ainv(1,2) = (A(0,2)*A(1,0)-A(0,0)*A(1,2)) / det;
+  Ainv(2,0) = (A(1,0)*A(2,1)-A(1,1)*A(2,0)) / det;
+  Ainv(2,1) = (A(0,1)*A(2,0)-A(0,0)*A(2,1)) / det;
+  Ainv(2,2) = (A(0,0)*A(1,1)-A(0,1)*A(1,0)) / det;
 
   return det;
 }
 
-// ================================ GooseFEM::Element::Hex8::Gauss =================================
+// =================================================================================================
 
 namespace Gauss {
 
-// --------------------------------- number of integration points ----------------------------------
+// -------------------------------------------------------------------------------------------------
 
 inline size_t nip()
 {
   return 8;
 }
 
-// ----------------------- integration point coordinates (local coordinates) -----------------------
+// -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,2> xi()
 {
   size_t nip  = 8;
   size_t ndim = 3;
 
   xt::xtensor<double,2> xi = xt::empty<double>({nip,ndim});
 
   xi(0,0) = -1./std::sqrt(3.);    xi(0,1) = -1./std::sqrt(3.);    xi(0,2) = -1./std::sqrt(3.);
   xi(1,0) = +1./std::sqrt(3.);    xi(1,1) = -1./std::sqrt(3.);    xi(1,2) = -1./std::sqrt(3.);
   xi(2,0) = +1./std::sqrt(3.);    xi(2,1) = +1./std::sqrt(3.);    xi(2,2) = -1./std::sqrt(3.);
   xi(3,0) = -1./std::sqrt(3.);    xi(3,1) = +1./std::sqrt(3.);    xi(3,2) = -1./std::sqrt(3.);
   xi(4,0) = -1./std::sqrt(3.);    xi(4,1) = -1./std::sqrt(3.);    xi(4,2) = +1./std::sqrt(3.);
   xi(5,0) = +1./std::sqrt(3.);    xi(5,1) = -1./std::sqrt(3.);    xi(5,2) = +1./std::sqrt(3.);
   xi(6,0) = +1./std::sqrt(3.);    xi(6,1) = +1./std::sqrt(3.);    xi(6,2) = +1./std::sqrt(3.);
   xi(7,0) = -1./std::sqrt(3.);    xi(7,1) = +1./std::sqrt(3.);    xi(7,2) = +1./std::sqrt(3.);
 
   return xi;
 }
 
-// ----------------------------------- integration point weights -----------------------------------
+// -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,1> w()
 {
   size_t nip = 8;
 
   xt::xtensor<double,1> w = xt::empty<double>({nip});
 
   w(0) = 1.;
   w(1) = 1.;
   w(2) = 1.;
   w(3) = 1.;
   w(4) = 1.;
   w(5) = 1.;
   w(6) = 1.;
   w(7) = 1.;
 
   return w;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }
 
-// ================================ GooseFEM::Element::Hex8::Nodal ================================
+// =================================================================================================
 
 namespace Nodal {
 
-// --------------------------------- number of integration points ----------------------------------
+// -------------------------------------------------------------------------------------------------
 
 inline size_t nip()
 {
   return 8;
 }
 
-// ----------------------- integration point coordinates (local coordinates) -----------------------
+// -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,2> xi()
 {
   size_t nip  = 8;
   size_t ndim = 3;
 
   xt::xtensor<double,2> xi = xt::empty<double>({nip,ndim});
 
   xi(0,0) = -1.;    xi(0,1) = -1.;    xi(0,2) = -1.;
   xi(1,0) = +1.;    xi(1,1) = -1.;    xi(1,2) = -1.;
   xi(2,0) = +1.;    xi(2,1) = +1.;    xi(2,2) = -1.;
   xi(3,0) = -1.;    xi(3,1) = +1.;    xi(3,2) = -1.;
   xi(4,0) = -1.;    xi(4,1) = -1.;    xi(4,2) = +1.;
   xi(5,0) = +1.;    xi(5,1) = -1.;    xi(5,2) = +1.;
   xi(6,0) = +1.;    xi(6,1) = +1.;    xi(6,2) = +1.;
   xi(7,0) = -1.;    xi(7,1) = +1.;    xi(7,2) = +1.;
 
   return xi;
 }
 
-// ----------------------------------- integration point weights -----------------------------------
+// -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,1> w()
 {
   size_t nip = 8;
 
   xt::xtensor<double,1> w = xt::empty<double>({nip});
 
   w(0) = 1.;
   w(1) = 1.;
   w(2) = 1.;
   w(3) = 1.;
   w(4) = 1.;
   w(5) = 1.;
   w(6) = 1.;
   w(7) = 1.;
 
   return w;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }
 
 // =================================================================================================
 
-// ------------------------------------------ constructor ------------------------------------------
-
-inline Quadrature::Quadrature(const xt::xtensor<double,3> &x) : m_x(x)
-{
-  assert( m_x.shape()[1] == m_nne  );
-  assert( m_x.shape()[2] == m_ndim );
-
-  // integration scheme
-  m_nip = Gauss::nip();
-  m_xi  = Gauss::xi();
-  m_w   = Gauss::w();
-
-  // extract number of elements
-  m_nelem = m_x.shape()[0];
-
-  // allocate arrays
-  m_N    = xt::empty<double>({         m_nip, m_nne        });
-  m_dNxi = xt::empty<double>({         m_nip, m_nne, m_ndim});
-  m_dNx  = xt::empty<double>({m_nelem, m_nip, m_nne, m_ndim});
-  m_vol  = xt::empty<double>({m_nelem, m_nip               });
-
-  // shape functions
-  for ( size_t k = 0 ; k < m_nip ; ++k )
-  {
-    m_N(k,0) = .125 * (1.-m_xi(k,0)) * (1.-m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,1) = .125 * (1.+m_xi(k,0)) * (1.-m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,2) = .125 * (1.+m_xi(k,0)) * (1.+m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,3) = .125 * (1.-m_xi(k,0)) * (1.+m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,4) = .125 * (1.-m_xi(k,0)) * (1.-m_xi(k,1)) * (1.+m_xi(k,2));
-    m_N(k,5) = .125 * (1.+m_xi(k,0)) * (1.-m_xi(k,1)) * (1.+m_xi(k,2));
-    m_N(k,6) = .125 * (1.+m_xi(k,0)) * (1.+m_xi(k,1)) * (1.+m_xi(k,2));
-    m_N(k,7) = .125 * (1.-m_xi(k,0)) * (1.+m_xi(k,1)) * (1.+m_xi(k,2));
-  }
-
-  // shape function gradients in local coordinates
-  for ( size_t k = 0 ; k < m_nip ; ++k )
-  {
-    // - dN / dxi_0
-    m_dNxi(k,0,0) = -.125*(1.-m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,1,0) = +.125*(1.-m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,2,0) = +.125*(1.+m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,3,0) = -.125*(1.+m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,4,0) = -.125*(1.-m_xi(k,1))*(1.+m_xi(k,2));
-    m_dNxi(k,5,0) = +.125*(1.-m_xi(k,1))*(1.+m_xi(k,2));
-    m_dNxi(k,6,0) = +.125*(1.+m_xi(k,1))*(1.+m_xi(k,2));
-    m_dNxi(k,7,0) = -.125*(1.+m_xi(k,1))*(1.+m_xi(k,2));
-    // - dN / dxi_1
-    m_dNxi(k,0,1) = -.125*(1.-m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,1,1) = -.125*(1.+m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,2,1) = +.125*(1.+m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,3,1) = +.125*(1.-m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,4,1) = -.125*(1.-m_xi(k,0))*(1.+m_xi(k,2));
-    m_dNxi(k,5,1) = -.125*(1.+m_xi(k,0))*(1.+m_xi(k,2));
-    m_dNxi(k,6,1) = +.125*(1.+m_xi(k,0))*(1.+m_xi(k,2));
-    m_dNxi(k,7,1) = +.125*(1.-m_xi(k,0))*(1.+m_xi(k,2));
-    // - dN / dxi_2
-    m_dNxi(k,0,2) = -.125*(1.-m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,1,2) = -.125*(1.+m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,2,2) = -.125*(1.+m_xi(k,0))*(1.+m_xi(k,1));
-    m_dNxi(k,3,2) = -.125*(1.-m_xi(k,0))*(1.+m_xi(k,1));
-    m_dNxi(k,4,2) = +.125*(1.-m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,5,2) = +.125*(1.+m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,6,2) = +.125*(1.+m_xi(k,0))*(1.+m_xi(k,1));
-    m_dNxi(k,7,2) = +.125*(1.-m_xi(k,0))*(1.+m_xi(k,1));
-  }
-
-  // compute the shape function gradients, based on "x"
-  compute_dN();
-}
-
-// ------------------------------------------ constructor ------------------------------------------
-
 inline Quadrature::Quadrature(const xt::xtensor<double,3> &x, const xt::xtensor<double,2> &xi,
   const xt::xtensor<double,1> &w) : m_x(x), m_w(w), m_xi(xi)
 {
+  // check input
   assert( m_x.shape()[1] == m_nne  );
   assert( m_x.shape()[2] == m_ndim );
 
   // extract number of elements and number of integration points
   m_nelem = m_x.shape()[0];
   m_nip   = m_w.size();
 
+  // check input
   assert( m_xi.shape()[0] == m_nip  );
   assert( m_xi.shape()[1] == m_ndim );
   assert( m_w .size()     == m_nip  );
 
   // allocate arrays
   m_N    = xt::empty<double>({         m_nip, m_nne        });
   m_dNxi = xt::empty<double>({         m_nip, m_nne, m_ndim});
   m_dNx  = xt::empty<double>({m_nelem, m_nip, m_nne, m_ndim});
   m_vol  = xt::empty<double>({m_nelem, m_nip               });
 
   // shape functions
-  for ( size_t k = 0 ; k < m_nip ; ++k )
+  for ( size_t q = 0 ; q < m_nip ; ++q )
   {
-    m_N(k,0) = .125 * (1.-m_xi(k,0)) * (1.-m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,1) = .125 * (1.+m_xi(k,0)) * (1.-m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,2) = .125 * (1.+m_xi(k,0)) * (1.+m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,3) = .125 * (1.-m_xi(k,0)) * (1.+m_xi(k,1)) * (1.-m_xi(k,2));
-    m_N(k,4) = .125 * (1.-m_xi(k,0)) * (1.-m_xi(k,1)) * (1.+m_xi(k,2));
-    m_N(k,5) = .125 * (1.+m_xi(k,0)) * (1.-m_xi(k,1)) * (1.+m_xi(k,2));
-    m_N(k,6) = .125 * (1.+m_xi(k,0)) * (1.+m_xi(k,1)) * (1.+m_xi(k,2));
-    m_N(k,7) = .125 * (1.-m_xi(k,0)) * (1.+m_xi(k,1)) * (1.+m_xi(k,2));
+    m_N(q,0) = .125 * (1.-m_xi(q,0)) * (1.-m_xi(q,1)) * (1.-m_xi(q,2));
+    m_N(q,1) = .125 * (1.+m_xi(q,0)) * (1.-m_xi(q,1)) * (1.-m_xi(q,2));
+    m_N(q,2) = .125 * (1.+m_xi(q,0)) * (1.+m_xi(q,1)) * (1.-m_xi(q,2));
+    m_N(q,3) = .125 * (1.-m_xi(q,0)) * (1.+m_xi(q,1)) * (1.-m_xi(q,2));
+    m_N(q,4) = .125 * (1.-m_xi(q,0)) * (1.-m_xi(q,1)) * (1.+m_xi(q,2));
+    m_N(q,5) = .125 * (1.+m_xi(q,0)) * (1.-m_xi(q,1)) * (1.+m_xi(q,2));
+    m_N(q,6) = .125 * (1.+m_xi(q,0)) * (1.+m_xi(q,1)) * (1.+m_xi(q,2));
+    m_N(q,7) = .125 * (1.-m_xi(q,0)) * (1.+m_xi(q,1)) * (1.+m_xi(q,2));
   }
 
   // shape function gradients in local coordinates
-  for ( size_t k = 0 ; k < m_nip ; ++k )
+  for ( size_t q = 0 ; q < m_nip ; ++q )
   {
     // - dN / dxi_0
-    m_dNxi(k,0,0) = -.125*(1.-m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,1,0) = +.125*(1.-m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,2,0) = +.125*(1.+m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,3,0) = -.125*(1.+m_xi(k,1))*(1.-m_xi(k,2));
-    m_dNxi(k,4,0) = -.125*(1.-m_xi(k,1))*(1.+m_xi(k,2));
-    m_dNxi(k,5,0) = +.125*(1.-m_xi(k,1))*(1.+m_xi(k,2));
-    m_dNxi(k,6,0) = +.125*(1.+m_xi(k,1))*(1.+m_xi(k,2));
-    m_dNxi(k,7,0) = -.125*(1.+m_xi(k,1))*(1.+m_xi(k,2));
+    m_dNxi(q,0,0) = -.125*(1.-m_xi(q,1))*(1.-m_xi(q,2));
+    m_dNxi(q,1,0) = +.125*(1.-m_xi(q,1))*(1.-m_xi(q,2));
+    m_dNxi(q,2,0) = +.125*(1.+m_xi(q,1))*(1.-m_xi(q,2));
+    m_dNxi(q,3,0) = -.125*(1.+m_xi(q,1))*(1.-m_xi(q,2));
+    m_dNxi(q,4,0) = -.125*(1.-m_xi(q,1))*(1.+m_xi(q,2));
+    m_dNxi(q,5,0) = +.125*(1.-m_xi(q,1))*(1.+m_xi(q,2));
+    m_dNxi(q,6,0) = +.125*(1.+m_xi(q,1))*(1.+m_xi(q,2));
+    m_dNxi(q,7,0) = -.125*(1.+m_xi(q,1))*(1.+m_xi(q,2));
     // - dN / dxi_1
-    m_dNxi(k,0,1) = -.125*(1.-m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,1,1) = -.125*(1.+m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,2,1) = +.125*(1.+m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,3,1) = +.125*(1.-m_xi(k,0))*(1.-m_xi(k,2));
-    m_dNxi(k,4,1) = -.125*(1.-m_xi(k,0))*(1.+m_xi(k,2));
-    m_dNxi(k,5,1) = -.125*(1.+m_xi(k,0))*(1.+m_xi(k,2));
-    m_dNxi(k,6,1) = +.125*(1.+m_xi(k,0))*(1.+m_xi(k,2));
-    m_dNxi(k,7,1) = +.125*(1.-m_xi(k,0))*(1.+m_xi(k,2));
+    m_dNxi(q,0,1) = -.125*(1.-m_xi(q,0))*(1.-m_xi(q,2));
+    m_dNxi(q,1,1) = -.125*(1.+m_xi(q,0))*(1.-m_xi(q,2));
+    m_dNxi(q,2,1) = +.125*(1.+m_xi(q,0))*(1.-m_xi(q,2));
+    m_dNxi(q,3,1) = +.125*(1.-m_xi(q,0))*(1.-m_xi(q,2));
+    m_dNxi(q,4,1) = -.125*(1.-m_xi(q,0))*(1.+m_xi(q,2));
+    m_dNxi(q,5,1) = -.125*(1.+m_xi(q,0))*(1.+m_xi(q,2));
+    m_dNxi(q,6,1) = +.125*(1.+m_xi(q,0))*(1.+m_xi(q,2));
+    m_dNxi(q,7,1) = +.125*(1.-m_xi(q,0))*(1.+m_xi(q,2));
     // - dN / dxi_2
-    m_dNxi(k,0,2) = -.125*(1.-m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,1,2) = -.125*(1.+m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,2,2) = -.125*(1.+m_xi(k,0))*(1.+m_xi(k,1));
-    m_dNxi(k,3,2) = -.125*(1.-m_xi(k,0))*(1.+m_xi(k,1));
-    m_dNxi(k,4,2) = +.125*(1.-m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,5,2) = +.125*(1.+m_xi(k,0))*(1.-m_xi(k,1));
-    m_dNxi(k,6,2) = +.125*(1.+m_xi(k,0))*(1.+m_xi(k,1));
-    m_dNxi(k,7,2) = +.125*(1.-m_xi(k,0))*(1.+m_xi(k,1));
+    m_dNxi(q,0,2) = -.125*(1.-m_xi(q,0))*(1.-m_xi(q,1));
+    m_dNxi(q,1,2) = -.125*(1.+m_xi(q,0))*(1.-m_xi(q,1));
+    m_dNxi(q,2,2) = -.125*(1.+m_xi(q,0))*(1.+m_xi(q,1));
+    m_dNxi(q,3,2) = -.125*(1.-m_xi(q,0))*(1.+m_xi(q,1));
+    m_dNxi(q,4,2) = +.125*(1.-m_xi(q,0))*(1.-m_xi(q,1));
+    m_dNxi(q,5,2) = +.125*(1.+m_xi(q,0))*(1.-m_xi(q,1));
+    m_dNxi(q,6,2) = +.125*(1.+m_xi(q,0))*(1.+m_xi(q,1));
+    m_dNxi(q,7,2) = +.125*(1.-m_xi(q,0))*(1.+m_xi(q,1));
   }
 
   // compute the shape function gradients, based on "x"
   compute_dN();
 }
 
-// --------------------------- integration volume (per tensor-component) ---------------------------
+// -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::dV(xt::xtensor<double,2> &qscalar) const
 {
   assert( qscalar.shape()[0] == m_nelem );
   assert( qscalar.shape()[1] == m_nip   );
 
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
-    for ( size_t k = 0 ; k < m_nip ; ++k )
-      qscalar(e,k) = m_vol(e,k);
+    for ( size_t q = 0 ; q < m_nip ; ++q )
+      qscalar(e,q) = m_vol(e,q);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::dV(xt::xtensor<double,4> &qtensor) const
 {
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
       for ( size_t i = 0 ; i < m_ndim ; ++i )
         for ( size_t j = 0 ; j < m_ndim ; ++j )
-          qtensor(e,k,i,j) = m_vol(e,k);
+          qtensor(e,q,i,j) = m_vol(e,q);
 }
 
 // -------------------------------------------------------------------------------------------------
 
-inline xt::xtensor<double,2> Quadrature::dV() const
-{
-  xt::xtensor<double,2> out = xt::empty<double>({m_nelem, m_nip});
-
-  this->dV(out);
-
-  return out;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-inline xt::xtensor<double,4> Quadrature::dVtensor() const
-{
-  xt::xtensor<double,4> out = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
-
-  this->dV(out);
-
-  return out;
-}
-
-// -------------------------------------- number of elements ---------------------------------------
-
-inline size_t Quadrature::nelem() const
-{
-  return m_nelem;
-}
-
-// ---------------------------------- number of nodes per element ----------------------------------
-
-inline size_t Quadrature::nne() const
-{
-  return m_nne;
-}
-
-// ------------------------------------- number of dimensions --------------------------------------
-
-inline size_t Quadrature::ndim() const
-{
-  return m_ndim;
-}
-
-// --------------------------------- number of integration points ----------------------------------
-
-inline size_t Quadrature::nip() const
-{
-  return m_nip;
-}
-
-// --------------------------------------- update positions ----------------------------------------
-
 inline void Quadrature::update_x(const xt::xtensor<double,3> &x)
 {
   assert( x.shape()[0] == m_nelem    );
   assert( x.shape()[1] == m_nne      );
   assert( x.shape()[2] == m_ndim     );
   assert( x.size()     == m_x.size() );
 
   // update positions
   xt::noalias(m_x) = x;
 
   // update the shape function gradients for the new "x"
   compute_dN();
 }
 
-// ------------------------ shape function gradients in global coordinates -------------------------
+// -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::compute_dN()
 {
   // loop over all elements (in parallel)
   #pragma omp parallel
   {
     // allocate local variables
     T2 J, Jinv;
 
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
       // alias nodal positions
       auto x = xt::adapt(&m_x(e,0,0), xt::xshape<m_nne,m_ndim>());
 
       // loop over integration points
-      for ( size_t k = 0 ; k < m_nip ; ++k )
+      for ( size_t q = 0 ; q < m_nip ; ++q )
       {
         // - alias
-        auto dNxi = xt::adapt(&m_dNxi(  k,0,0), xt::xshape<m_nne,m_ndim>());
-        auto dNx  = xt::adapt(&m_dNx (e,k,0,0), xt::xshape<m_nne,m_ndim>());
+        auto dNxi = xt::adapt(&m_dNxi(  q,0,0), xt::xshape<m_nne,m_ndim>());
+        auto dNx  = xt::adapt(&m_dNx (e,q,0,0), xt::xshape<m_nne,m_ndim>());
 
         // - zero-initialize
         J.fill(0.0);
 
         // - Jacobian
         for ( size_t m = 0 ; m < m_nne ; ++m )
           for ( size_t i = 0 ; i < m_ndim ; ++i )
             for ( size_t j = 0 ; j < m_ndim ; ++j )
               J(i,j) += dNxi(m,i) * x(m,j);
 
         // - determinant and inverse of the Jacobian
         double Jdet = inv(J, Jinv);
 
         // - shape function gradients wrt global coordinates (loops partly unrolled for efficiency)
         //   dNx(m,i) += Jinv(i,j) * dNxi(m,j);
         for ( size_t m = 0 ; m < m_nne ; ++m )
         {
           dNx(m,0) = Jinv(0,0) * dNxi(m,0) + Jinv(0,1) * dNxi(m,1) + Jinv(0,2) * dNxi(m,2);
           dNx(m,1) = Jinv(1,0) * dNxi(m,0) + Jinv(1,1) * dNxi(m,1) + Jinv(1,2) * dNxi(m,2);
           dNx(m,2) = Jinv(2,0) * dNxi(m,0) + Jinv(2,1) * dNxi(m,1) + Jinv(2,2) * dNxi(m,2);
         }
 
         // - integration point volume
-        m_vol(e,k) = m_w(k) * Jdet;
+        m_vol(e,q) = m_w(q) * Jdet;
       }
     }
   }
 }
 
-// ------------------- dyadic product "qtensor(i,j) = dNdx(m,i) * elemvec(m,j)" --------------------
+// -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::gradN_vector(
   const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const
 {
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   // zero-initialize output: matrix of tensors
   qtensor.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias element vector (e.g. nodal displacements)
     auto u = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto dNx   = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto gradu = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto dNx   = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto gradu = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
 
       // - evaluate dyadic product
       for ( size_t m = 0 ; m < m_nne ; ++m )
         for ( size_t i = 0 ; i < m_ndim ; ++i )
           for ( size_t j = 0 ; j < m_ndim ; ++j )
             gradu(i,j) += dNx(m,i) * u(m,j);
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
-inline xt::xtensor<double,4> Quadrature::gradN_vector(const xt::xtensor<double,3> &elemvec) const
-{
-  xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
-
-  this->gradN_vector(elemvec, qtensor);
-
-  return qtensor;
-}
-
-// ---------------------------------- transpose of "gradN_vector" ----------------------------------
-
 inline void Quadrature::gradN_vector_T(
   const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const
 {
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   // zero-initialize output: matrix of tensors
   qtensor.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias element vector (e.g. nodal displacements)
     auto u = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto dNx   = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto gradu = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto dNx   = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto gradu = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
 
       // - evaluate transpose of dyadic product
       for ( size_t m = 0 ; m < m_nne ; ++m )
         for ( size_t i = 0 ; i < m_ndim ; ++i )
           for ( size_t j = 0 ; j < m_ndim ; ++j )
             gradu(j,i) += dNx(m,i) * u(m,j);
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
-inline xt::xtensor<double,4> Quadrature::gradN_vector_T(const xt::xtensor<double,3> &elemvec) const
-{
-  xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
-
-  this->gradN_vector_T(elemvec, qtensor);
-
-  return qtensor;
-}
-
-// ------------------------------- symmetric part of "gradN_vector" --------------------------------
-
 inline void Quadrature::symGradN_vector(
   const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const
 {
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   // zero-initialize output: matrix of tensors
   qtensor.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias element vector (e.g. nodal displacements)
     auto u = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto dNx = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto eps = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto dNx = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto eps = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
 
       // - evaluate symmetrized dyadic product
       for ( size_t m = 0 ; m < m_nne ; ++m ) {
         for ( size_t i = 0 ; i < m_ndim ; ++i ) {
           for ( size_t j = 0 ; j < m_ndim ; ++j ) {
             eps(i,j) += dNx(m,i) * u(m,j) / 2.;
             eps(j,i) += dNx(m,i) * u(m,j) / 2.;
           }
         }
       }
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
-inline xt::xtensor<double,4> Quadrature::symGradN_vector(const xt::xtensor<double,3> &elemvec) const
-{
-  xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
-
-  this->symGradN_vector(elemvec, qtensor);
-
-  return qtensor;
-}
-
-// ------- scalar product "elemmat(m*ndim+i,n*ndim+i) = N(m) * qscalar * N(n)"; for all "i" --------
-
 inline void Quadrature::int_N_scalar_NT_dV(
   const xt::xtensor<double,2> &qscalar, xt::xtensor<double,3> &elemmat) const
 {
   assert( qscalar.shape()[0] == m_nelem      );
   assert( qscalar.shape()[1] == m_nip        );
   assert( elemmat.shape()[0] == m_nelem      );
   assert( elemmat.shape()[1] == m_nne*m_ndim );
   assert( elemmat.shape()[2] == m_nne*m_ndim );
 
   // zero-initialize: matrix of matrices
   elemmat.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias (e.g. mass matrix)
     auto M = xt::adapt(&elemmat(e,0,0), xt::xshape<m_nne*m_ndim,m_nne*m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto  N   = xt::adapt(&m_N(k,0), xt::xshape<m_nne>());
-      auto& vol = m_vol  (e,k);
-      auto& rho = qscalar(e,k);
+      auto  N   = xt::adapt(&m_N(q,0), xt::xshape<m_nne>());
+      auto& vol = m_vol  (e,q);
+      auto& rho = qscalar(e,q);
 
       // - evaluate scalar product, for all dimensions, and assemble
       //   M(m*ndim+i,n*ndim+i) += N(m) * scalar * N(n) * dV
       for ( size_t m = 0 ; m < m_nne ; ++m ) {
         for ( size_t n = 0 ; n < m_nne ; ++n ) {
           M(m*m_ndim+0, n*m_ndim+0) += N(m) * rho * N(n) * vol;
           M(m*m_ndim+1, n*m_ndim+1) += N(m) * rho * N(n) * vol;
           M(m*m_ndim+2, n*m_ndim+2) += N(m) * rho * N(n) * vol;
         }
       }
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
-inline xt::xtensor<double,3> Quadrature::int_N_scalar_NT_dV(const xt::xtensor<double,2> &qscalar) const
-{
-  xt::xtensor<double,3> elemmat = xt::empty<double>({m_nelem, m_nne*m_ndim, m_nne*m_ndim});
-
-  this->int_N_scalar_NT_dV(qscalar, elemmat);
-
-  return elemmat;
-}
-
-// ------------ integral of dot product "elemvec(m,j) += dNdx(m,i) * qtensor(i,j) * dV" ------------
-
 inline void Quadrature::int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor,
   xt::xtensor<double,3> &elemvec) const
 {
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nip   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
 
   // zero-initialize output: matrix of vectors
   elemvec.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias (e.g. nodal force)
     auto f = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto  dNx = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto  sig = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
-      auto& vol = m_vol(e,k);
+      auto  dNx = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto  sig = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto& vol = m_vol(e,q);
 
       // - evaluate dot product, and assemble
       for ( size_t m = 0 ; m < m_nne ; ++m )
       {
         f(m,0) += ( dNx(m,0) * sig(0,0) + dNx(m,1) * sig(1,0) + dNx(m,2) * sig(2,0) ) * vol;
         f(m,1) += ( dNx(m,0) * sig(0,1) + dNx(m,1) * sig(1,1) + dNx(m,2) * sig(2,1) ) * vol;
         f(m,2) += ( dNx(m,0) * sig(0,2) + dNx(m,1) * sig(1,2) + dNx(m,2) * sig(2,2) ) * vol;
       }
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
-inline xt::xtensor<double,3> Quadrature::int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor) const
+inline void Quadrature::int_gradN_dot_tensor4_dot_gradNT_dV(const xt::xtensor<double,6> &qtensor,
+  xt::xtensor<double,3> &elemmat) const
+{
+  assert( qtensor.shape()[0] == m_nelem );
+  assert( qtensor.shape()[1] == m_nip   );
+  assert( qtensor.shape()[2] == m_ndim  );
+  assert( qtensor.shape()[3] == m_ndim  );
+  assert( qtensor.shape()[4] == m_ndim  );
+  assert( qtensor.shape()[5] == m_ndim  );
+
+  assert( elemmat.shape()[0] == m_nelem      );
+  assert( elemmat.shape()[1] == m_nne*m_ndim );
+  assert( elemmat.shape()[2] == m_nne*m_ndim );
+
+  // zero-initialize output: matrix of vector
+  elemmat.fill(0.0);
+
+  // loop over all elements (in parallel)
+  #pragma omp parallel for
+  for ( size_t e = 0 ; e < m_nelem ; ++e )
+  {
+    // alias (e.g. nodal force)
+    auto K = xt::adapt(&elemmat(e,0,0), xt::xshape<m_nne*m_ndim,m_nne*m_ndim>());
+
+    // loop over all integration points in element "e"
+    for ( size_t q = 0 ; q < m_nip ; ++q ){
+
+      // - alias
+      auto  dNx = xt::adapt(&m_dNx(e,q,0,0), xt::xshape<m_nne,m_ndim>());
+      auto  C   = xt::adapt(&qtensor(e,q,0,0,0,0), xt::xshape<m_ndim,m_ndim,m_ndim,m_ndim>());
+      auto& vol = m_vol(e,q);
+
+      // - evaluate dot product, and assemble
+      for ( size_t m = 0 ; m < m_nne ; ++m )
+        for ( size_t n = 0 ; n < m_nne ; ++n )
+          for ( size_t i = 0 ; i < m_ndim ; ++i )
+            for ( size_t j = 0 ; j < m_ndim ; ++j )
+              for ( size_t k = 0 ; k < m_ndim ; ++k )
+                for ( size_t l = 0 ; l < m_ndim ; ++l )
+                  K(m*m_ndim+j, n*m_ndim+k) += dNx(m,i) * C(i,j,k,l) * dNx(n,l) * vol;
+     }
+  }
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,2> Quadrature::dV() const
+{
+  xt::xtensor<double,2> out = xt::empty<double>({m_nelem, m_nip});
+
+  this->dV(out);
+
+  return out;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,4> Quadrature::dVtensor() const
+{
+  xt::xtensor<double,4> out = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
+
+  this->dV(out);
+
+  return out;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,4> Quadrature::gradN_vector(const xt::xtensor<double,3> &elemvec) const
+{
+  xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
+
+  this->gradN_vector(elemvec, qtensor);
+
+  return qtensor;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,4> Quadrature::gradN_vector_T(const xt::xtensor<double,3> &elemvec) const
+{
+  xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
+
+  this->gradN_vector_T(elemvec, qtensor);
+
+  return qtensor;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,4> Quadrature::symGradN_vector(const xt::xtensor<double,3> &elemvec) const
+{
+  xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
+
+  this->symGradN_vector(elemvec, qtensor);
+
+  return qtensor;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,3> Quadrature::int_N_scalar_NT_dV(
+  const xt::xtensor<double,2> &qscalar) const
+{
+  xt::xtensor<double,3> elemmat = xt::empty<double>({m_nelem, m_nne*m_ndim, m_nne*m_ndim});
+
+  this->int_N_scalar_NT_dV(qscalar, elemmat);
+
+  return elemmat;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+inline xt::xtensor<double,3> Quadrature::int_gradN_dot_tensor2_dV(
+  const xt::xtensor<double,4> &qtensor) const
 {
   xt::xtensor<double,3> elemvec = xt::empty<double>({m_nelem, m_nne, m_ndim});
 
   this->int_gradN_dot_tensor2_dV(qtensor, elemvec);
 
   return elemvec;
 }
 
 // -------------------------------------------------------------------------------------------------
 
+inline xt::xtensor<double,3> Quadrature::int_gradN_dot_tensor4_dot_gradNT_dV(
+  const xt::xtensor<double,6> &qtensor) const
+ {
+   xt::xtensor<double,3> elemmat = xt::empty<double>({m_nelem, m_ndim*m_nne, m_ndim*m_nne});
+
+   this->int_gradN_dot_tensor4_dot_gradNT_dV(qtensor, elemmat);
+
+   return elemmat;
+ }
+
+// -------------------------------------------------------------------------------------------------
+
 }}} // namespace ...
 
 // =================================================================================================
 
 #endif
diff --git a/include/xGooseFEM/ElementQuad4.h b/include/xGooseFEM/ElementQuad4.h
index 1bac6d1..4311538 100644
--- a/include/xGooseFEM/ElementQuad4.h
+++ b/include/xGooseFEM/ElementQuad4.h
@@ -1,156 +1,161 @@
 /* =================================================================================================
 
 (c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
 
 ================================================================================================= */
 
 #ifndef XGOOSEFEM_ELEMENTQUAD4_H
 #define XGOOSEFEM_ELEMENTQUAD4_H
 
 // -------------------------------------------------------------------------------------------------
 
 #include "GooseFEM.h"
 
-// =================================== GooseFEM::Element::Quad4 ====================================
+// =================================================================================================
 
 namespace xGooseFEM {
 namespace Element {
 namespace Quad4 {
 
-// ======================================== tensor algebra =========================================
+// =================================================================================================
 
 using T2 = xt::xtensor_fixed<double, xt::xshape<2,2>>;
 
 inline double inv(const T2 &A, T2 &Ainv);
 
-// ================================ GooseFEM::Element::Quad4::Gauss ================================
+// =================================================================================================
 
 namespace Gauss {
 inline size_t                nip(); // number of integration points
 inline xt::xtensor<double,2> xi();  // integration point coordinates (local coordinates)
 inline xt::xtensor<double,1> w();   // integration point weights
 }
 
-// ================================ GooseFEM::Element::Quad4::Nodal ================================
+// =================================================================================================
 
 namespace Nodal {
 inline size_t                nip(); // number of integration points
 inline xt::xtensor<double,2> xi();  // integration point coordinates (local coordinates)
 inline xt::xtensor<double,1> w();   // integration point weights
 }
 
 // =================================================================================================
 
 // ------------------------------------------ quadrature -------------------------------------------
 
 class Quadrature
 {
-private:
-
-  // dimensions (flexible)
-  size_t m_nelem; // number of elements
-  size_t m_nip;   // number of integration points
-
-  // dimensions (fixed for this element type)
-  static const size_t m_nne=4;  // number of nodes per element
-  static const size_t m_ndim=2; // number of dimensions
-
-  // data arrays
-  xt::xtensor<double,3> m_x;    // nodal positions stored per element [nelem, nne, ndim]
-  xt::xtensor<double,1> m_w;    // weight of each integration point [nip]
-  xt::xtensor<double,2> m_xi;   // local coordinate of each integration point [nip, ndim]
-  xt::xtensor<double,2> m_N;    // shape functions [nip, nne]
-  xt::xtensor<double,3> m_dNxi; // shape function gradients w.r.t. local  coordinate [nip, nne, ndim]
-  xt::xtensor<double,4> m_dNx;  // shape function gradients w.r.t. global coordinate [nelem, nip, nne, ndim]
-  xt::xtensor<double,2> m_vol;  // integration point volume [nelem, nip]
-
-private:
-
-  // compute "vol" and "dNdx" based on current "x"
-  void compute_dN();
-
 public:
 
-  // convention:
+  // fixed dimensions:
+  //    ndim = 2   -  number of dimensions
+  //    nne  = 4   -  number of nodes per element
+  //
+  // naming convention:
   //    "elemmat"  -  matrices stored per element       -  [nelem, nne*ndim, nne*ndim]
   //    "elemvec"  -  nodal vectors stored per element  -  [nelem, nne, ndim]
   //    "qtensor"  -  integration point tensor          -  [nelem, nip, ndim, ndim]
   //    "qscalar"  -  integration point scalar          -  [nelem, nip]
 
   // constructor: integration point coordinates and weights are optional (default: Gauss)
 
   Quadrature() = default;
 
-  Quadrature(const xt::xtensor<double,3> &x);
+  Quadrature(const xt::xtensor<double,3> &x) : Quadrature(x, Gauss::xi(), Gauss::w()){};
 
   Quadrature(const xt::xtensor<double,3> &x, const xt::xtensor<double,2> &xi, const xt::xtensor<double,1> &w);
 
   // update the nodal positions (shape of "x" should match the earlier definition)
 
   void update_x(const xt::xtensor<double,3> &x);
 
   // return dimensions
 
-  size_t nelem() const; // number of elements
-  size_t nne()   const; // number of nodes per element
-  size_t ndim()  const; // number of dimension
-  size_t nip()   const; // number of integration points
+  size_t nelem() const { return m_nelem; }; // number of elements
+  size_t nne()   const { return m_nne;   }; // number of nodes per element
+  size_t ndim()  const { return m_ndim;  }; // number of dimension
+  size_t nip()   const { return m_nip;   }; // number of integration points
 
   // return integration volume
 
   void dV(xt::xtensor<double,2> &qscalar) const;
+
   void dV(xt::xtensor<double,4> &qtensor) const; // same volume for all tensor components
 
   // dyadic product "qtensor(i,j) += dNdx(m,i) * elemvec(m,j)", its transpose and its symmetric part
 
-  void gradN_vector   (const xt::xtensor<double,3> &elemvec,
+  void gradN_vector(const xt::xtensor<double,3> &elemvec,
     xt::xtensor<double,4> &qtensor) const;
 
-  void gradN_vector_T (const xt::xtensor<double,3> &elemvec,
+  void gradN_vector_T(const xt::xtensor<double,3> &elemvec,
     xt::xtensor<double,4> &qtensor) const;
 
   void symGradN_vector(const xt::xtensor<double,3> &elemvec,
     xt::xtensor<double,4> &qtensor) const;
 
   // integral of the scalar product "elemmat(m*ndim+i,n*ndim+i) += N(m) * qscalar * N(n) * dV"
 
   void int_N_scalar_NT_dV(const xt::xtensor<double,2> &qscalar,
     xt::xtensor<double,3> &elemmat) const;
 
   // integral of the dot product "elemvec(m,j) += dNdx(m,i) * qtensor(i,j) * dV"
 
   void int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor,
     xt::xtensor<double,3> &elemvec) const;
 
   // integral of the dot product "elemmat(m*2+j, n*2+k) += dNdx(m,i) * qtensor(i,j,k,l) * dNdx(n,l) * dV"
 
   void int_gradN_dot_tensor4_dot_gradNT_dV(const xt::xtensor<double,6> &qtensor,
     xt::xtensor<double,3> &elemmat) const;
 
   // auto allocation of the functions above
 
   xt::xtensor<double,2> dV() const;
 
   xt::xtensor<double,4> dVtensor() const;
 
   xt::xtensor<double,4> gradN_vector(const xt::xtensor<double,3> &elemvec) const;
 
   xt::xtensor<double,4> gradN_vector_T(const xt::xtensor<double,3> &elemvec) const;
 
   xt::xtensor<double,4> symGradN_vector(const xt::xtensor<double,3> &elemvec) const;
 
   xt::xtensor<double,3> int_N_scalar_NT_dV(const xt::xtensor<double,2> &qscalar) const;
 
   xt::xtensor<double,3> int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor) const;
 
   xt::xtensor<double,3> int_gradN_dot_tensor4_dot_gradNT_dV(const xt::xtensor<double,6> &qtensor) const;
 
+private:
+
+  // compute "vol" and "dNdx" based on current "x"
+  void compute_dN();
+
+private:
+
+  // dimensions (flexible)
+  size_t m_nelem; // number of elements
+  size_t m_nip;   // number of integration points
+
+  // dimensions (fixed for this element type)
+  static const size_t m_nne=4;  // number of nodes per element
+  static const size_t m_ndim=2; // number of dimensions
+
+  // data arrays
+  xt::xtensor<double,3> m_x;    // nodal positions stored per element [nelem, nne, ndim]
+  xt::xtensor<double,1> m_w;    // weight of each integration point [nip]
+  xt::xtensor<double,2> m_xi;   // local coordinate of each integration point [nip, ndim]
+  xt::xtensor<double,2> m_N;    // shape functions [nip, nne]
+  xt::xtensor<double,3> m_dNxi; // shape function gradients w.r.t. local  coordinate [nip, nne, ndim]
+  xt::xtensor<double,4> m_dNx;  // shape function gradients w.r.t. global coordinate [nelem, nip, nne, ndim]
+  xt::xtensor<double,2> m_vol;  // integration point volume [nelem, nip]
+
 };
 
 // -------------------------------------------------------------------------------------------------
 
 }}} // namespace ...
 
 // =================================================================================================
 
 #endif
diff --git a/include/xGooseFEM/ElementQuad4.hpp b/include/xGooseFEM/ElementQuad4.hpp
index 1550a14..2e8637a 100644
--- a/include/xGooseFEM/ElementQuad4.hpp
+++ b/include/xGooseFEM/ElementQuad4.hpp
@@ -1,702 +1,625 @@
 /* =================================================================================================
 
 (c - GPLv3) T.W.J. de Geus (Tom) | tom@geus.me | www.geus.me | github.com/tdegeus/GooseFEM
 
 ================================================================================================= */
 
 #ifndef XGOOSEFEM_ELEMENTQUAD4_CPP
 #define XGOOSEFEM_ELEMENTQUAD4_CPP
 
 // -------------------------------------------------------------------------------------------------
 
 #include "ElementQuad4.h"
 
 // =================================================================================================
 
 namespace xGooseFEM {
 namespace Element {
 namespace Quad4 {
 
 // =================================================================================================
 
 inline double inv(const T2 &A, T2 &Ainv)
 {
   // compute determinant
-  double det = A[0] * A[3] - A[1] * A[2];
+  double det = A(0,0) * A(1,1) - A(0,1) * A(1,0);
 
   // compute inverse
-  Ainv[0] =       A[3] / det;
-  Ainv[1] = -1. * A[1] / det;
-  Ainv[2] = -1. * A[2] / det;
-  Ainv[3] =       A[0] / det;
+  Ainv(0,0) =       A(1,1) / det;
+  Ainv(0,1) = -1. * A(0,1) / det;
+  Ainv(1,0) = -1. * A(1,0) / det;
+  Ainv(1,1) =       A(0,0) / det;
 
   return det;
 }
 
 // =================================================================================================
 
 namespace Gauss {
 
 // -------------------------------------------------------------------------------------------------
 
 inline size_t nip()
 {
   return 4;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,2> xi()
 {
   size_t nip  = 4;
   size_t ndim = 2;
 
   xt::xtensor<double,2> xi = xt::empty<double>({nip,ndim});
 
   xi(0,0) = -1./std::sqrt(3.);    xi(0,1) = -1./std::sqrt(3.);
   xi(1,0) = +1./std::sqrt(3.);    xi(1,1) = -1./std::sqrt(3.);
   xi(2,0) = +1./std::sqrt(3.);    xi(2,1) = +1./std::sqrt(3.);
   xi(3,0) = -1./std::sqrt(3.);    xi(3,1) = +1./std::sqrt(3.);
 
   return xi;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,1> w()
 {
   size_t nip = 4;
 
   xt::xtensor<double,1> w = xt::empty<double>({nip});
 
   w(0) = 1.;
   w(1) = 1.;
   w(2) = 1.;
   w(3) = 1.;
 
   return w;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }
 
 // =================================================================================================
 
 namespace Nodal {
 
 // -------------------------------------------------------------------------------------------------
 
 inline size_t nip()
 {
   return 4;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,2> xi()
 {
   size_t nip  = 4;
   size_t ndim = 2;
 
   xt::xtensor<double,2> xi = xt::empty<double>({nip,ndim});
 
   xi(0,0) = -1.;    xi(0,1) = -1.;
   xi(1,0) = +1.;    xi(1,1) = -1.;
   xi(2,0) = +1.;    xi(2,1) = +1.;
   xi(3,0) = -1.;    xi(3,1) = +1.;
 
   return xi;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,1> w()
 {
   size_t nip = 4;
 
   xt::xtensor<double,1> w = xt::empty<double>({nip});
 
   w(0) = 1.;
   w(1) = 1.;
   w(2) = 1.;
   w(3) = 1.;
 
   return w;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 }
 
 // =================================================================================================
 
-// -------------------------------------------------------------------------------------------------
-
-inline Quadrature::Quadrature(const xt::xtensor<double,3> &x) : m_x(x)
-{
-  assert( m_x.shape()[1] == m_nne  );
-  assert( m_x.shape()[2] == m_ndim );
-
-  // integration scheme
-  m_nip = Gauss::nip();
-  m_xi  = Gauss::xi();
-  m_w   = Gauss::w();
-
-  // extract number of elements
-  m_nelem = m_x.shape()[0];
-
-  // allocate arrays
-  m_N    = xt::empty<double>({         m_nip, m_nne        });
-  m_dNxi = xt::empty<double>({         m_nip, m_nne, m_ndim});
-  m_dNx  = xt::empty<double>({m_nelem, m_nip, m_nne, m_ndim});
-  m_vol  = xt::empty<double>({m_nelem, m_nip               });
-
-  // shape functions
-  for ( size_t k = 0 ; k < m_nip ; ++k )
-  {
-    m_N(k,0) = .25 * (1.-m_xi(k,0)) * (1.-m_xi(k,1));
-    m_N(k,1) = .25 * (1.+m_xi(k,0)) * (1.-m_xi(k,1));
-    m_N(k,2) = .25 * (1.+m_xi(k,0)) * (1.+m_xi(k,1));
-    m_N(k,3) = .25 * (1.-m_xi(k,0)) * (1.+m_xi(k,1));
-  }
-
-  // shape function gradients in local coordinates
-  for ( size_t k = 0 ; k < m_nip ; ++k )
-  {
-    // - dN / dxi_0
-    m_dNxi(k,0,0) = -.25*(1.-m_xi(k,1));
-    m_dNxi(k,1,0) = +.25*(1.-m_xi(k,1));
-    m_dNxi(k,2,0) = +.25*(1.+m_xi(k,1));
-    m_dNxi(k,3,0) = -.25*(1.+m_xi(k,1));
-    // - dN / dxi_1
-    m_dNxi(k,0,1) = -.25*(1.-m_xi(k,0));
-    m_dNxi(k,1,1) = -.25*(1.+m_xi(k,0));
-    m_dNxi(k,2,1) = +.25*(1.+m_xi(k,0));
-    m_dNxi(k,3,1) = +.25*(1.-m_xi(k,0));
-  }
-
-  // compute the shape function gradients, based on "x"
-  compute_dN();
-}
-
-// -------------------------------------------------------------------------------------------------
-
 inline Quadrature::Quadrature(const xt::xtensor<double,3> &x, const xt::xtensor<double,2> &xi,
   const xt::xtensor<double,1> &w) : m_x(x), m_w(w), m_xi(xi)
 {
+  // check input
   assert( m_x.shape()[1] == m_nne  );
   assert( m_x.shape()[2] == m_ndim );
 
   // extract number of elements and number of integration points
   m_nelem = m_x.shape()[0];
   m_nip   = m_w.size();
 
+  // check input
   assert( m_xi.shape()[0] == m_nip  );
   assert( m_xi.shape()[1] == m_ndim );
   assert( m_w .size()     == m_nip  );
 
   // allocate arrays
   m_N    = xt::empty<double>({         m_nip, m_nne        });
   m_dNxi = xt::empty<double>({         m_nip, m_nne, m_ndim});
   m_dNx  = xt::empty<double>({m_nelem, m_nip, m_nne, m_ndim});
   m_vol  = xt::empty<double>({m_nelem, m_nip               });
 
   // shape functions
-  for ( size_t k = 0 ; k < m_nip ; ++k )
+  for ( size_t q = 0 ; q < m_nip ; ++q )
   {
-    m_N(k,0) = .25 * (1.-m_xi(k,0)) * (1.-m_xi(k,1));
-    m_N(k,1) = .25 * (1.+m_xi(k,0)) * (1.-m_xi(k,1));
-    m_N(k,2) = .25 * (1.+m_xi(k,0)) * (1.+m_xi(k,1));
-    m_N(k,3) = .25 * (1.-m_xi(k,0)) * (1.+m_xi(k,1));
+    m_N(q,0) = .25 * (1.-m_xi(q,0)) * (1.-m_xi(q,1));
+    m_N(q,1) = .25 * (1.+m_xi(q,0)) * (1.-m_xi(q,1));
+    m_N(q,2) = .25 * (1.+m_xi(q,0)) * (1.+m_xi(q,1));
+    m_N(q,3) = .25 * (1.-m_xi(q,0)) * (1.+m_xi(q,1));
   }
 
   // shape function gradients in local coordinates
-  for ( size_t k = 0 ; k < m_nip ; ++k )
+  for ( size_t q = 0 ; q < m_nip ; ++q )
   {
     // - dN / dxi_0
-    m_dNxi(k,0,0) = -.25*(1.-m_xi(k,1));
-    m_dNxi(k,1,0) = +.25*(1.-m_xi(k,1));
-    m_dNxi(k,2,0) = +.25*(1.+m_xi(k,1));
-    m_dNxi(k,3,0) = -.25*(1.+m_xi(k,1));
+    m_dNxi(q,0,0) = -.25*(1.-m_xi(q,1));
+    m_dNxi(q,1,0) = +.25*(1.-m_xi(q,1));
+    m_dNxi(q,2,0) = +.25*(1.+m_xi(q,1));
+    m_dNxi(q,3,0) = -.25*(1.+m_xi(q,1));
     // - dN / dxi_1
-    m_dNxi(k,0,1) = -.25*(1.-m_xi(k,0));
-    m_dNxi(k,1,1) = -.25*(1.+m_xi(k,0));
-    m_dNxi(k,2,1) = +.25*(1.+m_xi(k,0));
-    m_dNxi(k,3,1) = +.25*(1.-m_xi(k,0));
+    m_dNxi(q,0,1) = -.25*(1.-m_xi(q,0));
+    m_dNxi(q,1,1) = -.25*(1.+m_xi(q,0));
+    m_dNxi(q,2,1) = +.25*(1.+m_xi(q,0));
+    m_dNxi(q,3,1) = +.25*(1.-m_xi(q,0));
   }
 
   // compute the shape function gradients, based on "x"
   compute_dN();
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::dV(xt::xtensor<double,2> &qscalar) const
 {
   assert( qscalar.shape()[0] == m_nelem );
   assert( qscalar.shape()[1] == m_nip   );
 
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
-    for ( size_t k = 0 ; k < m_nip ; ++k )
-      qscalar(e,k) = m_vol(e,k);
+    for ( size_t q = 0 ; q < m_nip ; ++q )
+      qscalar(e,q) = m_vol(e,q);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::dV(xt::xtensor<double,4> &qtensor) const
 {
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
       for ( size_t i = 0 ; i < m_ndim ; ++i )
         for ( size_t j = 0 ; j < m_ndim ; ++j )
-          qtensor(e,k,i,j) = m_vol(e,k);
-}
-
-// -------------------------------------------------------------------------------------------------
-
-inline size_t Quadrature::nelem() const
-{
-  return m_nelem;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-inline size_t Quadrature::nne() const
-{
-  return m_nne;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-inline size_t Quadrature::ndim() const
-{
-  return m_ndim;
-}
-
-// -------------------------------------------------------------------------------------------------
-
-inline size_t Quadrature::nip() const
-{
-  return m_nip;
+          qtensor(e,q,i,j) = m_vol(e,q);
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::update_x(const xt::xtensor<double,3> &x)
 {
   assert( x.shape()[0] == m_nelem    );
   assert( x.shape()[1] == m_nne      );
   assert( x.shape()[2] == m_ndim     );
   assert( x.size()     == m_x.size() );
 
   // update positions
   xt::noalias(m_x) = x;
 
   // update the shape function gradients for the new "x"
   compute_dN();
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::compute_dN()
 {
   // loop over all elements (in parallel)
   #pragma omp parallel
   {
     // allocate local variables
     T2 J, Jinv;
 
     #pragma omp for
     for ( size_t e = 0 ; e < m_nelem ; ++e )
     {
       // alias nodal positions
       auto x = xt::adapt(&m_x(e,0,0), xt::xshape<m_nne,m_ndim>());
 
       // loop over integration points
-      for ( size_t k = 0 ; k < m_nip ; ++k )
+      for ( size_t q = 0 ; q < m_nip ; ++q )
       {
         // - alias
-        auto dNxi = xt::adapt(&m_dNxi(  k,0,0), xt::xshape<m_nne,m_ndim>());
-        auto dNx  = xt::adapt(&m_dNx (e,k,0,0), xt::xshape<m_nne,m_ndim>());
+        auto dNxi = xt::adapt(&m_dNxi(  q,0,0), xt::xshape<m_nne,m_ndim>());
+        auto dNx  = xt::adapt(&m_dNx (e,q,0,0), xt::xshape<m_nne,m_ndim>());
 
         // - Jacobian (loops unrolled for efficiency)
         //   J(i,j) += dNxi(m,i) * x(m,j);
         J(0,0) = dNxi(0,0)*x(0,0) + dNxi(1,0)*x(1,0) + dNxi(2,0)*x(2,0) + dNxi(3,0)*x(3,0);
         J(0,1) = dNxi(0,0)*x(0,1) + dNxi(1,0)*x(1,1) + dNxi(2,0)*x(2,1) + dNxi(3,0)*x(3,1);
         J(1,0) = dNxi(0,1)*x(0,0) + dNxi(1,1)*x(1,0) + dNxi(2,1)*x(2,0) + dNxi(3,1)*x(3,0);
         J(1,1) = dNxi(0,1)*x(0,1) + dNxi(1,1)*x(1,1) + dNxi(2,1)*x(2,1) + dNxi(3,1)*x(3,1);
 
         // - determinant and inverse of the Jacobian
         double Jdet = inv(J, Jinv);
 
         // - shape function gradients wrt global coordinates (loops partly unrolled for efficiency)
         //   dNx(m,i) += Jinv(i,j) * dNxi(m,j);
         for ( size_t m = 0 ; m < m_nne ; ++m )
         {
           dNx(m,0) = Jinv(0,0) * dNxi(m,0) + Jinv(0,1) * dNxi(m,1);
           dNx(m,1) = Jinv(1,0) * dNxi(m,0) + Jinv(1,1) * dNxi(m,1);
         }
 
         // - integration point volume
-        m_vol(e,k) = m_w(k) * Jdet;
+        m_vol(e,q) = m_w(q) * Jdet;
       }
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::gradN_vector(
   const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const
 {
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   // zero-initialize output: matrix of tensors
   qtensor.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias element vector (e.g. nodal displacements)
     auto u = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto dNx   = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto gradu = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto dNx   = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto gradu = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
 
       // - evaluate dyadic product (loops unrolled for efficiency)
       //   gradu(i,j) += dNx(m,i) * u(m,j)
       gradu(0,0) = dNx(0,0)*u(0,0) + dNx(1,0)*u(1,0) + dNx(2,0)*u(2,0) + dNx(3,0)*u(3,0);
       gradu(0,1) = dNx(0,0)*u(0,1) + dNx(1,0)*u(1,1) + dNx(2,0)*u(2,1) + dNx(3,0)*u(3,1);
       gradu(1,0) = dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0);
       gradu(1,1) = dNx(0,1)*u(0,1) + dNx(1,1)*u(1,1) + dNx(2,1)*u(2,1) + dNx(3,1)*u(3,1);
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::gradN_vector_T(
   const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const
 {
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   // zero-initialize output: matrix of tensors
   qtensor.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias element vector (e.g. nodal displacements)
     auto u = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto dNx   = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto gradu = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto dNx   = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto gradu = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
 
       // - evaluate transpose of dyadic product (loops unrolled for efficiency)
       //   gradu(j,i) += dNx(m,i) * u(m,j)
       gradu(0,0) = dNx(0,0)*u(0,0) + dNx(1,0)*u(1,0) + dNx(2,0)*u(2,0) + dNx(3,0)*u(3,0);
       gradu(1,0) = dNx(0,0)*u(0,1) + dNx(1,0)*u(1,1) + dNx(2,0)*u(2,1) + dNx(3,0)*u(3,1);
       gradu(0,1) = dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0);
       gradu(1,1) = dNx(0,1)*u(0,1) + dNx(1,1)*u(1,1) + dNx(2,1)*u(2,1) + dNx(3,1)*u(3,1);
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::symGradN_vector(
   const xt::xtensor<double,3> &elemvec, xt::xtensor<double,4> &qtensor) const
 {
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nne   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
 
   // zero-initialize output: matrix of tensors
   qtensor.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias element vector (e.g. nodal displacements)
     auto u = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto dNx = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto eps = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto dNx = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto eps = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
 
       // - evaluate symmetrized dyadic product (loops unrolled for efficiency)
       //   grad(i,j) += dNx(m,i) * u(m,j)
       //   eps (j,i)  = 0.5 * ( grad(i,j) + grad(j,i) )
       eps(0,0) =   dNx(0,0)*u(0,0) + dNx(1,0)*u(1,0) + dNx(2,0)*u(2,0) + dNx(3,0)*u(3,0);
       eps(1,1) =   dNx(0,1)*u(0,1) + dNx(1,1)*u(1,1) + dNx(2,1)*u(2,1) + dNx(3,1)*u(3,1);
       eps(0,1) = ( dNx(0,0)*u(0,1) + dNx(1,0)*u(1,1) + dNx(2,0)*u(2,1) + dNx(3,0)*u(3,1) +
-                   dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0) ) / 2.;
+                   dNx(0,1)*u(0,0) + dNx(1,1)*u(1,0) + dNx(2,1)*u(2,0) + dNx(3,1)*u(3,0) ) * 0.5;
       eps(1,0) =   eps(0,1);
     }
   }
 }
 
-// ------- scalar product "elemmat(m*ndim+i,n*ndim+i) = N(m) * qscalar * N(n)"; for all "i" --------
+// -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::int_N_scalar_NT_dV(
   const xt::xtensor<double,2> &qscalar, xt::xtensor<double,3> &elemmat) const
 {
   assert( qscalar.shape()[0] == m_nelem      );
   assert( qscalar.shape()[1] == m_nip        );
   assert( elemmat.shape()[0] == m_nelem      );
   assert( elemmat.shape()[1] == m_nne*m_ndim );
   assert( elemmat.shape()[2] == m_nne*m_ndim );
 
   // zero-initialize: matrix of matrices
   elemmat.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias (e.g. mass matrix)
     auto M = xt::adapt(&elemmat(e,0,0), xt::xshape<m_nne*m_ndim,m_nne*m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto  N   = xt::adapt(&m_N(k,0), xt::xshape<m_nne>());
-      auto& vol = m_vol  (e,k);
-      auto& rho = qscalar(e,k);
+      auto  N   = xt::adapt(&m_N(q,0), xt::xshape<m_nne>());
+      auto& vol = m_vol  (e,q);
+      auto& rho = qscalar(e,q);
 
       // - evaluate scalar product, for all dimensions, and assemble
       //   M(m*ndim+i,n*ndim+i) += N(m) * scalar * N(n) * dV
       for ( size_t m = 0 ; m < m_nne ; ++m ) {
         for ( size_t n = 0 ; n < m_nne ; ++n ) {
           M(m*m_ndim+0, n*m_ndim+0) += N(m) * rho * N(n) * vol;
           M(m*m_ndim+1, n*m_ndim+1) += N(m) * rho * N(n) * vol;
         }
       }
     }
   }
 }
 
-// ------------ integral of dot product "elemvec(m,j) += dNdx(m,i) * qtensor(i,j) * dV" ------------
+// -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::int_gradN_dot_tensor2_dV(const xt::xtensor<double,4> &qtensor,
   xt::xtensor<double,3> &elemvec) const
 {
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nip   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
   assert( elemvec.shape()[0] == m_nelem );
   assert( elemvec.shape()[1] == m_nne   );
   assert( elemvec.shape()[2] == m_ndim  );
 
   // zero-initialize output: matrix of vectors
   elemvec.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias (e.g. nodal force)
     auto f = xt::adapt(&elemvec(e,0,0), xt::xshape<m_nne,m_ndim>());
 
     // loop over all integration points in element "e"
-    for ( size_t k = 0 ; k < m_nip ; ++k )
+    for ( size_t q = 0 ; q < m_nip ; ++q )
     {
       // - alias
-      auto  dNx = xt::adapt(&m_dNx  (e,k,0,0), xt::xshape<m_nne ,m_ndim>());
-      auto  sig = xt::adapt(&qtensor(e,k,0,0), xt::xshape<m_ndim,m_ndim>());
-      auto& vol = m_vol(e,k);
+      auto  dNx = xt::adapt(&m_dNx  (e,q,0,0), xt::xshape<m_nne ,m_ndim>());
+      auto  sig = xt::adapt(&qtensor(e,q,0,0), xt::xshape<m_ndim,m_ndim>());
+      auto& vol = m_vol(e,q);
 
       // - evaluate dot product, and assemble
       for ( size_t m = 0 ; m < m_nne ; ++m )
       {
         f(m,0) += ( dNx(m,0) * sig(0,0) + dNx(m,1) * sig(1,0) ) * vol;
         f(m,1) += ( dNx(m,0) * sig(0,1) + dNx(m,1) * sig(1,1) ) * vol;
       }
     }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline void Quadrature::int_gradN_dot_tensor4_dot_gradNT_dV(const xt::xtensor<double,6> &qtensor,
   xt::xtensor<double,3> &elemmat) const
 {
   assert( qtensor.shape()[0] == m_nelem );
   assert( qtensor.shape()[1] == m_nip   );
   assert( qtensor.shape()[2] == m_ndim  );
   assert( qtensor.shape()[3] == m_ndim  );
   assert( qtensor.shape()[4] == m_ndim  );
   assert( qtensor.shape()[5] == m_ndim  );
 
   assert( elemmat.shape()[0] == m_nelem      );
   assert( elemmat.shape()[1] == m_nne*m_ndim );
   assert( elemmat.shape()[2] == m_nne*m_ndim );
 
   // zero-initialize output: matrix of vector
   elemmat.fill(0.0);
 
   // loop over all elements (in parallel)
   #pragma omp parallel for
   for ( size_t e = 0 ; e < m_nelem ; ++e )
   {
     // alias (e.g. nodal force)
     auto K = xt::adapt(&elemmat(e,0,0), xt::xshape<m_nne*m_ndim,m_nne*m_ndim>());
 
     // loop over all integration points in element "e"
     for ( size_t q = 0 ; q < m_nip ; ++q ){
 
       // - alias
       auto  dNx = xt::adapt(&m_dNx(e,q,0,0), xt::xshape<m_nne,m_ndim>());
       auto  C   = xt::adapt(&qtensor(e,q,0,0,0,0), xt::xshape<m_ndim,m_ndim,m_ndim,m_ndim>());
       auto& vol = m_vol(e,q);
 
       // - evaluate dot product, and assemble
       for ( size_t m = 0 ; m < m_nne ; ++m )
         for ( size_t n = 0 ; n < m_nne ; ++n )
           for ( size_t i = 0 ; i < m_ndim ; ++i )
             for ( size_t j = 0 ; j < m_ndim ; ++j )
               for ( size_t k = 0 ; k < m_ndim ; ++k )
                 for ( size_t l = 0 ; l < m_ndim ; ++l )
                   K(m*m_ndim+j, n*m_ndim+k) += dNx(m,i) * C(i,j,k,l) * dNx(n,l) * vol;
      }
   }
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,2> Quadrature::dV() const
 {
   xt::xtensor<double,2> out = xt::empty<double>({m_nelem, m_nip});
 
   this->dV(out);
 
   return out;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,4> Quadrature::dVtensor() const
 {
   xt::xtensor<double,4> out = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
 
   this->dV(out);
 
   return out;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,4> Quadrature::gradN_vector(const xt::xtensor<double,3> &elemvec) const
 {
   xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
 
   this->gradN_vector(elemvec, qtensor);
 
   return qtensor;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,4> Quadrature::gradN_vector_T(const xt::xtensor<double,3> &elemvec) const
 {
   xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
 
   this->gradN_vector_T(elemvec, qtensor);
 
   return qtensor;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,4> Quadrature::symGradN_vector(const xt::xtensor<double,3> &elemvec) const
 {
   xt::xtensor<double,4> qtensor = xt::empty<double>({m_nelem, m_nip, m_ndim, m_ndim});
 
   this->symGradN_vector(elemvec, qtensor);
 
   return qtensor;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,3> Quadrature::int_N_scalar_NT_dV(
   const xt::xtensor<double,2> &qscalar) const
 {
   xt::xtensor<double,3> elemmat = xt::empty<double>({m_nelem, m_nne*m_ndim, m_nne*m_ndim});
 
   this->int_N_scalar_NT_dV(qscalar, elemmat);
 
   return elemmat;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,3> Quadrature::int_gradN_dot_tensor2_dV(
   const xt::xtensor<double,4> &qtensor) const
 {
   xt::xtensor<double,3> elemvec = xt::empty<double>({m_nelem, m_nne, m_ndim});
 
   this->int_gradN_dot_tensor2_dV(qtensor, elemvec);
 
   return elemvec;
 }
 
 // -------------------------------------------------------------------------------------------------
 
 inline xt::xtensor<double,3> Quadrature::int_gradN_dot_tensor4_dot_gradNT_dV(
   const xt::xtensor<double,6> &qtensor) const
  {
    xt::xtensor<double,3> elemmat = xt::empty<double>({m_nelem, m_ndim*m_nne, m_ndim*m_nne});
 
    this->int_gradN_dot_tensor4_dot_gradNT_dV(qtensor, elemmat);
 
    return elemmat;
  }
 
 // -------------------------------------------------------------------------------------------------
 
 }}} // namespace ...
 
 // =================================================================================================
 
 #endif