diff --git a/wavelet2.c b/wavelet2.c
index 2c847a1..2fd46e8 100644
--- a/wavelet2.c
+++ b/wavelet2.c
@@ -1,671 +1,670 @@
 /* 
    =========================================================================
    2-dimensional wavelet transform on (2^l) x (2^l) sized arrays of 3-tuples,
    where l > 1.
    
    Compile with -DWAVELET_TEST to build standalone unit tests.
    
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #include "wavelet2.h"
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 
 
-#define COEFSIZE     (sizeof(Coeff3))
+
 #define min(a, b)    ((a) < (b) ? (a) : (b))
-#define coeffAvg(a)  (((a).coeff [0] + (a).coeff [1] + (a).coeff [2]) / 3)
+#define coeffAvg(a)  (0.333333 * ((a).coeff [0] + (a).coeff [1] + (a).coeff [2]))
 
 
+/* Haar wavelet coeffs */
+static const WAVELET_COEFF h2 = 1 / sqrt(2);
 
 /* Daubechies D4 wavelet coeffs */   
-static const WAVELET_COEFF h4Norm = 0.25 / sqrt(2), sqrt3 = sqrt(3),
+static const WAVELET_COEFF h4Norm = 0.25 * h2, sqrt3 = sqrt(3),
                            h4 [4] = {
                               h4Norm * (1 + sqrt3), h4Norm * (3 + sqrt3),
                               h4Norm * (3 - sqrt3), h4Norm * (1 - sqrt3)
                            },
                            g4 [4] = { h4 [3], -h4 [2], h4 [1], -h4 [0] };
-/* Haar wavelet coeffs */
-static const WAVELET_COEFF h2 = 1 / sqrt(2);
 
 
 
-#ifdef WAVELET_DBG
-   static void zeroCoeffs (CoeffArray2 y, unsigned l)
-   /* Zero output array to facilitate debugging */
-   {
-      const unsigned len = 1 << l;
-      unsigned       i;
-      
-      for (i = 0; i < len; i++)
-         memset(y [i], 0, len * COEFSIZE);
-   }
+static void zeroCoeffs (WaveletMatrix y, unsigned l)
+/* Zero output array to facilitate debugging */
+{
+   const unsigned lenSq = 1 << (l << 1);
+   memset(y, 0, lenSq * COEFFSIZE);
+}
 
 
 
-   static void dumpCoeffs (const CoeffArray2 y, const CoeffArray2 yt, 
-      unsigned l
-   ) {
-      const unsigned len = 1 << l;
-      unsigned       i, j, k;
+static void dumpCoeffs (const WaveletMatrix y, const WaveletMatrix yt, 
+   unsigned l
+) 
+/* Dump arrays y and yt side-by-side to stdout (skip yt if NULL) */
+{
+   const unsigned len = 1 << l;
+   unsigned       i, j;
 
-      for (i = 0; i < len; i++) {
-         for (j = 0; j < len; j++)
-            printf("%4.3f\t", coeffAvg(y [i][j]));
-            
-         printf("-->\t");
-         for (j = 0; j < len; j++)
-            printf("%4.3f\t", coeffAvg(yt [i][j]));
-            
-         putchar('\n');
+   for (i = 0; i < len; i++) {
+      for (j = 0; j < len; j++)
+         printf("%4.3f\t", coeffAvg(y [WAVELET_IDX(i, j, l)]));
+      
+      if (yt) {
+          printf("--->\t");
+          for (j = 0; j < len; j++)
+             printf("%4.3f\t", coeffAvg(yt [WAVELET_IDX(i, j, l)]));
       }
+         
+      putchar('\n');
    }
-#endif
+}
 
 
 
-static int haarStep (CoeffArray2 y, CoeffArray2 yt, unsigned l)
+static int haarStep (WaveletMatrix y, WaveletMatrix yt, unsigned l)
 /* 
    Single step of forward 2D Haar wavelet transform on array y of size
    (2^l)x(2^l) containing 3-tuples, where l >= 1.  The transform is
    performed over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).
    
    Note the triplets per array element are _not_ decorrelated, but
    transformed independently.
    
    The wavelet coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the generated
    coefficients vertically, and prepares the array for another transform
    over its 2nd axis in a subsequent call (this time as input y).
    
    The result of a subsequent transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    decorrelated.
    
    Returns 0 on success, else -1.
 */
 {
    static unsigned   axis = 0;
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
    
    if (len < 2 || !y || !yt)
       /* Input shorter than wavelet support, or no input/output */
       return -1;
    
 #ifdef WAVELET_DBG
    zeroCoeffs(yt, l);
 #endif
 
    /* NOTE: yt is transposed on the fly such that the next function call
     * transforms over the alternate axis. This is done by simply swapping
     * the indices during assignment */
+   WaveletCoeff3  *y0 = y, *y1 = y + 1;
+   
    for (i = 0; i < len; i++) {
+#if 0
       for (j = 0; j < len; j += 2) {
          h = j >> 1;
          for (k = 0; k < 3; k++) {
             /* Smooth/approx/avg/lowpass */
-            yt [h       ][i].coeff [k] = h2 * (
-               y [i][j    ].coeff [k] + 
-               y [i][j + 1].coeff [k]
+            yt [WAVELET_IDX(h,        i, l)].coeff [k] = h2 * (
+               y [WAVELET_IDX(i, j,     l)].coeff [k] + 
+               y [WAVELET_IDX(i, j + 1, l)].coeff [k]
             );
 
             /* Detail/diff/highpass */
-            yt [hlen + h][i].coeff [k] = h2 * (
-               y [i][j    ].coeff [k] - 
-               y [i][j + 1].coeff [k]
-            );
+            yt [WAVELET_IDX(hlen + h, i, l)].coeff [k] = h2 * (
+               y [WAVELET_IDX(i, j,     l)].coeff [k] -
+               y [WAVELET_IDX(i, j + 1, l)].coeff [k]
+            );               
+         }
+      }
+#else
+      /* Optimised access through pointers to minimise indexing */
+      WaveletCoeff3  *yt0  = yt  + WAVELET_IDX(0,     i, l), 
+                     *yt1  = yt  + WAVELET_IDX(hlen,  i, l);
+      
+      for (j = 0; j < len; 
+         j += 2, yt0 += len, yt1 += len, y0 += 2, y1 += 2
+      ) {
+         h = j >> 1;
+         for (k = 0; k < 3; k++) {
+            /* Smooth/approx/avg/lowpass */
+            yt0 -> coeff [k] = h2 * (y0 -> coeff [k] + y1 -> coeff [k]);
+
+            /* Detail/diff/highpass */
+            yt1 -> coeff [k] = h2 * (y0 -> coeff [k] - y1 -> coeff [k]);
          }
       }
+#endif
    }
 
 #ifdef WAVELET_DBG
    printf("%s FWD HAAR (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
    dumpCoeffs(y, yt, l);
    putchar('\n');
    axis ^= 1;
 #endif
 
    return 0;
 }
 
 
 
-static int haarInvStep (CoeffArray2 y, CoeffArray2 yt, unsigned l)
+static int haarInvStep (WaveletMatrix y, WaveletMatrix yt, unsigned l)
 /* 
    Single step of inverse 2D Haar wavelet transform on coefficient array y
    of size (2^l) x (2^l) containing 3-tuples, where l >= 1.  This reverses
    the forward transform above.  The transform is inverted over y's 2nd axis
    (i.e.  horizontally, assuming row-major addressing as per C convention). 
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the inverted
    coefficients vertically, and prepares the array for another inverse
    transform over its 2nd axis in a subsequent call (this time as input y).
 
    The result of a subsequent inverse transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    inversely transformed.
    
    Returns 0 on success, else -1.
 */
 {
    static unsigned   axis = 1;
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
 
    if (len < 2 || !y || !yt)
       /* Too few coeffs for reconstruction, or no input/output */
       return -1;
 
 #ifdef WAVELET_DBG
    zeroCoeffs(yt, l);
 #endif   
 
    /* NOTE: i, j are swapped relative to the forward transform, as axis
     * order is now reversed. */
    /* NOTE: yt is transposed on the fly such that the next function call
     * inverts over the alternate axis. This is done by simply swapping
     * the indices during assignment */    
+   WaveletCoeff3 *yt0 = yt, *yt1 = yt + 1;
+   
    for (i = 0; i < len; i++) {
+#if 0
       for (j = 0; j < len; j += 2) {
          h = j >> 1;            
          for (k = 0; k < 3; k++) {
-            yt [i][j    ].coeff [k] = h2 * (
-               y [h       ][i].coeff [k] +   /* Avg */
-               y [hlen + h][i].coeff [k]     /* Diff */
+            yt [WAVELET_IDX(i, j,     l)].coeff [k] = h2 * (
+               y [WAVELET_IDX(h,        i, l)].coeff [k] +   /* Avg */
+               y [WAVELET_IDX(hlen + h, i, l)].coeff [k]     /* Diff */
             );
                
-            yt [i][j + 1].coeff [k] = h2 * (
-               y [h       ][i].coeff [k] -   /* Avg */
-               y [hlen + h][i].coeff [k]     /* Diff */
+            yt [WAVELET_IDX(i, j + 1, l)].coeff [k] = h2 * (
+               y [WAVELET_IDX(h,        i, l)].coeff [k] -   /* Avg */
+               y [WAVELET_IDX(hlen + h, i, l)].coeff [k]     /* Diff */
             );
          }
       }
+#else
+      /* Optimised access through pointers to minimise indexing */
+      WaveletCoeff3  *y0   = y   + WAVELET_IDX(0,    i, l),
+                     *y1   = y   + WAVELET_IDX(hlen, i, l);
+                     
+      for (j = 0; j < len; 
+         j += 2, yt0 += 2, yt1 += 2, y0 += len, y1 += len
+      ) {
+         h = j >> 1;            
+         for (k = 0; k < 3; k++) {
+            yt0 -> coeff [k] = h2 * (
+               y0 -> coeff [k] +   /* Avg */
+               y1 -> coeff [k]     /* Diff */
+            );
+               
+            yt1 -> coeff [k] = h2 * (
+               y0 -> coeff [k] -   /* Avg */
+               y1 -> coeff [k]     /* Diff */
+            );
+         }
+      }      
+#endif
    }
 
 #ifdef WAVELET_DBG
    printf("%s INV HAAR (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
    dumpCoeffs(y, yt, l);
    putchar('\n');
    axis ^= 1;
 #endif         
 
    return 0;
 }
 
 
 
-static int d4Step (CoeffArray2 y, CoeffArray2 yt, unsigned l)
+static int d4Step (WaveletMatrix y, WaveletMatrix yt, unsigned l)
 /* 
    Single step of forward 2D Daubechies D4 wavelet transform on array y of
    size (2^l)x(2^l) containing 3-tuples, where l >= 2.  The transform is
    performed over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).
    
    Note the triplets per array element are _not_ decorrelated, but
    transformed independently.
    
    The wavelet coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the generated
    coefficients vertically, and prepares the array for another transform
    over its 2nd axis in a subsequent call (this time as input y).
    
    The result of a subsequent transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    decorrelated.
    
    Returns 0 on success, else -1.
 */
 { 
    static unsigned   axis = 0;
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
    
    if (len < 4 || !y || !yt)
       /* Input shorter than wavelet support, or no input/output */
       return -1;
 
 #ifdef WAVELET_DBG
    zeroCoeffs(yt, l);
 #endif
 
    /* NOTE: yt is transposed on the fly such that the next function call
     * transforms over the alternate axis. This is done by simply swapping
     * the indices during assignment */
    for (i = 0; i < len; i++) {
       /* Transform until upper boundary */
       for (j = 0; j < len - 2; j += 2) {
          h = j >> 1;
          for (k = 0; k < 3; k++) {
             /* Smooth/approx/avg/lowpass */
-            yt [h       ][i].coeff [k] =
-               h4 [0] * y [i][j    ].coeff [k] + 
-               h4 [1] * y [i][j + 1].coeff [k] +
-               h4 [2] * y [i][j + 2].coeff [k] +
-               h4 [3] * y [i][j + 3].coeff [k];
+            yt [WAVELET_IDX(h,        i, l)].coeff [k] =
+               h4 [0] * y [WAVELET_IDX(i, j,     l)].coeff [k] + 
+               h4 [1] * y [WAVELET_IDX(i, j + 1, l)].coeff [k] +
+               h4 [2] * y [WAVELET_IDX(i, j + 2, l)].coeff [k] +
+               h4 [3] * y [WAVELET_IDX(i, j + 3, l)].coeff [k];
 
             /* Detail/diff/highpass */
-            yt [hlen + h][i].coeff [k] =
-               g4 [0] * y [i][j    ].coeff [k] +
-               g4 [1] * y [i][j + 1].coeff [k] +
-               g4 [2] * y [i][j + 2].coeff [k] +
-               g4 [3] * y [i][j + 3].coeff [k];
+            yt [WAVELET_IDX(hlen + h, i, l)].coeff [k] =
+               g4 [0] * y [WAVELET_IDX(i, j,     l)].coeff [k] +
+               g4 [1] * y [WAVELET_IDX(i, j + 1, l)].coeff [k] +
+               g4 [2] * y [WAVELET_IDX(i, j + 2, l)].coeff [k] +
+               g4 [3] * y [WAVELET_IDX(i, j + 3, l)].coeff [k];
          }
       }
 
       /* Transform at upper boundary with wraparound.
          Note j is set to last index from previous loop */
       h = j >> 1;
       for (k = 0; k < 3; k++) {
          /* Smooth/approx/avg/lowpass */
-         yt [h       ][i].coeff [k] =
-            h4 [0] * y [i][j    ].coeff [k] +
-            h4 [1] * y [i][j + 1].coeff [k] +
-            h4 [2] * y [i][0    ].coeff [k] +
-            h4 [3] * y [i][1    ].coeff [k];
+         yt [WAVELET_IDX(h,        i, l)].coeff [k] =
+            h4 [0] * y [WAVELET_IDX(i, j,     l)].coeff [k] +
+            h4 [1] * y [WAVELET_IDX(i, j + 1, l)].coeff [k] +
+            h4 [2] * y [WAVELET_IDX(i, 0,     l)].coeff [k] +
+            h4 [3] * y [WAVELET_IDX(i, 1,     l)].coeff [k];
 
          /* Detail/diff/highpass */
-         yt [hlen + h][i].coeff [k] =
-            g4 [0] * y [i][j    ].coeff [k] +
-            g4 [1] * y [i][j + 1].coeff [k] +
-            g4 [2] * y [i][0    ].coeff [k] +
-            g4 [3] * y [i][1    ].coeff [k];
+         yt [WAVELET_IDX(hlen + h, i, l)].coeff [k] =
+            g4 [0] * y [WAVELET_IDX(i, j,     l)].coeff [k] +
+            g4 [1] * y [WAVELET_IDX(i, j + 1, l)].coeff [k] +
+            g4 [2] * y [WAVELET_IDX(i, 0,     l)].coeff [k] +
+            g4 [3] * y [WAVELET_IDX(i, 1,     l)].coeff [k];
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s FWD D4 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
    dumpCoeffs(y, yt, l);
    putchar('\n');
    axis ^= 1;
 #endif
 
    return 0;
 }
 
 
 
-static int d4InvStep (CoeffArray2 y, CoeffArray2 yt, unsigned l)
+static int d4InvStep (WaveletMatrix y, WaveletMatrix yt, unsigned l)
 /*
    Single step of inverse 2D Daubechies D4 wavelet transform on coefficient array y
    of size (2^l) x (2^l) containing 3-tuples, where l >= 2.  This reverses
    the forward transform above.  The transform is inverted over y's 2nd axis
    (i.e.  horizontally, assuming row-major addressing as per C convention). 
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the inverted
    coefficients vertically, and prepares the array for another inverse
    transform over its 2nd axis in a subsequent call (this time as input y).
 
    The result of a subsequent inverse transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    inversely transformed.
    
    Returns 0 on success, else -1.
 */
 {
    static unsigned   axis = 1;
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
    
    if (len < 4 || !y || !yt)
       /* Too few coeffs for reconstruction, or no input/output */
       return -1;
 
 #ifdef WAVELET_DBG
    zeroCoeffs(yt, l);
 #endif
 
    /* NOTE: i, j are swapped relative to the forward transform, as axis
     * order is now reversed. */
    /* NOTE: yt is transposed on the fly such that the next function call
     * inverts over the alternate axis. This is done by simply swapping
     * the indices during assignment */
    for (i = 0; i < len; i++) {
       /* Invert at lower boundary with wraparound */
       for (k = 0; k < 3; k++) {
-         yt [i][0].coeff [k] =
-            h4 [2] * y [hlen - 1][i].coeff [k] + /* Last avg */
-            g4 [2] * y [len  - 1][i].coeff [k] + /* Last diff */
-            h4 [0] * y [0       ][i].coeff [k] + /* First avg */
-            g4 [0] * y [hlen    ][i].coeff [k];  /* First diff */
+         yt [WAVELET_IDX(i, 0, l)].coeff [k] =
+            h4 [2] * y [WAVELET_IDX(hlen - 1, i, l)].coeff [k] + /* Last avg */
+            g4 [2] * y [WAVELET_IDX(len  - 1, i, l)].coeff [k] + /* Last diff */
+            h4 [0] * y [WAVELET_IDX(0,        i, l)].coeff [k] + /* First avg */
+            g4 [0] * y [WAVELET_IDX(hlen,     i, l)].coeff [k];  /* First diff */
          
-         yt [i][1].coeff [k] =
-            h4 [3] * y [hlen - 1][i].coeff [k] +
-            g4 [3] * y [len  - 1][i].coeff [k] +
-            h4 [1] * y [0       ][i].coeff [k] +
-            g4 [1] * y [hlen    ][i].coeff [k];
+         yt [WAVELET_IDX(i, 1, l)].coeff [k] =
+            h4 [3] * y [WAVELET_IDX(hlen - 1, i, l)].coeff [k] +
+            g4 [3] * y [WAVELET_IDX(len  - 1, i, l)].coeff [k] +
+            h4 [1] * y [WAVELET_IDX(0,        i, l)].coeff [k] +
+            g4 [1] * y [WAVELET_IDX(hlen,     i, l)].coeff [k];
       }
 
       /* Invert until upper boundary */
       for (j = 2; j < len; j += 2) {
          h = (j >> 1) - 1;
          for (k = 0; k < 3; k++) {
-            yt [i][j    ].coeff [k] =
-               h4 [2] * y [h           ][i].coeff [k] + /* Avg */
-               g4 [2] * y [hlen + h    ][i].coeff [k] + /* Diff */
-               h4 [0] * y [h + 1       ][i].coeff [k] + /* Next avg */
-               g4 [0] * y [hlen + h + 1][i].coeff [k];  /* Next diff */
+            yt [WAVELET_IDX(i, j,     l)].coeff [k] =
+               h4 [2] * y [WAVELET_IDX(h,            i, l)].coeff [k] + /* Avg */
+               g4 [2] * y [WAVELET_IDX(hlen + h,     i, l)].coeff [k] + /* Diff */
+               h4 [0] * y [WAVELET_IDX(h + 1,        i, l)].coeff [k] + /* Next avg */
+               g4 [0] * y [WAVELET_IDX(hlen + h + 1, i, l)].coeff [k];  /* Next diff */
                
-            yt [i][j + 1].coeff [k] =
-               h4 [3] * y [h           ][i].coeff [k] +
-               g4 [3] * y [hlen + h    ][i].coeff [k] +
-               h4 [1] * y [h + 1       ][i].coeff [k] +
-               g4 [1] * y [hlen + h + 1][i].coeff [k];
+            yt [WAVELET_IDX(i, j + 1, l)].coeff [k] =
+               h4 [3] * y [WAVELET_IDX(h,            i, l)].coeff [k] +
+               g4 [3] * y [WAVELET_IDX(hlen + h,     i, l)].coeff [k] +
+               h4 [1] * y [WAVELET_IDX(h + 1,        i, l)].coeff [k] +
+               g4 [1] * y [WAVELET_IDX(hlen + h + 1, i, l)].coeff [k];
          }
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s INV D4 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
    dumpCoeffs(y, yt, l);
    putchar('\n');
    axis ^= 1;
 #endif         
 
    return 0;
 }
 
 
 
-CoeffArray2 waveletAllocCoeffArray2 (unsigned l)
-/* 
-   Allocate and return a 2D coefficient array of size (2^l) x (2^l),
-   where l >= 1. Returns NULL if allocation failed.
-*/
-{
-   const unsigned len = 1 << l;
-   unsigned       i;
-   CoeffArray2    y = NULL;
-   
-   if (l >= 1) {
-      if (!(y = calloc(len, sizeof(Coeff3*))))
-         return NULL;
-      
-      for (i = 0; i < len; i++)
-         if (!(y [i] = calloc(len, COEFSIZE)))
-            return NULL;
-   }
-   
-   return y;
-}
-
-
-   
-void waveletFreeCoeffArray2 (CoeffArray2 y, unsigned l)
-/*
-   Free previously allocated 2D coefficient array y of size (2^l) x (2^l)
-*/  
-{
-   unsigned       i, j;
-   const unsigned len = 1 << l;
-   
-   if (y) {
-      for (i = 0; i < len; i++)
-         free(y [i]);
-      
-      free(y);
-   }
-}
-
-
-
-int waveletXform2 (CoeffArray2 y, CoeffArray2 yt, unsigned l)
+int waveletXform2 (WaveletMatrix y, WaveletMatrix yt, unsigned l)
 /*
    Perform full 2D multiresolution forward wavelet transform on array y of
    size (2^l) x (2^l) containing original signal as 3-tuples, where l >= 1. 
    Note no intra-tuple transform occurs.
    
    The wavelet coefficients are returned in array y, containing the coarsest
-   approximation in y [0][0] followed by horizontal/vertical details in
+   approximation in y [0] followed by horizontal/vertical details in
    order of increasing resolution/frequency.
    
    A preallocated array yt of identical dimensions to y can be supplied as
    buffer for intermediate results.  If yt == NULL, a buffer is
    automatically allocated and freed on demand, but this is inefficient for
    frequent calls.  It is recommended to preallocate yt to the maximum
-   expected size.  The dimensions of yt are not checked; this is the
-   caller's responsibility.
+   expected size.  
+   WARNING: The dimensions of yt are not checked; this is the caller's 
+   responsibility.
       
    Returns 0 on success, else -1.
 */
 {
    const unsigned len = 1 << l;
    unsigned       li;
-   CoeffArray2    ytloc = NULL;
+   WaveletMatrix  ytloc = NULL;
 
    /* Skip transform if input too short or missing */
    if (l < 1 || !y)
       return -1;
       
    if (!yt)
       /* No buffer supplied; allocate one on demand */
-      if (!(yt = ytloc = waveletAllocCoeffArray2(l))) {
+      if (!(yt = ytloc = WAVELET_ALLOC(l))) {
          fprintf(stderr, "ERROR - Failed allocating %dx%d buffer array"
             " in WaveletXform2()", len, len);
          return -1;
       }
-      
+
    for (li = l; li > 1; li--) {
+#if 0
       /* Apply horizontal & vertical Daubechies D4 transform, swapping input
          and transposed output array */
       if (d4Step(y, yt, li) || d4Step(yt, y, li))
+#else
+      if (haarStep(y, yt, li) || haarStep(yt, y, li))
+#endif
          return -1;
    }
    
    /* Apply horizontal & vertical Haar transform at coarsest resolution
-      (li==1) to obtain single approximation coefficient at y [0][0]; all
+      (li==1) to obtain single approximation coefficient at y [0]; all
       other coeffs are details.  */
    if (haarStep(y, yt, li) || haarStep(yt, y, li))
       return -1;
    
    /* NOTE: All coefficients now in y */
    
    if (ytloc)
       /* Free yt if allocated on demand */
-      waveletFreeCoeffArray2(ytloc, l);
+      free(ytloc);
    
    return 0;
 }
 
 
 
-int waveletInvXform2 (CoeffArray2 y, CoeffArray2 yt, unsigned l)
+int waveletInvXform2 (WaveletMatrix y, WaveletMatrix yt, unsigned l)
 /* 
    Perform full 2D multiresolution inverse wavelet transform on array y of
    size (2^l) x (2^l) containing wavelet coefficients as 3-tuples, where
    l >= 1.  Note no intra-tuple transform occurs.
 
    A preallocated array yt of identical dimensions to y can be supplied as
    buffer for intermediate results.  If yt == NULL, a buffer is
    automatically allocated and freed on demand, but this is inefficient for
    frequent calls.  It is recommended to preallocate yt to the maximum
-   expected size.  The dimensions of yt are not checked; this is the
-   caller's responsibility.
+   expected size.  
+   WARNING: The dimensions of yt are not checked; this is the caller's 
+   responsibility.
    
    The reconstructed signal is returned in array y. 
    Returns 0 on success, else -1.
 */
 {
    const unsigned len = 1 << l;
    unsigned       li;
-   CoeffArray2    ytloc = NULL;
+   WaveletMatrix  ytloc = NULL;
 
    /* Skip inverse transform if input too short or missing */
    if (l < 1 || !y)
       return -1;
 
    if (!yt)
       /* No buffer supplied; allocate one on demand */
-      if (!(yt = ytloc = waveletAllocCoeffArray2(l))) {
+      if (!(yt = ytloc = WAVELET_ALLOC(l))) {
          fprintf(stderr, "ERROR - Failed allocating %dx%d buffer array"
             " in WaveletInvXform2()", len, len);
          return -1;
       }
 
+
    /* Invert horizontal & vertical Haar transform at coarsest level (li==1),
       swapping input and transposed output array */
    if (haarInvStep(y, yt, 1) || haarInvStep(yt, y, 1))
       return -1;
       
    for (li = 2; li <= l; li++) {
+#if 0
       /* Invert horizontal & vertical Daubechies D4 transform, swapping 
          input and transposed output arrays */
       if (d4InvStep(y, yt, li) || d4InvStep(yt, y, li))
+#else
+      if (haarInvStep(y, yt, li) || haarInvStep(yt, y, li))
+#endif
          return -1;
    }
    
    /* NOTE: Reconstructed signal now in y */
 
    if (ytloc)
       /* Free yt if allocated on demand */
-      waveletFreeCoeffArray2(ytloc, l);
+      free(ytloc);
    
    return 0;
 }
 
 
 
 #ifdef WAVELET_TEST
    #include <stdio.h>
    
    #define ERRSTRLEN 1024
 
    int main (int argc, char *argv [])
    {
-      int         i, j, k, l;
-      unsigned    len;
-      CoeffArray2 y0 = NULL, y = NULL;
-      char        errstr [ERRSTRLEN];
-      FILE        *dataFile = NULL;
-      float       inData;
+      int            i, j, k, l;
+      unsigned       len, idx;
+      WaveletMatrix  y0 = NULL, y = NULL;
+      char           errstr [ERRSTRLEN];
+      FILE           *dataFile = NULL;
+      float          inData;
       
       if (argc < 2) {
          fprintf(stderr, "%s <l> [dataFile]\n", argv [0]);
          fputs("Missing array resolution l > 1\n", stderr);
          return -1;
       }
       if (!(l = atoi(argv [1])) || l < 1) {
          fputs("Invalid array resolution l\n", stderr);
          return -1;
       }
       else len = 1 << l;
       
       /* Allocate arrays for original and reconstruction */
-      if (!(y0 = waveletAllocCoeffArray2(l)) || 
-         !(y = waveletAllocCoeffArray2(l))
-      ) {
+      if (!(y0 = WAVELET_ALLOC(l)) || !(y = WAVELET_ALLOC(l))) {
          fprintf(stderr, "Failed allocating %dx%d array\n", len, len);
          return -1;
       }
 
       if (argc > 2) {
          /* Load data from file; length must not exceed allocated */
          if (!(dataFile = fopen(argv [2], "r"))) {
             fprintf(stderr, "Failed opening data file %s\n", argv [2]);
             return -1;
          }
       
          for (i = 0; i < len; i++) {
             for (j = 0; j < len; j++) {
                if (feof(dataFile)) {
                   fprintf(stderr, 
                      "Premature end of file reading data from %s\n",
                      argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
-            
+               
                /* Read next float, skipping any leading whitespace */
                if (fscanf(dataFile, " %f", &inData)) {
-                  y0 [i][j].coeff [0] = y0 [i][j].coeff [1] = 
-                  y0 [i][j].coeff [2] = y [i][j].coeff [0] = 
-                  y [i][j].coeff [1] = y [i][j].coeff [2] =
+                  idx = WAVELET_IDX(i, j, l);
+                  
+                  y0 [idx].coeff [0] = y0 [idx].coeff [1] = y0 [idx].coeff [2] = 
+                     y [idx].coeff [0] = y [idx].coeff [1] = y [idx].coeff [2] = 
                      (WAVELET_COEFF)inData;
                }
                else {
                   fprintf(stderr, 
                      "Error reading from data file %s\n", argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
             }
          }
          
          fclose(dataFile);
       }
       else {
          /* Init with random data */
          srand48(111);
          for (i = 0; i < len; i++)
-            for (j = 0; j < len; j++)
+            for (j = 0; j < len; j++) {
+               idx = WAVELET_IDX(i, j, l);
    #if 1
                for (k = 0; k < 3; k++)
-                  y0 [i][j].coeff [k] = y [i][j].coeff [k] = drand48();
+                  y0 [idx].coeff [k] = y [idx].coeff [k] = drand48();
    #else
-               y0 [i][j].coeff [0] = y0 [i][j].coeff [1] = 
-                  y0 [i][j].coeff [2] = y [i][j].coeff [0] = 
-                  y [i][j].coeff [1] = y [i][j].coeff [2] = drand48();
+               y0 [idx].coeff [0] = y0 [idx].coeff [1] = y0 [idx].coeff [2] = 
+                  y [idx].coeff [0] = y [idx].coeff [1] = y [idx].coeff [2] = drand48();
    #endif
+         }
       }
       
       /* Forward xform */
       if (waveletXform2(y, NULL, l)) {
          fputs("Forward xform failed\n", stderr);
          return -1;
       }
 
    #ifdef WAVELET_DBG
       /* Dump coefficients */
       puts("-----------------------------------------------------------\n");
-      for (i = 0; i < len; i++) {
-         for (j = 0; j < len; j++)
-            printf("%4.3f\t", coeffAvg(y [i][j]));
-         
-         putchar('\n');
-      }
+      dumpCoeffs(y, NULL, l);
       puts("\n-----------------------------------------------------------\n");
    #endif
    
       /* Inverse xform */
       if (waveletInvXform2(y, NULL, l)) {
          fputs("Inverse xform failed\n", stderr);
          return -1;
       }
    
    #ifdef WAVELET_DBG
       puts("-----------------------------------------------------------\n");
-
       puts("ORIG vs. INV XFORM");
-      for (i = 0; i < len; i++) {
-         for (j = 0; j < len; j++)
-            printf("%4.3f\t", coeffAvg(y0 [i][j]));
-         
-         printf("\t\t");         
-         for (j = 0; j < len; j++)
-            printf("%4.3f\t", coeffAvg(y [i][j]));
-            
-         putchar('\n');
-      }
+      dumpCoeffs(y0, y, l);
    #endif
    
-      waveletFreeCoeffArray2(y0, l);
-      waveletFreeCoeffArray2(y, l);
+      free(y0);
+      free(y);
       return 0;
    }
 #endif
diff --git a/wavelet2.h b/wavelet2.h
index 2b1c698..5494e2e 100644
--- a/wavelet2.h
+++ b/wavelet2.h
@@ -1,81 +1,89 @@
 /* 
    =========================================================================
    2-dimensional wavelet transform on (2^l) x (2^l) sized arrays of 3-tuples,
    where l >= 1.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 #ifndef _WAVELET2_H
    #define _WAVELET2_H
 
-   /* Wavelet coefficient type; defaults to double if not already defined.
-      NOTE: single precision float may improve cache performance during 
-      2D transform with large arrays. */
+   /* Wavelet coefficient type; defaults to double if not already defined
+    * (may be overridden in compiler command line, for example).
+    * NOTE: single precision float may improve cache performance during 
+    * 2D transform with large arrays. */
    #ifndef WAVELET_COEFF
-      #define WAVELET_COEFF double
+      #define WAVELET_COEFF      double
    #endif
 
-
+   /* Wavelet coefficient matrices defs.  These are stored as flattened 2D
+    * arrays to ensure they are allocated in a contiguous block.  This
+    * simplifies interfacing to calling routines, and improves cache
+    * performance through localised access.  */
    typedef struct {
-      WAVELET_COEFF coeff [3];
-   } Coeff3;
-   typedef Coeff3 **CoeffArray2;
-   
+      WAVELET_COEFF              coeff [3];
+   } WaveletCoeff3;
+   typedef WaveletCoeff3         *WaveletMatrix;
    
-   CoeffArray2 waveletAllocCoeffArray2 (unsigned l);
-   /* 
-      Allocate and return a 2D coefficient array of size (2^l) x (2^l),
-      where l >= 1. Returns NULL if allocation failed.
-   */
+   #define COEFFSIZE             (sizeof(WaveletCoeff3))
+
+   /* Allocate wavelet coefficient matrix of size 2^l per dimension.
+      NOTE: Matrices are stored as contiguous flattened 2D arrays for cache
+      optimisation */
+   #define WAVELET_ALLOC(l)      calloc(1L << (l << 1), COEFFSIZE)   
+
+    
+   /* Map 2D matrix indices to linear 1D index for size 2^l per dimension */
+   #define WAVELET_IDX(y, x, l)  (((y) << l) + (x))
+   /* Map linear 1D index to 2D matrix indices for size 2^l per dimension */
+   #define WAVELET_Y(i, l)       ((i) >> l)
+   #define WAVELET_X(i, l)       ((i) & (1 << l) - 1)
    
-   void waveletFreeCoeffArray2 (CoeffArray2 y, unsigned l);
-   /*
-      Free previously allocated 2D coefficient array y of size (2^l) x (2^l)
-   */ 
    
-   int waveletXform2 (CoeffArray2 y, CoeffArray2 yt, unsigned l);
-   /*
+   int waveletXform2 (WaveletMatrix y, WaveletMatrix yt, unsigned l);
+   /* 
       Perform full 2D multiresolution forward wavelet transform on array y
       of size (2^l) x (2^l) containing original signal as 3-tuples, where 
       l >= 1.  Note no intra-tuple transform occurs.
       
       The wavelet coefficients are returned in array y, containing the
-      coarsest approximation in y [0][0] followed by horizontal/vertical
+      coarsest approximation in y [0] followed by horizontal/vertical
       details in order of increasing resolution/frequency.
       
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  Ideally, yt should be preallocated to the maximum
-      expected size.  The dimensions of a preallocated yt are not checked;
-      this is the caller's responsibility.
+      expected size.  
+      WARNING: The dimensions of a preallocated yt are not checked; this is 
+      the caller's responsibility.
          
       Returns 0 on success, else -1.   
    */
    
-   int waveletInvXform2 (CoeffArray2 y, CoeffArray2 yt, unsigned l);
+   int waveletInvXform2 (WaveletMatrix y, WaveletMatrix yt, unsigned l);
    /* 
       Perform full 2D multiresolution inverse wavelet transform on array y
       of size (2^l) x (2^l) containing wavelet coefficients as 3-tuples,
       where l >= 1.  Note no intra-tuple transform occurs.
 
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  Ideally, yt should be preallocated to the maximum
       expected size.  The dimensions of a preallocated yt are not checked;
       this is the caller's responsibility.
       
       The reconstructed signal is returned in array y. 
       Returns 0 on success, else -1.
    */
 #endif