diff --git a/mrgbe.h b/mrgbe.h
index cb69fdb..b264360 100644
--- a/mrgbe.h
+++ b/mrgbe.h
@@ -1,112 +1,113 @@
 /* 
    =========================================================================
    mRGBE (miniRGBE): reduced RGBE encoding of normalised RGB floats plus
    associated integer payload data.  By default, this encoding consists of:
    
       - three signed 5-bit mantissas for each RGB colour channel,
       - a common 5-bit exponent,
       - and an associated 12-bit integer datum.
       
    However, this allocation can be modified within a 32-bit envelope if more
    precision is required by modifying mRGBE_MANTBITS, mRGBE_EXPBITS and
    mRGBE_DATABITS.
       
    The encoded float RGB tuples are assumed to lie in the range [-1, 1],
    with values in the interval [-mRGBE_MIN, mRGBE_MIN] clamped to 0, since
    these cannot be resolved.  The encoded values are therefore normalised
    against a given normalisation factor (e.g.  absolute maximum) in order to
    maximise the useful range.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 #ifndef _mRGBE_H
    #define _mRGBE_H
    
    #include <stdint.h>
    
    
    /* Mantissa / exponent / payload data bits and their encoding ranges.
       NOTE: binary shifts can overflow if the number of bits is increased;
       in this case shifts must either be promoted to 64 bits (possibly at
       the expense of portability), or replaced by much slower pow(2, ..) */
    #define mRGBE_MANTBITS  6
    #define mRGBE_EXPBITS   5
    #define mRGBE_DATABITS  9
    
-   /* Ensure mRGBE accommodates 32 bits (and not more!) */
+   /* Make sure the above configuration occupies a 32 bit envelope */
    #if (3*mRGBE_MANTBITS + mRGBE_EXPBITS + mRGBE_DATABITS != 32)
       #error Invalid mRGBE bit configuration
    #endif
    
    /* Rounding amplitude for encoding
       NOTE: unit test indicates this actually _increases_ RMSE! */
    #define mRGBE_ROUND     0
    /* Jitter amplitude for decoding */
    #define mRGBE_JITTER    0.5
    /* Jitter encoded zeros (this may not always be desirable, depending on
     * the application */
    /* #define mRGBE_ZEROJITTER */
       
    /* Signed maximum for mantissa */
    #define mRGBE_MANTMAX   (1L << (mRGBE_MANTBITS - 1))
    /* Absolute overflow limit for mantissa (sign flips if exceeded) */
    #define mRGBE_MANTOVF   ((mRGBE_MANTMAX << 1) - 1)   
    /* HACK: Factorisation of mRGBE_MIN prevents integer overflow for 
       mRGBE_EXPBITS <= 6; further factorisation is required for larger
       exponent sizes, else this code will trigger warnings or errors! */
    #define mRGBE_MIN       (1.0 /            \
       (1L << (1 << (mRGBE_EXPBITS - 1))) /   \
       (1L << (1 << (mRGBE_EXPBITS - 1)))     \
    )
    #define mRGBE_MAX       1.0
    #define mRGBE_RANGE     (mRGBE_MAX - mRGBE_MIN)
    #define mRGBE_DATAMAX   ((1U << mRGBE_DATABITS) - 1)
 
 
    typedef union {
       struct {
-         /* (mini)RGBE representation consisting of a 4-bit mantissa + sign
-          * per RGB, a common 5-bit exponent, and a 12-bit payload data */
-         unsigned red: mRGBE_MANTBITS,
+         /* (mini)RGBE representation consisting of MANTBITS-bit mantissa
+            (incl. sign bit) per RGB, a common (implictly negative) 
+            EXPBITS-bit exponent, and DATABITS-bit payload data */
+         unsigned red: mRGBE_MANTBITS,    /* Includes sign bit each */
                   grn: mRGBE_MANTBITS,
                   blu: mRGBE_MANTBITS,
-                  exp: mRGBE_EXPBITS,
+                  exp: mRGBE_EXPBITS,     /* Implicitly 2^(-exp) */
                   dat: mRGBE_DATABITS;
       };
       
       uint32_t all;
    } mRGBE;
 
    /* State for mRGBE encoding to offset and normalise floats in order to
     * optimally utilise the encoding range */
    typedef struct {
       double min [3], max [3], norm [3];
    } mRGBERange;
    
    
    mRGBERange *mRGBEinit (mRGBERange *range, 
       double rgbMin [3], double rgbMax [3]
    );
    /* Initialise range state for mRGBEencode()/mRGBEdecode() with
     * the specified float input ranges [rgbMin, rgbMax] in _absolute_ values
     * per colour channel.  This sets the offset and normalisation for the
     * encoding.  */
    
    mRGBE mRGBEencode (double rgb [3], const mRGBERange *range, unsigned data);
    /* Encode signed float RGB tuple to mRGBE along with payload data.  The
     * supplied range state must be previously initialised with mRGBEinit(). 
     * A warning is issued if rgb lies outside the initialised range.  */
    
    unsigned mRGBEdecode (mRGBE mrgbe, const mRGBERange *range, double rgb [3]);
    /* Decode mRGBE, returning signed float RGB tuple in array rgb.  The
     * associated payload data is decoded as return value.  The supplied
     * range state must be previously initialised with mRGBEinit().  */
 #endif
diff --git a/pmapcontrib.c b/pmapcontrib.c
index 9d15bc1..af21f49 100644
--- a/pmapcontrib.c
+++ b/pmapcontrib.c
@@ -1,1545 +1,1547 @@
 #ifndef lin
 static const char RCSid[] = "$Id: pmapcontrib.c,v 2.20 2021/02/16 20:06:06 greg Exp $";
 #endif
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions. 
    These routines handle contribution binning, compression and encoding,
    and are used by mkpmap.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id: pmapcontrib.c,v 2.20 2021/02/16 20:06:06 greg Exp $
 */
 
 
 #include "pmapcontrib.h"
 
 #ifdef PMAP_CONTRIB
 #include "pmapdiag.h"
 #include "pmaprand.h"
 #include "pmapmat.h"
 #include "pmapsrc.h"
 #include "pmutil.h"
 #include "otspecial.h"
 #include "otypes.h"
 #include "lookup.h"
 #if NIX
    #include <sys/mman.h>
    #include <sys/wait.h>
 #endif
 
 
 
 /* The following are convenient placeholders interfacing to mkpmap 
    and rcontrib. They can be externally set via initPmapContribTab()
    and referenced within the contrib pmap modules. These variables 
    can then be safely ignored by rtrace/rpict/rvu, without annoying
    linking errors. */
 /* Global pointer to rcontrib's contribution binning LUT */
 LUTAB *pmapContribTab = NULL;
 /* Contribution/coefficient mode flag */
 int *pmapContribMode;
 
 #ifdef PMAP_CONTRIB_DBG
    /* Sum/num of mRGBE encoding errors for wavelet coeffs */
    static WAVELET_COEFF mrgbeDiffs = 0;
    static unsigned long mrgbeDiffCnt = 0;
 #endif
 
 
 
 static int ray2bin (const RAY *ray, unsigned scDim)
 /* Map ray dir (pointing away from origin) to its 1D bin in an
    (scDim x scDim) Shirley-Chiu square.
    Returns -1 if mapped bin is invalid (e.g. behind plane) */
 {
    static int        scRHS, varInit = 0;
    static FVECT      scNorm, scUp;
    unsigned          scBin [2];
    FVECT             diskPlane;
    RREAL             dz, diskx, disky, rad, diskd2, scCoord [2];
 
    if (!varInit) {
       /* Lazy init shirley-Chiu mapping orientation from function variables */
       scRHS       = varvalue(PMAP_CONTRIB_SCRHS);
       scNorm [0]  = varvalue(PMAP_CONTRIB_SCNORMX);
       scNorm [1]  = varvalue(PMAP_CONTRIB_SCNORMY);
       scNorm [2]  = varvalue(PMAP_CONTRIB_SCNORMZ);
       scUp [0]    = varvalue(PMAP_CONTRIB_SCUPX);
       scUp [1]    = varvalue(PMAP_CONTRIB_SCUPY);
       scUp [2]    = varvalue(PMAP_CONTRIB_SCUPZ);
       varInit ^= 1;
    }
 
    /* Map incident direction to disk */
    dz = DOT(ray -> rdir, scNorm);                         /* normal proj */
    if (dz > 0) {
       fcross(diskPlane, scUp, scNorm);    /* y-axis in plane, perp to up */
       /* normalize(diskPlane); NEED TO NORMALISE? */
       diskPlane [0] *= scRHS; 
       diskx = DOT(ray -> rdir, diskPlane);
       disky = DOT(ray -> rdir, scUp);
 
       /* Apply Shirley-Chiu mapping of (diskx, disky) to square */
       disk2square(scCoord, diskx, disky);
       /* Map Shirley-Chiu square coords to 1D bin */
       scBin [0] = scCoord [0] * scDim;
       scBin [1] = scCoord [1] * scDim;
       return PMAP_CONTRIB_XY2LIN(scDim, scBin [0], scBin [1]);
    }
    else return -1;
 }
 
 
 /* ------------------ CONTRIBSRC  STUFF --------------------- */
 
 
 #ifndef PMAP_CONTRIB_TEST
    MODCONT *addContribModifier (LUTAB *contribTab, unsigned *numContribs, 
       char *mod, char *binParm, int binCnt
    )
    /* Add light source modifier mod to contribution lookup table contribTab,
       and update numContribs.  Return initialised contribution data for this
       modifier. This code adapted from rcontrib.c:addmodifier(). */
    {
       LUENT    *lutEntry = lu_find(contribTab, mod);
       MODCONT  *contrib;
       EPNODE   *eBinVal;
       unsigned numCoeffs;
 
       /* Warn if potential coefficient index overflow in mRGBE encoding.
          This requires getting the number of wavelet coefficients generated
          by the transform a priori. */
       if (!(numCoeffs = padWaveletXform2(NULL, NULL, sqrt(binCnt), NULL))) {
          sprintf(errmsg, "can't determine number of wavelet coefficients "
             "for modifier %s", mod
          );
          error(INTERNAL, errmsg);
       }
          
       if (numCoeffs * numCoeffs > PMAP_CONTRIB_MAXCOEFFS) {
          sprintf(errmsg, 
             "mRGBE data field may overflow for modifier %s; reduce -bn "
             "and/or compression if contribution precomputation fails", mod
          );
          error(WARNING, errmsg);
       }
        
       if (lutEntry -> data) {
          /* Reject duplicate modifiers */
          sprintf(errmsg, "duplicate light source modifier %s", mod);
          error(USER, errmsg);
       }
       
       if (*numContribs >= MAXMODLIST) {
          sprintf(errmsg, "too many modifiers (max. %d)", MAXMODLIST);
          error(INTERNAL, errmsg);
       }
       
       lutEntry -> key = mod;
       
       if (binCnt <= 0) {
          sprintf(errmsg, "undefined/invalid bin count for modifier %s", mod);
          error(USER, errmsg);
       }
       
       /* Allocate and init contributions */
       contrib = (MODCONT *)malloc(
          sizeof(MODCONT) + sizeof(DCOLOR) * (binCnt - 1)
       );
       if (!contrib)
          error(SYSTEM, "out of memory in addContribModifier()");
       contrib -> outspec   = NULL;
       contrib -> modname   = mod;
       contrib -> params    = binParm;
       contrib -> nbins     = binCnt;
       contrib -> binv      = eBinVal;
       contrib -> bin0      = 0;
       memset(contrib -> cbin, 0, sizeof(DCOLOR) * binCnt);
       lutEntry -> data = lutEntry -> data = (char *)contrib;
       ++(*numContribs);
 
       return(contrib);
    }
 
 
 
    void addContribModfile (LUTAB *contribTab, unsigned *numContribs, 
       char *modFile, char *binParm, int binCnt
    )
    /* Add light source modifiers from file modFile to contribution lookup table
     * contribTab, and update numContribs. 
     * NOTE: This code is adapted from rcontrib.c */
    {
       char  *mods [MAXMODLIST];
       int   i;
       
       /* Find file and store strings */
       i = wordfile(mods, MAXMODLIST, getpath(modFile, getrlibpath(), R_OK));
       if (i < 0) {
          sprintf(errmsg, "can't open modifier file %s", modFile);
          error(SYSTEM, errmsg);
       }
       if (*numContribs + i >= MAXMODLIST - 1) {
          sprintf(errmsg, "too many modifiers (max. %d) in file %s", 
             MAXMODLIST - 1, modFile
          );
          error(INTERNAL, errmsg);
       }
 
       for (i = 0; mods [i]; i++)
          /* Add each modifier */
          addContribModifier(contribTab, numContribs, 
             mods [i], binParm, binCnt
          );
    }
 
 
 
    static int contribSourceBin (LUTAB *contribs, const RAY *ray)
    /* Map contribution source ray to its bin for light source ray -> rsrc,
       using Shirley-Chiu disk-to-square mapping.
       Return invalid bin -1 if mapping failed. */
    {
       const SRCREC   *src;
       const OBJREC   *srcMod;
       const MODCONT  *srcCont;
       RAY            srcRay;
       int            bin, i;
 
       /* Check we have a valid ray and contribution LUT */
       if (!ray || !contribs)
          return -1;
 
       src = &source [ray -> rsrc];
       srcMod = findmaterial(src -> so);
       srcCont = (MODCONT*)lu_find(contribs, srcMod -> oname) -> data;
 
       if (!srcCont)
          /* Not interested in this source (modifier not in contrib LUT) */
          return -1;
 
       /* Set up shadow ray pointing to source for disk2square mapping */
       rayorigin(&srcRay, SHADOW, NULL, NULL);
       srcRay.rsrc = ray -> rsrc;
       VCOPY(srcRay.rorg, ray -> rop);
       for (i = 0; i < 3; i++)
          srcRay.rdir [i] = -ray -> rdir [i];
               
       if (!(src -> sflags & SDISTANT 
          ? sourcehit(&srcRay) 
          : (*ofun[srcMod -> otype].funp)(srcMod, &srcRay)
       ))
          /* (Redundant?) sanity check for valid source ray? */
          return -1;
    #if 0
       worldfunc(RCCONTEXT, &srcRay);
    #endif      
       set_eparams((char*)srcCont -> params);
 
       if ((bin = ray2bin(&srcRay, sqrt(srcCont -> nbins))) < 0)
          error(WARNING, "Ignoring invalid bin in contribSourceBin()");
 
       return bin;
    }
 
 
 
    PhotonContribSourceIdx newPhotonContribSource (PhotonMap *pmap, 
       const RAY *contribSrcRay, FILE *contribSrcHeap
    )
    /* Add contribution source ray for emitted contribution photon and save 
     * light source index and binned direction. The current contribution 
     * source is stored in pmap -> lastContribSrc. If the previous contrib
     * source spawned photons (i.e. has srcIdx >= 0), it's appended to 
     * contribSrcHeap. If contribSrcRay == NULL, the current contribution 
     * source is still flushed, but no new source is set. Returns updated 
     * contribution source counter pmap -> numContribSrc. */
    {
       if (!pmap || !contribSrcHeap)
          return 0;
          
       /* Check if last contribution source has spawned photons (srcIdx >= 0, 
        * see newPhoton()), in which case we save it to the heap file before 
        * clobbering it. (Note this is short-term storage, so we doan' need 
        * da portable I/O stuff here). */
       if (pmap -> lastContribSrc.srcIdx >= 0) {
          if (!fwrite(&pmap -> lastContribSrc, sizeof(PhotonContribSource), 
             1, contribSrcHeap
          ))
             error(SYSTEM, "failed writing photon contrib source in "
                "newPhotonContribSource()"
             );
             
          pmap -> numContribSrc++;
          if (!pmap -> numContribSrc || 
             pmap -> numContribSrc > PMAP_MAXCONTRIBSRC
          )
             error(INTERNAL, "contribution source overflow");
       }
 
       if (contribSrcRay) {
          /* Mark this contribution source unused with a negative source
             index until its path spawns a photon (see newPhoton()) */
          pmap -> lastContribSrc.srcIdx = -1;
 
          /* Map ray to bin in anticipation that this contrib source will be
             used, since the ray will be lost once a photon is spawned */
          pmap -> lastContribSrc.srcBin = contribSourceBin(
             pmapContribTab, contribSrcRay
          );
 
          if (pmap -> lastContribSrc.srcBin < 0) {
             /* Warn if invalid bin, but trace photon nonetheless. It will
                count as emitted to prevent bias, but will not be stored in
                newPhoton(), as it contributes zero flux */
             sprintf(errmsg, "invalid bin for light source %s, "
                "contribution photons will be discarded", 
                source [contribSrcRay -> rsrc].so -> oname
             );
             error(WARNING, errmsg);
          }
       }
 
       return pmap -> numContribSrc;
    }
 
 
 
    PhotonContribSourceIdx buildContribSources (PhotonMap *pmap, 
       FILE **contribSrcHeap, char **contribSrcHeapFname, 
       PhotonContribSourceIdx *contribSrcOfs, unsigned numHeaps
    )
    /* Consolidate per-subprocess contribution source heaps into array
     * pmap -> contribSrc. Returns offset for contribution source index 
     * linearisation in pmap -> numContribSrc. The heap files in 
     * contribSrcHeap are closed on return. */
    {
       PhotonContribSourceIdx  heapLen;
       unsigned                heap;
 
       if (!pmap || !contribSrcHeap || !contribSrcOfs || !numHeaps)
          return 0;
 
       pmap -> numContribSrc = 0;
 
       for (heap = 0; heap < numHeaps; heap++) {
          contribSrcOfs [heap] = pmap -> numContribSrc;
          
          if (fseek(contribSrcHeap [heap], 0, SEEK_END) < 0)
             error(SYSTEM, "failed photon contrib source seek "
                "in buildContribSources()"
             );
          pmap -> numContribSrc += heapLen = 
             ftell(contribSrcHeap [heap]) / sizeof(PhotonContribSource);
 
          if (!(pmap -> contribSrc = realloc(pmap -> contribSrc, 
             pmap -> numContribSrc * sizeof(PhotonContribSource)
          ))) 
             error(SYSTEM, "failed photon contrib source alloc "
                "in buildContribSources()"
             );
 
          rewind(contribSrcHeap [heap]);
          if (fread(pmap -> contribSrc + contribSrcOfs [heap], 
             sizeof(PhotonContribSource), heapLen, contribSrcHeap [heap]
          ) != heapLen)
             error(SYSTEM, "failed reading photon contrib source "
                "in buildContribSources()"
             );
          
          fclose(contribSrcHeap [heap]);
          unlink(contribSrcHeapFname [heap]);
       }
       
       return pmap -> numContribSrc;
    }
 
 
 
    /* ----------------- CONTRIB PMAP ENCODING STUFF -------------------- */
 
    static void coeffSwap (PreComputedContribCoeff *c1,
       PreComputedContribCoeff *c2
    ) {
       PreComputedContribCoeff tCoeff;
 
       tCoeff.coeff      = c1 -> coeff;
       tCoeff.idx        = c1 -> idx;
       c1 -> coeff       = c2 -> coeff;
       c1 -> idx         = c2 -> idx;
       c2 -> coeff       = tCoeff.coeff;
       c2 -> idx         = tCoeff.idx;
    }
 
 
    
    static int coeffPartition (PreComputedContribCoeff *coeffs, 
       unsigned medianPos, unsigned left, unsigned right
    )
    /* REVERSE partition coefficients by magnitude, such that 
       coeffs [left..medianPos] >= coeffs [medianPos+1..right].
       Returns median's position. */
    {
       unsigned long           l, r, m;
       WAVELET_COEFF           lVal, rVal, mVal, tVal;
       
       #define COEFFVAL(c)     (DOT((c) -> coeff, (c) -> coeff))
 
       if (left < right) {
          /* Select pivot from median-of-three and move to photons [right] 
             (a.k.a. Lomuto partitioning) */
          l = left;
          r = right;
          m = l + ((r - l) >> 1);   /* Avoids overflow vs. (l+r) >> 1 */
          lVal = COEFFVAL(coeffs + l);
          rVal = COEFFVAL(coeffs + r);
          mVal = COEFFVAL(coeffs + m);
          if (mVal > lVal) {
             coeffSwap(coeffs + m, coeffs + l);
             tVal  = mVal;
             mVal  = lVal;
             lVal  = tVal;
          }
          if (rVal > lVal) {
             coeffSwap(coeffs + r, coeffs + l);
             tVal  = rVal;
             rVal  = lVal;
             lVal  = tVal;            
          }
          if (mVal > rVal) {
             coeffSwap(coeffs + m, coeffs + r);
             tVal  = mVal;
             mVal  = rVal;
             rVal  = tVal;
          }
          /* Pivot with key rVal is now in coeffs [right] */
 
          /* l & r converge, swapping coefficients out of (reversed) order
             with respect to pivot.  The convergence point l = r is the
             pivot's final position */
          while (l < r) {
             while (l < r && COEFFVAL(coeffs + l) > rVal)
                l++;
             while (r > l && COEFFVAL(coeffs + r) <= rVal)
                r--;
                
             /* Coeffs out of order, must swap */
             if (l < r)
                coeffSwap(coeffs + l, coeffs + r);
          };
          
          /* Now l == r is pivot's final position; swap these coeffs */
          coeffSwap(coeffs + l, coeffs + right);
          
          /* Recurse in partition containing median */
          if (l > medianPos)
             return coeffPartition(coeffs, medianPos, left, l - 1);
          else if (l < medianPos)
             return coeffPartition(coeffs, medianPos, l + 1, right);
          else  /* l == medianPos, partitioning done */
             return l; 
       }
       else return left;
    }
 
 
 
    static int coeffIdxCompare (const void *c1, const void *c2)
    /* Comparison function to sort thresholded coefficients by index */
+   /* TODO: scale coeffs by 2^-l to adapt to resolution/frequency l?
+      (this drops higher-frequency coeffs first). */  
    {
       const PreComputedContribCoeff *tcoeff1 = c1, *tcoeff2 = c2;
       const unsigned                v1 = tcoeff1 -> idx, 
                                     v2 = tcoeff2 -> idx;
 
       if (v1 < v2)
          return -1;
       else if (v1 > v2)
          return 1;
       else
          return 0;
    }
 
 
 
    static int thresholdContribs (PreComputedContrib *preCompContrib)
    /* Threshold wavelet detail coefficients in preCompContrib -> 
       waveletMatrix [approxDim..coeffDim-1] [approxDim..coeffDim-1] (where
       approxDim = WAVELET_PADD2_APPROXDIM) by keeping the (preCompContrib -> 
       nCompressedCoeffs) largest of these and returning them in
       preCompContrib -> compressedCoeffs along with their original matrix
       indices. 
       NOTE: The wavelet approximation coefficients 
       preCompContrib -> waveletMatrix [0..approxDim-1] [0..approxDim-1] 
       are excluded from thresholding to minimise compression artefacts.
       Returns 0 on success, else -1. */
    {
       unsigned                i, j, coeffDim, coeffIdx, nNzDetailCoeffs,
                               nCompressedCoeffs, numThresh;
       WaveletMatrix2          waveletMatrix;
       PreComputedContribCoeff *threshCoeffs, *threshCoeffPtr;
       WAVELET_COEFF           *coeffPtr;
       
       if (!preCompContrib || !(coeffDim = preCompContrib -> coeffDim) ||
          !(threshCoeffs = preCompContrib -> compressedCoeffs) ||
          !(waveletMatrix = preCompContrib -> waveletMatrix)
       )
          /* Should be initialised by caller! */
          return -1;
 
       /* Set up coefficient buffer for compression (thresholding), skipping
        * the approximation coefficients in the upper left of waveletMatrix,
        * which are not thresholded.  Also skip zero coefficients (resulting
        * from padding), since these are already implicitly thresholded.  The
        * original 2D matrix indices are linearised to 1D and saved with each
        * coefficient to restore the original sparse coefficient matrix.  */
       for (i = 0, threshCoeffPtr = threshCoeffs; i < coeffDim; i++) {
          /* Calc row pointer (=mult) in outer loop, inc in inner */
          coeffIdx = PMAP_CONTRIB_XY2LIN(coeffDim, i, 0);
          
          for (j = 0; j < coeffDim; j++, coeffIdx++) {
             if ((i >= WAVELET_PADD2_APPROXDIM || 
                   j >= WAVELET_PADD2_APPROXDIM
                ) && !coeffIsZero(waveletMatrix [i] [j])
             ) {
                /* Nonzero detail coefficient; set up pointer to coeff 
                   (sorts faster than 3 doubles) and save original
                   (linearised) matrix index */
                threshCoeffPtr -> idx = coeffIdx;
                threshCoeffPtr++ -> coeff = (WAVELET_COEFF*)&(
                   waveletMatrix [i] [j]
                );
             }
          }
       }
       
       /* Num of nonzero detail coeffs in buffer, number actually expected */
       numThresh         = threshCoeffPtr - threshCoeffs;
       nNzDetailCoeffs   = WAVELET_PADD2_NUMDETAIL(
          preCompContrib -> nNonZeroCoeffs
       );
       nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
       
       /* If fewer nonzero detail coeff are in the threshold buffer than
        * anticipated, the loop below fills the remainder of the threshold
        * buffer with duplicates of a coefficient in the lower right of the
        * matrix, which is padding and guaranteed to be zero.  This condition
        * can occur if the wavelet transform actually generated genuine zero
        * detail coefficients.  Infact it's quite common if the wavelet
        * transformed contributions are quite sparse.  */
       i = j = coeffDim - 1;
       coeffIdx = PMAP_CONTRIB_XY2LIN(coeffDim, i, j);
       for (coeffPtr = (WAVELET_COEFF*)(waveletMatrix [i][j]);
          numThresh < nNzDetailCoeffs; 
          numThresh++
       ) {
          threshCoeffPtr -> idx = coeffIdx;
          threshCoeffPtr++ -> coeff = coeffPtr;
       }
 
       /* Partition coeffs in reverse order, such that all coeffs in 
          threshCoeffs [0..nCompressedCoeffs-1] are larger than those in
          threshCoeffs [nCompressedCoeffs..nNzDetailCoeffs-1] */
       coeffPartition(threshCoeffs, nCompressedCoeffs - 1, 
          0, nNzDetailCoeffs - 1
       );
 
 #ifdef PMAP_CONTRIB_DBG
       /* Check coefficient partitioning */
       threshCoeffPtr = threshCoeffs + nCompressedCoeffs - 1;
       for (i = 0; i < nCompressedCoeffs; i++)
          if (COEFFVAL(threshCoeffs + i) < COEFFVAL(threshCoeffPtr))
             error(CONSISTENCY, "invalid wavelet coefficient partitioning "
                "in thresholdContribs()"
             );
       
       for (; i < nNzDetailCoeffs; i++)
          if (COEFFVAL(threshCoeffs + i) > COEFFVAL(threshCoeffPtr))
             error(CONSISTENCY, "invalid wavelet coefficient partitioning "
                "in thresholdContribs()"
             );
 #endif
       
       /* Sort partition containing nCompressedCoeffs coefficients by index
        * (ignoring the remaining coefficients since these are now dropped
        * due to tresholding) */
       qsort(threshCoeffs, nCompressedCoeffs,
          sizeof(PreComputedContribCoeff), coeffIdxCompare
       );
       
       return 0;
    }
 
 
 
    static int encodeContribs (PreComputedContrib *preCompContrib, 
       float compressRatio
    )
    /* Apply wavelet transform to input matrix preCompContrib ->
       waveletMatrix and compress according to compressRatio, storing
       thresholded and mRGBE-encoded coefficients in preCompContrib ->
       mrgbeCoeffs.  Note that the approximation coefficients in the upper
       left of the matrix are not encoded, and returned in 
       preCompContrib -> waveletMatrix
          [0..WAVELET_PADD2_APPROXDIM-1] [0..WAVELET_PADD2_APPROXDIM-1].
       Returns 0 on success.  */
    {
       unsigned                   i, j, k, scDim, lastCoeffIdx;
       WaveletMatrix2             waveletMatrix, tWaveletMatrix;
       PreComputedContribCoeff    *threshCoeffs;
       mRGBERange                 *mrgbeRange;
       mRGBE                      *mrgbeCoeffs;
       WAVELET_COEFF              absCoeff;
    #ifdef PMAP_CONTRIB_DBG
       WaveletCoeff3              decCoeff;
       unsigned                   decIdx;
    #endif 
       
       if (!preCompContrib || !preCompContrib -> mrgbeCoeffs ||
          !preCompContrib -> compressedCoeffs || 
          !(waveletMatrix = preCompContrib -> waveletMatrix) || 
          !(tWaveletMatrix = preCompContrib -> tWaveletMatrix) ||
          !(scDim = preCompContrib -> scDim)
       )
          /* Should be initialised by the caller! */
          return -1;
 
    #ifdef PMAP_CONTRIB_DBG
       for (i = 0; i < scDim; i++) {
          for (j = 0; j < scDim; j++) {
             for (k = 0; k < 3; k++) {
       #if 0
                /* Set contributions to bins for debugging */
                waveletMatrix [i] [j] [k] = PMAP_CONTRIB_VAL(
                   PMAP_CONTRIB_XY2LIN(scDim, i, j)
                );
       #elif 0
                /* Replace contribs with "bump" function */
                waveletMatrix [i] [j] [k] = PMAP_CONTRIB_VAL(
                   (1. - fabs(i - scDim/2. + 0.5) / (scDim/2. - 0.5)) *
                   (1. - fabs(j - scDim/2. + 0.5) / (scDim/2. - 0.5))
                );
       #elif 0
                /* Inject reference contribs from rcontrib classic */
                #include "rc-ref.c"
 
                if (preCompContrib -> nBins != PMAP_CONTRIB_REFSIZE) {
                   sprintf(errmsg, "reference contribs require %d bins",
                      PMAP_CONTRIB_REFSIZE
                   );
                   error(USER, errmsg);
                }
                waveletMatrix [i] [j] [k] = PMAP_CONTRIB_VAL(
                   refContrib [PMAP_CONTRIB_XY2LIN(scDim, i, j)] [k]
                );
       #endif
             }
 
       #if 0
             /* Dump contribs prior to encoding for debugging */
             printf("% 7.3g\t", colorAvg(waveletMatrix [i] [j]));
          }
          putchar('\n');
       }
       putchar('\n');
       #else
          }
       }
       #endif
    #endif
       
       /* Do 2D wavelet transform on preCompContrib -> waveletMatrix */
       if (padWaveletXform2(waveletMatrix, tWaveletMatrix, scDim, NULL) !=
          preCompContrib -> coeffDim
       )
          error(INTERNAL, "failed wavelet transform in encodeContribs()");
 
       /* Compress wavelet detail coeffs by thresholding */
       if (thresholdContribs(preCompContrib) < 0)
          error(INTERNAL, "failed wavelet compression in encodeContribs()");
       threshCoeffs = preCompContrib -> compressedCoeffs;
 
       /* Init per-channel coefficient range for mRGBE encoding */
       mrgbeRange = &preCompContrib -> mrgbeRange;
       setcolor(mrgbeRange -> min, FHUGE, FHUGE, FHUGE);
       setcolor(mrgbeRange -> max, 0, 0, 0);
 
       /* Update per-channel coefficient range */
       for (i = 0; i < preCompContrib -> nCompressedCoeffs; i++)
          for (k = 0; k < 3; k++) {
             absCoeff = fabs(threshCoeffs [i].coeff [k]);
             
             if (absCoeff < mrgbeRange -> min [k])
                mrgbeRange -> min [k] = absCoeff;
             if (absCoeff > mrgbeRange -> max [k])
                mrgbeRange -> max [k] = absCoeff;
          }
       
       if (preCompContrib -> nCompressedCoeffs == 1)
          /* Maximum compression with just 1 (!) compressed detail coeff (is
           * this even useful?), so mRGBE range is undefined since min & max
           * coincide; set minimum to 0, maximum to the single remaining
           * coefficient */
          setcolor(mrgbeRange -> min, 0, 0, 0);
       
       /* Init mRGBE coefficient normalisation from range */
       if (!mRGBEinit(mrgbeRange, mrgbeRange -> min, mrgbeRange -> max))
          return -1;
          
       mrgbeCoeffs = preCompContrib -> mrgbeCoeffs;
 
       /* Encode wavelet detail coefficients to mRGBE */
       for (i = lastCoeffIdx = 0; i < preCompContrib -> nCompressedCoeffs; 
          lastCoeffIdx = threshCoeffs [i++].idx
       ) {
          /* HACK: To reduce the likelihood of overflowing the mRGBE data
           * field with the coefficient index, it is expressed incrementally
           * w.r.t.  the previously encoded coefficient's index, instead of
           * absolutely.  This implies threshCoeffs must be sorted by
           * coefficient index to ensure increments are positive, and to
           * minimise their magnitude.
           * Note that an overflow cannot be predicted beforehand, e.g. by
           * mkpmap when parsing the number of bins, as this depends on the
           * sparseness of the wavelet coefficients (which in turn depends on
           * the frequency distribution of the binned contributions), and the
           * fraction of those that are dropped (i.e. the compression ratio).
           * */
          mrgbeCoeffs [i] = mRGBEencode(threshCoeffs [i].coeff, 
             mrgbeRange, threshCoeffs [i].idx - lastCoeffIdx
          );
          if (!mrgbeCoeffs [i].all)
             error(INTERNAL, "failed mRGBE encoding in encodeContribs()");
    #ifdef PMAP_CONTRIB_DBG
          /* mRGBE encoding sanity check */
          decIdx = mRGBEdecode(mrgbeCoeffs [i], mrgbeRange, decCoeff);
          mrgbeDiffs += sqrt(dist2(decCoeff, threshCoeffs [i].coeff));
          mrgbeDiffCnt++;         
          
          if (decIdx > preCompContrib -> nCompressedCoeffs ||
             decIdx != threshCoeffs [i].idx - lastCoeffIdx ||
             sqrt(dist2(decCoeff, threshCoeffs [i].coeff)) >
                0.1 * colorAvg(mrgbeRange -> max)
          ) {
             sprintf(errmsg, "failed sanity check in encodeContribs()\n"
                "Encoded: [%.3g %.3g %.3g %d]\nDecoded: [%.3g %.3g %.3g %d]",
                threshCoeffs [i].coeff [0], threshCoeffs [i].coeff [1],
                threshCoeffs [i].coeff [2], threshCoeffs [i].idx - lastCoeffIdx,
                decCoeff [0], decCoeff [1], decCoeff [2], decIdx
             );
             error(CONSISTENCY, errmsg);
          }
          
          for (k = 0; k < 3; k++) {
             absCoeff = fabs(threshCoeffs [i].coeff [k]) > 1e-6
                ? decCoeff [k] / threshCoeffs [i].coeff [k] : 0;
             if (absCoeff < 0)
                error(CONSISTENCY, 
                   "coefficient sign reversal in encodeContribs()"
                );
          }
    #endif
       }
          
       return 0;
    }
 
 
 
    static MODCONT *getPhotonContrib (PhotonMap *pmap, RAY *ray, COLOR irrad)
    /* Locate photons near ray -> rop which originated from the light source
       modifier indexed by ray -> rsrc, and accumulate their contributions 
       in pmap -> srcContrib. Return total contributions in irrad and
       pointer to binned contributions (or NULL if none were found). */
    {
       const OBJREC            *srcMod = findmaterial(source [ray->rsrc].so);
       MODCONT                 *modCont = (MODCONT*)lu_find(
                                  pmapContribTab, srcMod -> oname
                               ) -> data;
       Photon                  *photon;
       COLOR                   flux;
       PhotonSearchQueueNode   *sqn;
       float                   r2, norm;
       unsigned                i, bin;
       
       setcolor(irrad, 0, 0, 0);
       if (!modCont)
          /* Modifier not in LUT */
          return NULL;
          
       /* Zero bins for this source modifier */     
       memset(modCont -> cbin, 0, sizeof(DCOLOR) * modCont -> nbins);
       if (!pmap -> maxGather) 
          /* Zero bandwidth */
          return NULL;
 
       /* Lookup photons; pass light source index to filter in findPhotons()
          via pmap -> lastContribSrc */ 
       pmap -> lastContribSrc.srcIdx = ray -> rsrc;
       findPhotons(pmap, ray);
       
       /* Need at least 2 photons */
       if (pmap -> squeue.tail < 2) {
    #ifdef PMAP_NONEFOUND
          sprintf(errmsg, "no photons found on %s at (%.3f, %.3f, %.3f)", 
             ray -> ro ? ray -> ro -> oname : "<null>",
             ray -> rop [0], ray -> rop [1], ray -> rop [2]
          );
          error(WARNING, errmsg);
    #endif
          return NULL;
       }
    
       /* Avg radius^2 between furthest two photons to improve accuracy */ 
       sqn = pmap -> squeue.node + 1;
       r2 = max(sqn -> dist2, (sqn + 1) -> dist2);
       r2 = 0.25 * (pmap -> maxDist2 + r2 + 2 * sqrt(pmap -> maxDist2 * r2));
 
    #ifdef PMAP_EPANECHNIKOV
       /* Normalise accumulated flux by Epanechnikov kernel integral in 2D
          (see Eq. 4.4, p.76 in Silverman, "Density Estimation for 
          Statistics and Data Analysis", 1st Ed., 1986, and
          Wann Jensen, "Realistic Image Synthesis using Photon Mapping"),
          include RADIANCE-specific lambertian factor PI */
       norm = 2 / (PI * PI * r2);
    #else
       /* Normalise accumulated flux by search area PI * r^2, including
          RADIANCE-specific lambertian factor PI */
       norm = 1 / (PI * PI * r2);
    #endif 
 
       /* Skip the extra photon */
       for (i = 1; i < pmap -> squeue.tail; i++, sqn++) {
          /* Get photon's contribution to density estimate */
          photon = getNearestPhoton(&pmap -> squeue, sqn -> idx);
          getPhotonFlux(photon, flux);
          scalecolor(flux, norm);
    #ifdef PMAP_EPANECHNIKOV
          /* Apply Epanechnikov kernel to photon flux based on distance */
          scalecolor(flux, 1 - sqn -> dist2 / r2);
    #endif
          addcolor(irrad, flux);
          addcolor(modCont -> cbin [photonSrcBin(pmap, photon)], flux);
       }
 
       return modCont;
    }
  
    
    
    void freePreCompContribNode (void *p)
    /* Clean up precomputed contribution LUT entry */   
    {
       PhotonMap            *preCompContribPmap = (PhotonMap*)p;
       PreComputedContrib   *preCompContrib;
       
       if (preCompContribPmap) {
          preCompContrib = (PreComputedContrib*)(
             preCompContribPmap -> preCompContrib
          );
          
          if (preCompContrib) {
             /* Free primary and transposed wavelet matrices */
             freeWaveletMatrix2(preCompContrib -> waveletMatrix,
                preCompContrib -> coeffDim
             );
             freeWaveletMatrix2(preCompContrib -> tWaveletMatrix, 
                preCompContrib -> coeffDim
             );
          
             /* Free thresholded coefficients */
             free(preCompContrib -> compressedCoeffs);
          
             /* Free mRGBE encoded coefficients */
             free(preCompContrib -> mrgbeCoeffs);
             free(preCompContrib -> waveletFname);
             
             if (preCompContrib -> cache) {
                /* Free contribution cache */
                OOC_DeleteCache(preCompContrib -> cache);
                free(preCompContrib -> cache);
             }
    #if NIX   
             if (preCompContrib -> numPhotonsShm) {
                /* Release shared memory for photon counter */
                munmap(preCompContrib -> numPhotonsShm, sizeof(unsigned long));
                close(preCompContrib -> shmFile);
                unlink(preCompContrib -> shmFname);
             }
    #endif
          }
          
          /* Clean up precomputed contrib photon map */
          deletePhotons(preCompContribPmap);
          free(preCompContribPmap);
       }
    }
 
 
 
    void initPreComputedContrib (PreComputedContrib *preCompContrib)
    /* Initialise precomputed contribution container in photon map */
    {
       preCompContrib -> waveletFname = NULL;
       preCompContrib -> waveletFile = NULL;
       preCompContrib -> waveletMatrix = 
          preCompContrib -> tWaveletMatrix = NULL;
       preCompContrib -> compressedCoeffs = NULL;
       setcolor(preCompContrib -> mrgbeRange.min, 0, 0, 0);
       setcolor(preCompContrib -> mrgbeRange.max, 0, 0, 0);
       setcolor(preCompContrib -> mrgbeRange.norm, 0, 0, 0);
       preCompContrib -> mrgbeCoeffs = NULL;
       preCompContrib -> scDim = 
          preCompContrib -> nBins = 
          preCompContrib -> coeffDim = 
          preCompContrib -> nCoeffs = 
          preCompContrib -> nNonZeroCoeffs = 
          preCompContrib -> nCompressedCoeffs = 
          preCompContrib -> contribSize = 0;
       preCompContrib -> cache = NULL;
       preCompContrib -> numPhotonsShm = NULL;
       preCompContrib -> shmFile = -1;
    }
 
 
 
    static int allocPreCompContribPmap (const LUENT *modContNode, void *p) 
    /* Allocate & init child contrib photon map for modContNode's modifier
     * and its contributions for parent photon map p.  */
    {
       unsigned             scDim, nBins, coeffDim, nCoeffs, 
                            nCompressedCoeffs, nNzDetailCoeffs;
       PhotonMap            *preCompContribPmap, *parentPmap = (PhotonMap*)p;
       LUENT                *preCompContribNode;
       PreComputedContrib   *preCompContrib;
       const MODCONT        *modCont = (MODCONT*)modContNode -> data;
 
       if (!modCont || !parentPmap)
          return -1;
       
       preCompContribNode = lu_find(parentPmap -> preCompContribTab,
          modCont -> modname
       );
       if (!preCompContribNode)
          error(SYSTEM, "out of memory allocating LUT entry in "
             "allocPreCompContribPmap()"
          );
       
       preCompContribNode -> key = (char*)modCont -> modname;
       preCompContribNode -> data = (char*)(preCompContribPmap =
          malloc(sizeof(PhotonMap))
       );
       if (!preCompContribPmap)
          error(SYSTEM, "out of memory allocating photon map "
             "in allocPreCompContribPmap()"
          );
       initPhotonMap(preCompContribPmap, PMAP_TYPE_CONTRIB_CHILD);
       
       /* Set output filename from parent photon map */
       preCompContribPmap -> fileName = parentPmap -> fileName;
       /* Set contrib/coeff mode from parent photon map */
       preCompContribPmap -> contribMode = parentPmap -> contribMode;
       
       preCompContrib = (PreComputedContrib*)(
          preCompContribPmap -> preCompContrib = 
             malloc(sizeof(PreComputedContrib))
       );
       if (!preCompContrib)
          error(SYSTEM, "out of memory allocating contributions "
             "in allocPreCompContribPmap()"
          );
       initPreComputedContrib(preCompContrib);
       
    #if NIX      
       /* Allocate and map shared memory for photon counter */
       strcpy(preCompContrib -> shmFname, PMAP_TMPFNAME);
       preCompContrib -> shmFile = mkstemp(preCompContrib -> shmFname);
       if (preCompContrib -> shmFile < 0 || 
          ftruncate(preCompContrib -> shmFile, sizeof(unsigned long)) < 0
       )
          error(SYSTEM, 
             "failed opening shared memory file in allocPreCompContribPmap()"
          );
 
       preCompContrib -> numPhotonsShm = mmap(NULL, sizeof(unsigned long),
          PROT_READ | PROT_WRITE, MAP_SHARED, preCompContrib -> shmFile, 0
       );
       if (preCompContrib -> numPhotonsShm == MAP_FAILED)
          error(SYSTEM, 
             "failed mapping shared memory in allocPreCompContribPmap()"
          );
    #endif
       
       /* Set Shirley-Chiu square & wavelet matrix dimensions
          (number of bins resp. coefficients). */
       preCompContrib -> scDim = scDim = sqrt(
          preCompContrib -> nBins = nBins = modCont -> nbins
       );
 
       if (scDim * scDim != nBins)
          /* Shouldn't happen; mkpmap took care of dis */
          error(INTERNAL, "contribution bin count not integer square "
             "in allocPreCompContribPmap()"
          );
       
       /* Allocate and initialise photon and contribution heaps */
       initPhotonHeap(preCompContribPmap);
       
       if (nBins > 1) {
          /* BINNING ENABLED; set up wavelet xform & compression.
             Get dimensions of wavelet coefficient matrix after
             boundary padding, and number of nonzero coeffs.  The
             number of compressed (detail) coeffs is fixed and
             based on the number of NONZERO coeffs (minus approx,
             see NOTE below) since zero coeffs are considered
             thresholded by default.
          
             !!! NOTE: THE APPROX COEFFS IN THE UPPER LEFT OF
             !!! THE MATRIX ARE _NEVER_ THRESHOLDED TO MINIMISE
             !!! COMPRESSION ARTEFACTS!  THEY ARE ESSENTIAL FOR
             !!! RECONSTRUCTING THE ORIGINAL CONTRIBS.  */
          preCompContrib -> coeffDim = coeffDim = padWaveletXform2(
             NULL, NULL, scDim, &preCompContrib -> nNonZeroCoeffs 
          );
          preCompContrib -> nCoeffs = nCoeffs = coeffDim * coeffDim;
          nNzDetailCoeffs = WAVELET_PADD2_NUMDETAIL(
             preCompContrib -> nNonZeroCoeffs
          );
          
          nCompressedCoeffs = (1 - compressRatio) * nNzDetailCoeffs;
          if (nCompressedCoeffs < 1) {
             error(WARNING, "maximum contribution compression, "
                "clamping number of wavelet coefficients in "
                "allocPreCompContribPmap()"
             );
             nCompressedCoeffs = 1;
          }
          if (nCompressedCoeffs >= preCompContrib -> nCoeffs) {
             error(WARNING, "minimum contribution compression, "
                "clamping number of wavelet coefficients in "
                "allocPreCompContribPmap()"
             );
             nCompressedCoeffs = nNzDetailCoeffs;
          } 
          preCompContrib -> nCompressedCoeffs = nCompressedCoeffs;
 
          /* Lazily allocate primary & transposed wavelet coeff matrix */
          preCompContrib -> waveletMatrix  = allocWaveletMatrix2(coeffDim);
          preCompContrib -> tWaveletMatrix = allocWaveletMatrix2(coeffDim);
          if (!preCompContrib -> waveletMatrix || 
             !preCompContrib -> tWaveletMatrix
          )
             error(SYSTEM, "out of memory allocating wavelet coefficient "
                "matrix in allocPreCompContribPmap()"
             );
       
          /* Lazily allocate thresholded detail coefficients */
          preCompContrib -> compressedCoeffs = calloc(nNzDetailCoeffs,
             sizeof(PreComputedContribCoeff)
          );
       
          /* Lazily alloc mRGBE-encoded compressed wavelet coeffs */
          preCompContrib -> mrgbeCoeffs = calloc(nCompressedCoeffs, 
             sizeof(mRGBE)
          );
          if (!preCompContrib -> compressedCoeffs || 
             !preCompContrib -> mrgbeCoeffs
          )
             error(SYSTEM, "out of memory allocating compressed "
                "contribution buffer in allocPreCompContribPmap()"
             );
       
          /* Set size of compressed contributions, in bytes */
          preCompContrib -> contribSize = PMAP_CONTRIB_ENCSIZE(
             nCompressedCoeffs
          );
       }
    }
 
 
 
    static int sumNumPhotons (const LUENT *preCompContribNode, void *p)
    /* Set number of photons for this modifier's child contrib photon map
     * from the corresponding shared photon counter, and sum over all
     * modifiers in (unsigned long*)p */
    {
       unsigned long              *numPhotons = (unsigned long*)p;
       PhotonMap                  *preCompContribPmap = (PhotonMap*)(
                                     preCompContribNode -> data
                                  );
       const PreComputedContrib   *preCompContrib;
       
       if (!preCompContribPmap)
          return 0;
       else preCompContrib = (PreComputedContrib*)(
          preCompContribPmap -> preCompContrib
       );
       
    #if NIX
       /* Lock shared photon counter file for reading */
       shmLock(preCompContrib -> shmFile, F_RDLCK);
    #endif
       *numPhotons += (preCompContribPmap -> numPhotons = 
          *preCompContrib -> numPhotonsShm
       );
    #if NIX
       /* Release lock on shard photon counter file */
       shmLock(preCompContrib -> shmFile, F_UNLCK);
    #endif
    
       return 0;
    }
 
 
 
    void preComputeContrib (PhotonMap *pmap, unsigned numProc)
    /* Precompute contributions for a random subset of (finalGather * 
       pmap -> numPhotons) photons, and return the per-modifier precomputed
       contribution photon maps in LUT preCompContribTab, discarding the
       original photons.  */
    {
       unsigned long        p, numPreComp;
       unsigned             proc;
       int                  stat;
       pid_t                pid, procPids [PMAP_MAXPROC];
       PhotonMap            nuPmap;
       LUTAB                lutInit = LU_SINIT(NULL, freePreCompContribNode);
       
       /* Init new parent photon map and set output filename */
       initPhotonMap(&nuPmap, pmap -> type);
       nuPmap.fileName = pmap -> fileName;
       
       /* Set contrib/coeff mode in new parent photon map */
       nuPmap.contribMode = pmap -> contribMode;
       
       /* Allocate and init LUT containing per-modifier child photon maps */
       if (!(nuPmap.preCompContribTab = malloc(sizeof(LUTAB))))
          error(SYSTEM, 
             "out of memory allocating LUT in preComputeContrib()"
          );
       memcpy(nuPmap.preCompContribTab, &lutInit, sizeof(LUTAB));
       
       /* Allocate and init per-modifier child photon maps */
       lu_doall(pmapContribTab, allocPreCompContribPmap, &nuPmap);
       
       /* Record start time, baby */
       repStartTime = repLastTime = time(NULL);
       repComplete = numPreComp = finalGather * pmap -> numPhotons;
       repProgress = 0;
 
       if (verbose) {
          sprintf(errmsg, 
             "\nPrecomputing contributions for %ld photons\n", numPreComp
          );
          eputs(errmsg);
    #if NIX
          fflush(stderr);
    #endif
       }
 
       for (proc = 0; proc < numProc; proc++) {
    #if NIX
          if (!(pid = fork())) {
             /* SUBPROCESS ENTERS HERE; opened and mmapped files inherited */
    #else
          if (1) {
             /* No subprocess under Windoze */
    #endif
             int                  photonOk;
             unsigned             i, j, k, coeffIdx, scDim, nBins, coeffDim, 
                                  nCoeffs, nCompressedCoeffs;
             const double         pInc = finalGather > FTINY ? 1 / finalGather : 0,
                                  numPhotonsPerProc = 
                                     (double)pmap -> numPhotons / numProc,
                                  numPreCompPerProc = 
                                     (double)numPreComp / numProc;
             PhotonIdx            pIdx;
             Photon               photon;
             MODCONT              *modCont;
             LUENT                *preCompContribNode;
             PhotonMap            *preCompContribPmap;
             PreComputedContrib   *preCompContrib;
             WaveletMatrix2       waveletMatrix;
             WaveletCoeff3        *coeffPtr;
             RAY                  ray;
             unsigned long        lastpIdx = 0;
             
             photonRay(NULL, &ray, PRIMARY, NULL);
             ray.ro = NULL;
             /* Decorrelate subprocess RNGs */
             pmapSeed(randSeed + proc, pmap -> randState);
 
    #ifdef PMAP_CONTRIB_DBG
             /* Output child process PID after random delay to prevent
              * scrambling console output with sibling subprocs */
             usleep(1e6 * pmapRandom(rouletteState));
             sprintf(errmsg, "*DBG* Proc %d: PID = %d "
                "(waiting 10 sec to attach DEBUGGA...)\n",
                proc, getpid()
             );
             eputs(errmsg);
             /* Give debugga time to attach to subproc */
             sleep(10);
    #endif
 
             for (p = 0; p < (unsigned long)numPreCompPerProc; p++) {
                do {
                   /* Get random photon from stratified distribution in
                    * source heap to avoid duplicates and clustering. */
                   pIdx = firstPhoton(pmap) + (unsigned long)(p * pInc + 
                      pmapRandom(pmap -> randState) * max(pInc - 1, 0) +
                      proc * numPhotonsPerProc
                   );
                   getPhoton(pmap, pIdx, &photon);
    #if PMAP_CONTRIB_DBG
                   if (pIdx == lastpIdx) {
                      sprintf(errmsg, "duplicate contrib photon "
                         "(proc %d, pIdx %d)", proc, pIdx
                      );
                      error(WARNING, errmsg);
                   }
                   lastpIdx = pIdx;
    #endif       
                   /* Init dummy photon ray with intersection and normal at
                    * photon position.  Set emitting light source index from
                    * origin.  */                   
                   VCOPY(ray.rop, photon.pos);
                   ray.rsrc = photonSrcIdx(pmap, &photon);
                   for (i = 0; i < 3; i++)
                      ray.ron [i] = photon.norm [i] / 127.0;
 
                   /* Get contributions at photon position; retry with
                      another photon if no contribs found */
                   modCont = getPhotonContrib(pmap, &ray, ray.rcol);
                } while (!modCont);
                
                /* Get preallocated LUT node for this modifier */
                preCompContribNode = lu_find(nuPmap.preCompContribTab,
                   modCont -> modname
                );
                
                if (!preCompContribNode || !preCompContribNode -> key ||
                   !preCompContribNode -> data
                )
                   error(CONSISTENCY, 
                      "missing LUT entry in preComputeContrib()"
                   );
                
                preCompContribPmap = (PhotonMap*)preCompContribNode -> data;
                preCompContrib = (PreComputedContrib*)(
                   preCompContribPmap -> preCompContrib
                );
                scDim             = preCompContrib -> scDim;
                nBins             = preCompContrib -> nBins;
                nCoeffs           = preCompContrib -> nCoeffs;
                nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
                waveletMatrix     = preCompContrib -> waveletMatrix;
 
                if (nBins > 1) {  /* Binning enabled */
                   char  contribBuf0 [preCompContrib -> contribSize],
                         *contribBuf;
                   COLR  *rgbe32 = (COLR*)contribBuf0;
                   
                   /* copy binned contribs to wavelet
                    * input matrix before compressing and encoding */
                   for (i = 0; i < scDim; i++) {
                      /* Calculate row pointer (=multiplication) in outer
                       * loop, increment in inner loop */
                      coeffPtr = modCont -> cbin + 
                         PMAP_CONTRIB_XY2LIN(scDim, i, 0);
                      
    #ifdef PMAP_CONTRIB_LOG
                      /* Logarithmic contribs; copy & apply per element */
                      for (j = 0; j < scDim; j++, coeffPtr++)
                         for (k = 0; k < 3; k++)
                            waveletMatrix [i] [j] [k] = 
                               PMAP_CONTRIB_VAL((*coeffPtr) [k]);
    #else
                      /* Linear contribs; copy matrix rows */
                      memcpy(waveletMatrix [i], *coeffPtr, 
                         scDim * WAVELET_COEFFSIZE
                      );
    #endif
                   }
                
                   /* Compress and encode contribs */
                   if (encodeContribs(preCompContrib, compressRatio))
                      error(INTERNAL, "failed contribution "
                         "compression/encoding in preComputeContrib()"
                      );
 
                   /* Encode wavelet approx coeffs in the upper left of
                    * the wavelet matrix to 32-bit RGBE, and buffer them.
                    * These coeffs are not thresholded to minimise
                    * compression artefacts.  */
                   for (i = 0; i < WAVELET_PADD2_APPROXDIM; i++) {
                      for (j = 0; j < WAVELET_PADD2_APPROXDIM; j++, rgbe32++) {
                         setcolr(*rgbe32,
                            fabs(waveletMatrix [i] [j] [0]), 
                            fabs(waveletMatrix [i] [j] [1]), 
                            fabs(waveletMatrix [i] [j] [2])
                         );
                   
                         /* HACK: depending on the boundary extension mode,
                          * some approx coeffs can be NEGATIVE (!), which
                          * 32-bit RGBE doesn't accommodate.  To get around
                          * this, we sacrifice a bit in each mantissa for the
                          * sign.  */
                         for (k = 0; k < 3; k++)
                            (*rgbe32) [k] = PMAP_CONTRIB_SET_RGBE32_SGN(
                               (*rgbe32) [k], waveletMatrix [i] [j] [k]
                            );
                      }
                   }
 
                   /* Encode wavelet detail coeff range to 32-bit RGBE, and
                    * buffer it after approx coeffs. 
                    * NOTE: mRGBE range maximum is buffered first to
                    * facilitate handling the special (but pointless) case of
                    * a single compressed detail coeff; if so, the mRGBE
                    * minimum is omitted, since implicitly zero. */
                   setcolr(*(rgbe32++), 
                      preCompContrib -> mrgbeRange.max [0],
                      preCompContrib -> mrgbeRange.max [1],
                      preCompContrib -> mrgbeRange.max [2]
                   );
                   if (nCompressedCoeffs > 1)
                      setcolr(*(rgbe32++), 
                         preCompContrib -> mrgbeRange.min [0],
                         preCompContrib -> mrgbeRange.min [1], 
                         preCompContrib -> mrgbeRange.min [2]
                      );
                   contribBuf = (char*)rgbe32;
 
                   /* Now buffa mRGBE encoded, compressed detail coeffs.
                    * NOTE: we can't copy to preCompContribPmap ->
                    * contribHeapBuf directly because it isn't allocated
                    * until the 1st call to newPhoton() below. IMHO this is
                    * cleaner and more consistent with the existing code than
                    * allocating it here. */
                   memcpy(contribBuf, preCompContrib -> mrgbeCoeffs, 
                      nCompressedCoeffs * sizeof(mRGBE)
                   );
                   
                   /* Append precomputed photon to heap from ray, check if 
                    * photon was accepted (i.e. newPhoton() returned 0).
                    * Note the locally buffered encoded contribs are passed
                    * as an extra parameter. */
                   ray.rsrc = proc;
                   photonOk = !newPhoton(preCompContribPmap, &ray, 
                      contribBuf0
                   );
                }
                else {
                   /* Binning disabled; append precomputed photon to heap
                    * from ray _without_ encoded contribs, check if photon
                    * was accepted (i.e. newPhoton() returned 0). */
                   photonOk = !newPhoton(preCompContribPmap, &ray, NULL);
                }
                
                /* Increment parent photon map's photon counter if photon was
                 * accepted by newPhoton(). */
                nuPmap.numPhotons += photonOk;
    #if NIX
                /* Asynchronously increment shared photon counter for this
                 * modifier if photon was accepted by newPhoton().
                 * NOTE: write lock on memmapped file! */
                if (photonOk) {
                   shmLock(preCompContrib -> shmFile, F_WRLCK);
                   *preCompContrib -> numPhotonsShm += photonOk;
                   shmLock(preCompContrib -> shmFile, F_UNLCK);
                }
    #else
                /* Synchronously report progress on W33nD0z */
                if (!proc && photonRepTime > 0 && 
                   time(NULL) >= repLastTime + photonRepTime
                ) {
                   repProgress = nuPmap.numPhotons;
                   pmapPreCompReport();
                }
    #endif
             } /* End precomp. photon loop */
 
             /* Flush heap buffa one final time to prevent data corruption */
             flushPhotonHeap(preCompContribPmap);
    #if NIX
       #ifdef PMAP_CONTRIB_DBG
             sprintf(errmsg,
                "*DBG* preComputeContrib: Avg mRGBE encoding diff = %g\n",
                mrgbeDiffs / mrgbeDiffCnt
             );
             eputs(errmsg);
             sprintf(errmsg, "*DBG* Subproc %d exiting normally\n", proc);
             eputs(errmsg);
       #endif
             /* Terminate subprocess normally */
             exit(0);
    #endif
          } /* if (!fork()) */
          else {
             /* PARENT PROCESS CONTINUES FORKING LOOP HERE */
             if (pid < 0)
                error(SYSTEM, 
                   "failed to fork subprocess in preComputeContrib()"
                );
             else
                /* Save child process ID */
                procPids [proc] = pid; 
          }
       } /* End forking loop */
 
    #if NIX
       /* PARENT PROCESS CONTINUES HERE AFTER FORKING ALL SUBPROCS */
       #ifdef SIGCONT
          /* Enable progress report signal handler */
          signal(SIGCONT, pmapPreCompReport);
       #endif
       /* Wait for subprocesses to complete while reporting progress */
       proc = numProc;
       while (proc) {
          while (waitpid(-1, &stat, WNOHANG) > 0) {
             /* Subprocess exited; check status */
             if (!WIFEXITED(stat) || WEXITSTATUS(stat)) {
                /* Exited with error; terminate siblings, clean up & bail out */
                for (proc = 0; proc < numProc; proc++)
                   kill(procPids [proc], SIGKILL);
                
                cleanUpPmaps(photonMaps);
                error(USER, "failed precomputing contribution photons");
             }
 
             --proc;
          }
 
          /* Nod off for a bit and update progress  */
          sleep(1);
          
          /* Update num photons from shared photon counters for all mods */
          nuPmap.numPhotons = 0;
          lu_doall(nuPmap.preCompContribTab, sumNumPhotons,
             &nuPmap.numPhotons
          );
          repProgress = nuPmap.numPhotons;
 
          if (photonRepTime > 0 && time(NULL) >= repLastTime + photonRepTime)
             pmapPreCompReport();
       #ifdef SIGCONT
          else signal(SIGCONT, pmapPreCompReport);
       #endif
       }
    #endif /* NIX */
 
    #ifdef SIGCONT
       /* Reset signal handler */
       signal(SIGCONT, SIG_DFL);
    #endif
 
       /* Trash original pmap and its leaf file, and replace with new parent,
          which now acts as container for the per-modifier child pmaps */
       unlink(pmap -> store.leafFname);
       deletePhotons(pmap);
       memcpy(pmap, &nuPmap, sizeof(PhotonMap));
 
       /* Bail out if no photons could be precomputed */
       if (!pmap -> numPhotons)
          error(USER, 
             "no contribution photons precomputed; try increasing -am"
          );
 
       /* !!! NOTE: buildPreCompContribPmap() bails out if any child contrib
        * !!! pmaps are empty -- should this trigger a warning instead? */
                
       /* Build per-modifier precomputed photon maps from their contribution
          heaps */
       lu_doall(pmap -> preCompContribTab, buildPreCompContribPmap, NULL);
    }
 
 
 
 #else
    /* -------------------------------------------------------------------
       U N I T   T E S T S
       ------------------------------------------------------------------- */
 
    #include <stdio.h>
    #include <random.h>
    #include "func.h"
    
    int main (int argc, char *argv [])
    {
       unsigned       i, scdim, nsamp, numTheta = 0, numPhi = 0;
       double         t, p;
       RAY            ray;
       int            bin;
 
       if (argc < 3) {
          fprintf(stderr, "%s <scdim> <nsamp> [<var>=<value>; ..]\n", 
             argv [0]
          );
          fprintf(stderr, "Default variable defs: %s\n", 
             PMAP_CONTRIB_SCDEFAULTS
          );
          fputs("\nMissing Shirley-Chiu dimension scDim>1, "
             "number of samples nsamp\n",
             stderr
          );
          return -1;
       }
       
       if (!(scdim = atoi(argv [1]))) {
          fputs("Invalid Shirley-Chiu dimension\n", stderr);
          return -1;
       }
       
       if (!(nsamp = atoi(argv [2]))) {
          fputs("Invalid num samples\n", stderr);
          return -1;
       }
       else {
          numTheta = (int)(sqrt(nsamp) / 2);
          numPhi = 4* numTheta;
       }
       
       /* (Doan' need to) Init cal func routines for binning */
    #if 0
       initfunc();
       setcontext(RCCONTEXT);
    #endif
    
       /* Compile default orientation variables for contribution binning */
       scompile(PMAP_CONTRIB_SCDEFAULTS, NULL, 0);
       /* Compile custom orientation variabls from command line */
       for (i = 3; i < argc; i++)
          scompile(argv [i], NULL, 0);
       
       for (i = 0; i < nsamp; i++) {
          #if 0
             /* Random */
             t = 0.5 * PI * drand48();
             p = 2 * PI * drand48();
          #else
             /* Stratified */
             t = 0.5 * PI * ((i % numTheta) + drand48()) / numTheta;
             p = 2.0 * PI * ((i / numTheta) + drand48()) / numPhi;
          #endif
          ray.rdir [0] = sin(t) * cos(p);
          ray.rdir [1] = sin(t) * sin(p);
          ray.rdir [2] = cos(t);
          bin = ray2bin(&ray, scdim);
          
          printf("%.3f\t%.3f\t%.3f\t-->\t%d\n", 
             ray.rdir [0], ray.rdir [1], ray.rdir [2], bin
          );
       }
       
       return 0;
    }
 #endif
 
 #endif /* PMAP_CONTRIB */
diff --git a/pmapcontrib.h b/pmapcontrib.h
index aef9284..4d93589 100644
--- a/pmapcontrib.h
+++ b/pmapcontrib.h
@@ -1,241 +1,243 @@
 /* RCSid $Id: pmapcontrib.h,v 2.5 2016/05/17 17:39:47 rschregle Exp $ */
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions; these
    routines interface to mkpmap and rcontrib.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id: pmapcontrib.h,v 2.5 2016/05/17 17:39:47 rschregle Exp $
 */
 
 
 
 #ifndef _PMAPCONTRIB_H
    #define _PMAPCONTRIB_H
    
    #if defined(PMAP_OOC) && !defined(PMAP_CONTRIB)
       #define PMAP_CONTRIB
       
       #include "pmapdata.h"
       #include "wavelet2.h"
       #include "mrgbe.h"
       #ifndef MAXPROCESS
          #include "rcontrib.h"
       #endif
       
       #ifndef MAXMODLIST 
          /* Max number of contributing sources */
          #define MAXMODLIST                  1024 
       #endif
       
       /* Filename templates for per-modifier contrib photon maps and wavelet
          coeffs; these are held in a separate subdirectory PMAP_CONTRIB_DIR */
       #define PMAP_CONTRIB_DIR               "%s.rc"
       #define PMAP_CONTRIB_FILE              "%s/%s.pm"
       #define PMAP_CONTRIB_WAVELETFILE       "%s/%s.wvt"
       
-      /* Precompute logarithmic contributions */
+      /* Precompute logarithmic contributions
+         !!! WARNING: DO NOT USE WITH WAVELET_EXTEND_GRAD[1,2], as this
+         !!! assumes _linear_ input to the wavelet transform!!! */
       #define PMAP_CONTRIB_LOG
 
       /* Contrib value based on above (log or linear) */
       #ifdef PMAP_CONTRIB_LOG
          /* TODO: HOW DO WE RELIABLY HANDLE ZEROS IN LOG MODE ??? :^o */
          #define PMAP_CONTRIB_VAL(c)         ((c) > FTINY ? log(c) : 0)
          #define PMAP_CONTRIB_IVAL(c)        (fabs(c) > FTINY ? exp(c) : 0)
       #else
          #define PMAP_CONTRIB_VAL(c)         (c)
          #define PMAP_CONTRIB_IVAL(c)        (c)
       #endif    
       
       /* The following variables can be specified to override the orientation
          of the Shirley-Chiu mapping (see also disk2square.cal).
          We use the built-in functions in disk2square.c for efficiency and
          in order to not depend on an external function file. These variables
          merely mimic the function file interace.
 
          RHS            : right-hand-coordinate system (-1 for left-hand)
          rNx, rNy, rNz  : surface normal
          Ux, Uy, Uz     : up vector (defines phi = 0) */
       #define PMAP_CONTRIB_SCRHS             "RHS"
       #define PMAP_CONTRIB_SCNORMX           "rNx"
       #define PMAP_CONTRIB_SCNORMY           "rNy"
       #define PMAP_CONTRIB_SCNORMZ           "rNz"
       #define PMAP_CONTRIB_SCUPX             "Ux"
       #define PMAP_CONTRIB_SCUPY             "Uy"
       #define PMAP_CONTRIB_SCUPZ             "Uz"
       #define PMAP_CONTRIB_SCDEFAULTS        ( \
          "RHS=1; rNx=0; rNy=0; rNz=1; Ux=0; Uy=1; Uz=0;" \
       )
       
       /* Maximum number of mRGBE encodable wavelet coefficients */
       #define PMAP_CONTRIB_MAXCOEFFS         (mRGBE_DATAMAX + 1)
       
       /* Size of a set of encoded coefficients in wavelet file (in bytes) per
        * photon.  This is a function of the number of (mRGBE encoded)
        * compressed detail coefficients nComp, their mRGBE range, and the
        * number of (uncompressed) approximation coefficients.      
        * NOTE: the coefficient range minimum is omitted from the mRGBE range if
        * nComp == 1, since it is implicitly zero!  */
       #define PMAP_CONTRIB_ENCSIZE(nComp)    ((nComp) * sizeof(mRGBE) + \
          (WAVELET_PADD2_NUMAPPROX + 1 + ((nComp) > 1)) * sizeof(COLR) \
       )
 
       /* Serialise 2D coordinates (x,y) in (dim x dim) Shirley-Chiu
          square to 1D index.  Returns -1 if coordinates invalid */
       #define PMAP_CONTRIB_XY2LIN(dim, x, y) ( \
          (x) < 0 || (y) < 0 ? -1 : (x) * (dim) + (y) \
       )
 
       /* Deserialise 1D index idx to 2D coordinates (x,y) in (dim x dim)
          Shirley-Chiu square.  Returns -1 if bin invalid */
       #define PMAP_CONTRIB_LIN2X(dim, idx)   ((idx) < 0 ? -1 : (idx) / (dim))
       #define PMAP_CONTRIB_LIN2Y(dim, idx)   ((idx) < 0 ? -1 : (idx) % (dim))
       
       /* HACK: Get/set sign of f by (ab)using MSB in char */
       #define PMAP_CONTRIB_SET_RGBE32_SGN(c, f) ((c) & 0xfe | ((f) < 0))
       #define PMAP_CONTRIB_GET_RGBE32_SGN(c, f) (((c) & 1) ? -(f) : (f))
 
       /* Condition for binned contributions */
       #define PMAP_CONTRIB_BINNING(pmap) (isContribChildPmap(pmap) && \
          pmap -> preCompContrib && \
          ((PreComputedContrib*)pmap -> preCompContrib) -> nBins > 1 \
       )
 
 
 
       /* Struct for wavelet coeff thresholding; saves original coeff index */
       typedef struct {
          WAVELET_COEFF  *coeff;
          unsigned       idx;
       } PreComputedContribCoeff;
       
       typedef struct {
          char                       *waveletFname;
          FILE                       *waveletFile;
          WaveletMatrix2             waveletMatrix, tWaveletMatrix;
          /* Wavelet coeff compression/encoding buffer */
          PreComputedContribCoeff    *compressedCoeffs;
          mRGBERange                 mrgbeRange;
          mRGBE                      *mrgbeCoeffs;
          unsigned                   scDim, nBins, coeffDim, nCoeffs, 
                                     nCompressedCoeffs, nNonZeroCoeffs;
          unsigned long              contribSize;
          OOC_Cache                  *cache;
          /* Shared memory mapping to update numPhotons in multiprocessing mode */
          unsigned long              *numPhotonsShm;
          int                        shmFile;
          char                       shmFname [PMAP_TMPFNLEN];
       } PreComputedContrib;
 
 
 
       /* The following are convenient placeholders interfacing to mkpmap 
          and rcontrib. They can be externally set via initPmapContribTab()
          and referenced within the contrib pmap modules. These variables 
          can then be safely ignored by rtrace/rpict/rvu, without annoying
          linking errors. */
       /* Global pointer to rcontrib's contribution binning LUT */
       extern LUTAB *pmapContribTab;
       /* Contribution/coefficient mode flag */
       extern int *pmapContribMode;
 
 
        
       MODCONT *addContribModifier(LUTAB *contribTab, unsigned *numContribs, 
          char *mod, char *binParm, int binCnt
       );
       /* Add light source modifier mod to contribution lookup table
          contribsTab, and update numContribs. Return initialised
          contribution data for this modifier. */
 
 
       void addContribModfile(LUTAB *contribTab, unsigned *numContribs, 
          char *modFile, char *binParm, int binCnt
       );
       /* Add light source modifiers from file modFile to contribution lookup
        * table contribTab, and update numContribs. */
 
 
       void freePreCompContribNode (void *p);
       /* Clean up precomputed contribution LUT entry */
 
 
       void initPmapContribTab (LUTAB *contribTab, int *contribMode);
       /* Set contribution table and contrib/coeff mode flag
          (interface to rcmain.c, see also pmapContribTab above) */
       
       
       void initPmapContrib (PhotonMap *pmap);
       /* Initialise contribution photon map and check modifiers. 
          NOTE: pmapContribTab must be set before via initPmapContribTab() */
       
       
       PhotonContribSourceIdx newPhotonContribSource (PhotonMap *pmap, 
          const RAY *contribSrcRay, FILE *contribSrcHeap
       );
       /* Add contribution source ray for emitted contribution photon and save
        * light source index and binned direction.  The current contribution
        * source is stored in pmap -> lastContribSrc.  If the previous
        * contribution source spawned photons (i.e.  has srcIdx >= 0), it's
        * appended to contribSrcHeap.  If contribSrcRay == NULL, the current
        * contribution source is still flushed, but no new source is set. 
        * Returns updated contribution source counter pmap -> numContribSrc. */
 
 
       PhotonContribSourceIdx buildContribSources (PhotonMap *pmap, 
          FILE **contribSrcHeap, char **contribSrcHeapFname, 
          PhotonContribSourceIdx *contribSrcOfs, unsigned numHeaps
       );
       /* Consolidate per-subprocess contribution sources heaps into array
        * pmap -> contribSrc. Returns offset for contribution source index 
        * linearisation in pmap -> numContribSrc. The heap files in 
        * contribSrcHeap are closed on return. */
 
 
       void initPreComputedContrib (PreComputedContrib *preCompContrib);
       /* Initialise precomputed contribution container in photon map */
       
       
       void preComputeContrib (PhotonMap *pmap, unsigned numProc);
       /* Precompute contributions for a random subset of (finalGather * 
          pmap -> numPhotons) photons, and init the per-modifier precomputed
          contribution photon maps in LUT pmap -> preCompContribTab, discarding
          the original photons. */
 
 
       void distribPhotonContrib (PhotonMap *pmap, LUTAB *contribTab, 
          unsigned numContribs, int *contribMode, unsigned numProc
       );
       /* Emit photons with binned light source contributions, precompute their
          contributions and build photon map */
 
 
       int buildPreCompContribPmap (const LUENT *preCompContribNode, void *p);
       /* Build per-modifier precomputed photon map. Returns 0 on success.  */
 
 
       void saveContribPhotonMap (const PhotonMap *pmap, const char *fname, 
          int argc, char **argv
       );
       /* Save contribution pmap and its per-modifier precomputed children */
 
 
       void loadContribPhotonMap (PhotonMap *pmap, const char *fname);
       /* Load contribution pmap and its per-modifier precomputed children */
 
        
       void getPreCompPhotonContrib (PhotonMap *pmap, RAY *ray, 
          COLOR totalContrib
       );
       /* Get precomputed light source contributions at ray -> rop for all
          specified modifiers and accumulate them in pmapContribTab (which maps
          to rcontrib's binning LUT). Also return total precomputed contribs. */
    #endif
 #endif
diff --git a/pmapdata.h b/pmapdata.h
index 28ee966..befaaf8 100644
--- a/pmapdata.h
+++ b/pmapdata.h
@@ -1,439 +1,439 @@
 /* RCSid $Id: pmapdata.h,v 2.14 2020/04/08 15:14:21 rschregle Exp $ */
 
 /* 
    =========================================================================
    Photon map types and interface to nearest neighbour lookups in underlying
    point cloud data structure.
 
    The default data structure is an in-core kd-tree (see pmapkdt.{h,c}).
    This can be overriden with the PMAP_OOC compiletime switch, which enables
    an out-of-core octree (see oococt.{h,c}).
    
    Defining PMAP_FLOAT_FLUX stores photon flux as floats rather than packed
    RGBE for greater precision; this may be necessary when the flux differs
    significantly in individual colour channels, e.g. with highly saturated
    colours.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Fraunhofer Institute for Solar Energy Systems,
        supported by the German Research Foundation 
        (DFG LU-204/10-2, "Fassadenintegrierte Regelsysteme" (FARESYS)) 
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    (c) Tokyo University of Science,
        supported by the JSPS Grants-in-Aid for Scientific Research 
        (KAKENHI JP19KK0115, "Three-Dimensional Light Flow")
    =========================================================================
    
    $Id: pmapdata.h,v 2.14 2020/04/08 15:14:21 rschregle Exp $
 */
 
 
 
 #ifndef PMAPDATA_H
    #define PMAPDATA_H
 
    #ifndef NIX
       #if defined(_WIN32) || defined(_WIN64)
          #define NIX 0
       #else
          #define NIX 1
       #endif
    #endif
 
    #if (defined(PMAP_OOC) && !NIX)
       #error "OOC currently only supported on NIX -- tuff luck."
    #endif
 
 
 
    #include "ray.h"
    #include "pmaptype.h"
    #include "paths.h"
    #include "lookup.h"
    #include <stdint.h>
 
 
 
    /* Source of a contribution photon. This consists of the emitting light
       source and binned direction.  These records are only used to
       precompute contribution photons.  They are referenced by contribution
       photons (see contribSrc field in struct PhotonAuxAttrib below) in a
       surjective mapping, since multiple photons may share the same emitting
       source and direction if they lie along its associated path.  For this
       reason it is more efficient to factor this data out of the photons
       themselves and consolidate it here until the photons have been
       precomputed, after which it is no longer needed.  */
    typedef struct {
       int16    srcIdx,     /* Index of emitting light source */
                srcBin;     /* Binned incident direction */
    } PhotonContribSource;
 
    /* Maximum light source index accommodated by PhotonContribSource */
    #define PMAP_MAXSRCIDX  ((1L << 15) - 1)
 
 
 
    typedef uint32 PhotonPathID;
    typedef uint32 PhotonContribSourceIdx;
    #define PMAP_MAXCONTRIBSRC   UINT32_MAX
 
    #define photonSrcIdx(pm, p) ((pm) -> contribSrc \
       ? (pm) -> contribSrc [(p) -> aux.contribSrc].srcIdx \
       : (p) -> aux.pathID\
    )
    #define photonSrcBin(pm, p) ( \
       (pm) -> contribSrc [(p) -> aux.contribSrc].srcBin \
    )
    #define photonSrcMod(pm, p) findmaterial(source [photonSrcIdx(pm, p)].so)
 
 
 
    /* Multipurpose auxiliary photon attribute type */
    typedef union {
       /* Photon's propagation distance (= path length / time of flight) 
          for temporal light flow */
       float                   pathLen;
       /* Index into contribution photon's emitting source and binned 
          direction; see struct PhotonContribSource above */
       PhotonContribSourceIdx  contribSrc; 
       /* Unique path ID for all other photon types */
       PhotonPathID            pathID;
    } PhotonAuxAttrib;
 
 
 
    /* Macros for photon's generating subprocess field */
    #ifdef PMAP_OOC
       #define  PMAP_PROCBITS        7
    #else
       #define  PMAP_PROCBITS        5
    #endif
    #define     PMAP_MAXPROC         (1 << PMAP_PROCBITS)
    #define     PMAP_GETRAYPROC(r)   ((r) -> crtype >> 8)
    #define     PMAP_SETRAYPROC(r,p) ((r) -> crtype |= p << 8)
 
 
 
    typedef struct {
       float                pos [3];       /* Photon position */
       signed char          norm [3];      /* Encoded normal / incident
                                              direction [volume photons] */
       union {
          struct {
    #ifndef PMAP_OOC
             unsigned char  discr    : 2;  /* kd-tree discriminator axis */
    #endif            
             unsigned char  caustic  : 1;  /* Specularly scattered (=caustic) */
             
             /* Photon's generating subprocess index, used for primary ray
              * index linearisation when building contrib pmaps; note this is
              * reduced for kd-tree to accommodate the discriminator field */
             unsigned char  proc  : PMAP_PROCBITS; 
          };
          
          unsigned char     flags;
       };
 
       /* Photon flux in watts or lumen */
    #ifdef PMAP_FLOAT_FLUX
       COLOR                flux;
    #else
       COLR                 flux;
    #endif
       /* Auxiliary field; this is a multipurpose, type-specific field used
          by the following photon types (as identified by enum PhotonMapType
          in pmaptype.h):
          
          PMAP_TYPE_CONTRIB:         Index into photon map's contrib 
                                     origin array.
          PMAP_TYPE_TEMPLIGHTFLOW:   Distance travelled by photon / time
                                     of flight
          All others:                Photon path ID. 
       */
       PhotonAuxAttrib      aux;
    } Photon;
 
 
 
    /* Define PMAP_FLOAT_FLUX to store photon flux as floats instead of
     * compact RGBE, which was found to improve accuracy in analytical
     * validation. */
    #ifdef PMAP_FLOAT_FLUX
       #define setPhotonFlux(p,f) copycolor((p) -> flux, f)
       #define getPhotonFlux(p,f) copycolor(f, (p) -> flux)
    #else
       #define setPhotonFlux(p,f) setcolr((p)->flux, (f)[0], (f)[1], (f)[2])
       #define getPhotonFlux(p,f) colr_color(f, (p) -> flux)
    #endif
 
 
 
    /* Define search queue and underlying data struct types */
    #ifdef PMAP_OOC
       #include "pmapooc.h"
    #else
       #include "pmapkdt.h"
       #include "pmaptkdt.h"
    #endif
 
 
    /* Mean size of heapfile write buffer, in number of photons */
    #define PMAP_HEAPBUFSIZE   1e5
 
    /* Temporary filename for photon heaps */
    #define PMAP_TMPFNAME      TEMPLATE
    #define PMAP_TMPFNLEN      (TEMPLEN + 1)
 
 
    /* Forward declarations */
    struct PhotonMap;
    struct PreComputedContrib;
    struct PhotonBiasCompNode;
 
    typedef struct PhotonMap {
       PhotonMapType  type;             /* See pmaptype.h */
       char           *fileName;        /* Photon map file */
 
       /* ================================================================
        * PRE/POST-BUILD STORAGE
        * ================================================================ */
       FILE           *heap;            /* Unsorted photon heap prior to
                                           construction of store */
       char           heapFname [sizeof(PMAP_TMPFNAME)];
       Photon         *heapBuf;         /* Write buffer for above */
       unsigned long  heapBufLen,       /* Current & max size of heapBuf */
                      heapBufSize;
       PhotonStorage  store;            /* Photon storage in space
                                           subdividing data struct */
 
       /* ================================================================
        * PHOTON DISTRIBUTION STUFF
        * ================================================================ */
       unsigned long  distribTarget,    /* Num stored specified by user */
                      numPhotons;       /* Num actually stored */
       float          distribRatio;     /* Probability of photon storage */
       COLOR          photonFlux;       /* Average photon flux */
       unsigned short randState [3];    /* Local RNG state */
 
 
       /* ================================================================
        * PHOTON LOOKUP STUFF 
        * ================================================================ */
       union {                          /* Flags passed to findPhotons() */
          char        lookupCaustic : 1;
          char        lookupFlags;
       };
       
       PhotonSearchQueue squeue;        /* Search queue for photon lookups */
       unsigned       minGather,        /* Specified min/max photons per */
                      maxGather;        /* density estimate */
 
                                        /* NOTE: All distances are SQUARED */
       float          maxDist2,         /* Max search radius */
                      maxDist0,         /* Initial value for above */
                      maxDist2Limit,    /* Hard limit for above */
                      gatherTolerance;  /* Fractional deviation from minGather/
                                           maxGather for short lookup */
       void (*lookup)(
          struct PhotonMap*, RAY*, COLOR
       );                               /* Callback for type-specific photon
                                         * lookup (usually density estimate) */
 
       /* ================================================================
        * TRANSIENT PHOTON STUFF
        * ================================================================ */
       double         velocity,         /* Speed of light in units of scene 
                                           geometry [1/sec] */
                      time,             /* Photons' time of flight for
                                           transient lookups */
                      minPathLen,
                      maxPathLen,
                      avgPathLen;       /* Min/max/avg path length */
 
       /* ================================================================
        * CONTRIBUTION PHOTON STUFF
        * ================================================================ */
       PhotonContribSource *contribSrc,    /* Contribution source array */
                           lastContribSrc; /* Current contrib source */
       PhotonContribSourceIdx numContribSrc; /* Number of contrib sources */
 
       LUTAB          *preCompContribTab;  /* LUT for per-modifier precomp
                                              contrib child photon maps (in
                                              parent) or NULL (in child) */
       struct PreComputedContrib *preCompContrib;   /* Precomputed contribs
                                                       (in child) or NULL 
                                                       (in parent) */
       FILE           *contribHeap;        /* Out-of-core heap containing 
                                              unsorted precomputed contrib
                                              photon bins prior to 
                                              construction of store */
       char           contribHeapFname [sizeof(PMAP_TMPFNAME)], 
                      *contribHeapBuf;
       int            contribMode;         /* If 0, photon map contains 
                                              precomputed coefficients,
-                                             else contributions */                                             
-   
+                                             else contributions */
+
       /* ================================================================
        * BIAS COMPENSATION STUFF
        * ================================================================ */
       struct PhotonBiasCompNode *biasCompHist; /* Bias compensation history */
 
       /* ================================================================
        * STATISTIX
        * ================================================================ */
       unsigned long  totalGathered,    /* Total photons gathered */
                      numDensity,       /* Num density estimates */
                      numLookups,       /* Counters for short photon lookups */
                      numShortLookups;
                      
       unsigned       minGathered,      /* Min/max photons actually gathered */
                      maxGathered,      /* per density estimate */
                      shortLookupPct;   /* % of short lookups for stats */
       float          minError,         /* Min/max/rms density estimate error */
                      maxError,
                      rmsError,
                      CoGdist,          /* Avg distance to centre of gravity */
                      maxPos [3],       /* Max & min photon positions */
                      minPos [3];
       FVECT          CoG;              /* Centre of gravity (avg photon pos) */
 
    #ifdef PMAP_PATHFILT
      /* ================================================================
        * PHOTON PATH FILTERING STUFF
        * ================================================================ */
       LUTAB          *pathLUT;         /* Photon path lookup table to filter 
                                           volume photons */
       char           **pathLUTKeys;    /* Preallocated buffer to store keys
                                           for path lookup table */
       unsigned       numPathLUTKeys;   /* Num keys currently in key buffer 
                                           (= next free entry at tail) */
    #endif
    } PhotonMap;
 
 
 
    /* Photon maps by type (see PhotonMapType) */
    extern PhotonMap  *photonMaps [];
 
    /* Macros for specific photon map types */
    #define globalPmap               (photonMaps [PMAP_TYPE_GLOBAL])
    #define preCompPmap              (photonMaps [PMAP_TYPE_PRECOMP])
    #define causticPmap              (photonMaps [PMAP_TYPE_CAUSTIC])
    #define directPmap               (photonMaps [PMAP_TYPE_DIRECT])
    #define contribPmap              (photonMaps [PMAP_TYPE_CONTRIB])
    #define volumePmap               (photonMaps [PMAP_TYPE_VOLUME])
    #define transientPmap            (photonMaps [PMAP_TYPE_TRANSIENT])
 
    #ifdef PMAP_PHOTONFLOW
       /* Transient lightflow has precedence */
       #define lightFlowPmap         (transLightFlowPmap \
          ? transLightFlowPmap : photonMaps [PMAP_TYPE_LIGHTFLOW] \
       )
       #define transLightFlowPmap    (photonMaps [PMAP_TYPE_TRANSLIGHTFLOW])
    #else
       #define lightFlowPmap         NULL
       #define transLightFlowPmap    NULL
    #endif
 
    /* Photon map type tests */
    #define isGlobalPmap(p)          ((p) -> type == PMAP_TYPE_GLOBAL)
    #define isCausticPmap(p)         ((p) -> type == PMAP_TYPE_CAUSTIC)
    #define isContribPmap(p)         ((p) -> type == PMAP_TYPE_CONTRIB)
    #define isContribChildPmap(p)    ((p) -> type == PMAP_TYPE_CONTRIB_CHILD)
    #define isVolumePmap(p)          ((p) -> type == PMAP_TYPE_VOLUME)
    #define isTransientPmap(p)       ((p) -> type == PMAP_TYPE_TRANSIENT)
    
    #ifdef PMAP_PHOTONFLOW
       /* lightflow also implies transient lightflow */
       #define isLightFlowPmap(p)      ( \
          (p) -> type == PMAP_TYPE_LIGHTFLOW || isTransLightFlowPmap(p) \
       )
       #define isTransLightFlowPmap(p) ( \
          (p) -> type == PMAP_TYPE_TRANSLIGHTFLOW \
       )
    #endif
 
 
 
    void initPhotonMap (PhotonMap *pmap, PhotonMapType t);
    /* Initialise empty photon map of specified type */
 
    int newPhoton (PhotonMap *pmap, const RAY *ray, const char *contribs);
    /* Create new photon with ray's direction, intersection point, and flux, 
     * append to unsorted photon heap pmap -> heap. 
     * 
     * The photon is accepted with probability pmap -> distribRatio for
     * global density control; if the photon is rejected, -1 is returned,
     * else 0. The flux is scaled by ray -> rweight / pmap -> distribRatio.
     *  
     * If pmap is a contribution photon map and contribs != NULL, the
     * buffered precomputed contributions of length pmap -> preCompContrib ->
     * contribSize are appended to the contribution heap pmap -> contribHeap.
     * The contribution heap is synchronised with the photon heap and is
     * identically ordered. */
 
    void initPhotonHeap (PhotonMap *pmap);
    /* Open photon heap file */
 
    void flushPhotonHeap (PhotonMap *pmap);
    /* Flush photon heap buffa pmap -> heapBuf to heap file pmap -> heap;
     * used by newPhoton() and to finalise heap in distribPhotons(). */
 
    void buildPhotonMap (PhotonMap *pmap, double *photonFlux, 
       PhotonContribSourceIdx *contribSrcOfs, unsigned nproc
    );
    /* Postprocess unsorted photon heap pmap -> heap and build underlying data
     * structure pmap -> store.  This is prerequisite to photon lookups with
     * findPhotons(). */
     
    /* PhotonFlux is the flux per photon averaged over RGB; this is
     * multiplied with each photon's flux during the postprocess.  In the
     * case of a contribution photon map, this is an array with a separate
     * flux specific to each light source due to non-uniform photon emission;
     * Otherwise it is referenced as a scalar value.  Flux is not scaled if
     * photonFlux == NULL.  */
     
    /* Photon map construction may be parallelised if nproc > 1, if
     * supported. The heap is destroyed on return.  */
     
    /* OriginOfs is an array of index offsets for the contribution photon
     * origins in pmap->contribOrg generated by each of the nproc subprocesses
     * during contrib photon distribution (see distribPhotonContrib()). These 
     * offsets are used to linearise the photon origin indices in the 
     * postprocess. This linearisation is skipped if originOfs == NULL,
     * e.g. when building a global/caustic/volume photon map, where the
     * origins are serial path IDs. */
 
    void findPhotons (PhotonMap* pmap, const RAY *ray);
    /* Find pmap -> squeue.len closest photons to ray -> rop with similar 
       normal. For volume photons ray -> rorg is used and the normal is 
       ignored (being the incident direction in this case). Found photons 
       are placed search queue starting with the furthest photon at pmap ->
       squeue.node, and pmap -> squeue.tail being the number actually found. */
 
    Photon *find1Photon (PhotonMap *pmap, const RAY *ray, Photon *photon,
       void *photonIdx
    );
    /* Find single closest photon to ray -> rop with similar normal. 
       Return NULL if none found, else the supplied Photon* buffer
       and the photon's index/ID (if photonIdx != NULL), indicating 
       that it contains a valid photon. */
 
    void getPhoton (PhotonMap *pmap, PhotonIdx idx, Photon *photon);
    /* Retrieve photon referenced by idx from pmap -> store */
 
    Photon *getNearestPhoton (const PhotonSearchQueue *squeue, PhotonIdx idx);
    /* Retrieve photon from NN search queue after calling findPhotons() */
    
    PhotonIdx firstPhoton (const PhotonMap *pmap);
    /* Index to first photon, to be passed to getPhoton(). Indices to
     * subsequent photons can be optained via increment operator (++) */
     
    void deletePhotons (PhotonMap*);
    /* Free dem mammaries... */
 
 #endif
 
diff --git a/pmcontrib4.c b/pmcontrib4.c
index 39dd662..da8065e 100644
--- a/pmcontrib4.c
+++ b/pmcontrib4.c
@@ -1,809 +1,823 @@
 #ifndef lint
 static const char RCSid[] = "$Id: pmcontrib2.c,v 2.5 2018/11/08 00:54:07 greg Exp $";
 #endif
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions. 
    These routines interface to rcontrib.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #include "pmapcontrib.h"
 
 #ifdef PMAP_CONTRIB
 #include "pmapmat.h"
 #include "pmapsrc.h"
 #include "pmaprand.h"
 #include "pmapio.h"
 #include "pmapdiag.h"
 #include "otypes.h"
 #include "otspecial.h"
 #include "random.h"
 #include "pmcontribcache.h"
 #include "oocnn.h"
 
 
 
 void initPmapContribTab (LUTAB *contribTab, int *contribMode)
 /* Set contribution table (interface to rcmain.c) */
 {
    pmapContribTab = contribTab;
    pmapContribMode = contribMode;
 }
 
 
 
 /* ------------------ CONTRIB PMAP LOADING STUFF --------------------- */
 
 
 
 static int checkContribModifier (const LUENT *modContNode, void *p)
 /* Check for valid modifier in LUT entry */
 {
    MODCONT              *modCont = (MODCONT*)modContNode -> data;
    char                 *modName = (char*)modCont -> modname;
    OBJECT               modObj = modifier(modName);
    const PhotonMap      *pmap = (PhotonMap*)p;
    
    if (modObj == OVOID) {
       sprintf(errmsg, "invalid modifier %s", modName);
       error(USER, errmsg);
    }
    if (!islight(objptr(modObj) -> otype)) {
       sprintf(errmsg, "%s is not a light source modifier", modName);
       error(USER, errmsg);
    }
    
+#if 0
    /* Warn if number of bins exceeds lookup bandwidth, as this is
     * guaranteed to result in empty bins! */
    if (pmap -> maxGather < modCont -> nbins) {
       sprintf(errmsg, "contribution photon lookup bandwidth too low "
          "for modifier %s, some bins will be zero", modName
       );
       error(WARNING, errmsg);
    }
+#endif
    
    return 0;
 }
 
 
 
 void initPmapContrib (PhotonMap *pmap)
 /* Set up photon map contributions and get binning parameters */ 
 {
    unsigned t;
 
    /* Avoid incompatible photon maps */
    for (t = 0; t < NUM_PMAP_TYPES; t++)
       if (photonMaps [t] && t != PMAP_TYPE_CONTRIB) {
          sprintf(errmsg, "%s photon map does not support contributions",
             pmapName [t]
          );
          error(USER, errmsg);
       }
 
    if (!pmapContribTab)
       error(INTERNAL, 
          "contribution LUT not initialised in initPmapContrib()"
       );
       
    /* Check for valid contribution modifiers */
    lu_doall(pmapContribTab, checkContribModifier, pmap);
 
    /* Set callback to collect contributions */
    if (pmap) {
       pmap -> lookup = getPreCompPhotonContrib;
       /* XXX: Need tighter tolerance for contribution photon lookups? */
       pmap -> gatherTolerance = 1.0;
    }
 }
 
 
 
 static int loadPreCompContrib (const LUENT *modContNode, void *p)
 /* Load child contribution photon map for current contrib modifier entry */
 {
    MODCONT              *modCont = (MODCONT*)modContNode -> data;
    char                 *modName = (char*)modCont -> modname;
    PhotonMap            *preCompContribPmap;
    const PhotonMap      *parentPmap = (const PhotonMap*)p;
    LUENT                *preCompContribNode;
    PreComputedContrib   *preCompContrib;
    char                 dirName [1024];
    
    /* Allocate new LUT entry for child precomputed contribution pmap */
    preCompContribNode = lu_find(parentPmap -> preCompContribTab, modName);
    if (!preCompContribNode)
       error(SYSTEM, 
          "out of memory allocating LUT entry in loadPreCompContrib()"
       );
    preCompContribNode -> key = modName;
 
    /* Allocate child precomputed contribution photon map */
    preCompContribNode -> data = (char*)(
       preCompContribPmap = malloc(sizeof(PhotonMap))
    );
    if (!preCompContribPmap)
       error(SYSTEM, "out of memory allocating precomputed contribution "
          "photon map in loadPreCompContrib()"
       );
    
    /* Set subdirectory from parent photon map's filename */
    snprintf(dirName, sizeof(dirName), PMAP_CONTRIB_DIR, 
       parentPmap -> fileName
    );
 
    /* Allocate & set child pmap's filename from subdir and modifier */
    preCompContribPmap -> fileName = malloc(strlen(parentPmap -> fileName) +
       strlen(modName) + strlen(PMAP_CONTRIB_FILE)
    );
    if (!preCompContribPmap -> fileName)
       error(SYSTEM, "out of memory allocating filename in "
          "loadPreCompContrib()"
       );   
    snprintf(preCompContribPmap -> fileName, sizeof(dirName) + 4, 
       PMAP_CONTRIB_FILE, dirName, modName
    );      
    
    /* Load child precomp contrib photon map for current modifier; 
       loadPhotonMap() will not recurse and call loadContribPhotonMap() 
       again because preCompContribPmap -> preCompContribTab == NULL */
    loadPhotonMap(preCompContribPmap, preCompContribPmap -> fileName);   
    
    /* Pass parent pmap's lookup parameters on to child */
    preCompContribPmap -> minGather        = parentPmap -> minGather;
    preCompContribPmap -> maxGather        = parentPmap -> maxGather;
    preCompContribPmap -> gatherTolerance  = parentPmap -> gatherTolerance;   
    
    /* Override contrib/coefficient mode if it doesn't match precomputed */
    if (*pmapContribMode != preCompContribPmap -> contribMode) {
       sprintf(errmsg, 
          "photon map contains precomputed %s, overriding rcontrib setting",
          preCompContribPmap -> contribMode ? "contributions" : "coefficients"
       );
       error(WARNING, errmsg);
       *pmapContribMode = preCompContribPmap -> contribMode;
    }
    
    /* NOTE: preCompContribPmap -> preCompContrib is initialised by
     * loadPhotonMap() */
    preCompContrib = (PreComputedContrib*)(
       preCompContribPmap -> preCompContrib
    );
    
    /* Set number of bins and wavelet coefficients */
    preCompContrib -> nBins = preCompContrib -> scDim * 
       preCompContrib -> scDim;
    preCompContrib -> nCoeffs = preCompContrib -> coeffDim *
       preCompContrib -> coeffDim;
    
    /* Set wavelet coefficient filename */
    preCompContrib -> waveletFname = malloc(strlen(dirName) + 
       strlen(modName) + strlen(PMAP_CONTRIB_WAVELETFILE)
    );
    if (!preCompContribPmap -> fileName)
       error(SYSTEM, "out of memory allocating filename in "
          "loadPreCompContrib()"
       );
    snprintf(preCompContrib -> waveletFname, sizeof(dirName) + 5, 
       PMAP_CONTRIB_WAVELETFILE, dirName, modName
    );
    
    return 0;
 }
 
 
 
 void loadContribPhotonMap (PhotonMap *pmap, const char *fname)
 /* Load contribution pmap and its per-modifier precomputed children */
 {
    LUTAB lutInit = LU_SINIT(NULL, freePreCompContribNode);
    
    /* Allocate & init LUT containing per-modifier child contrib pmaps */
    if (!(pmap -> preCompContribTab = malloc(sizeof(lutInit))))
       error(SYSTEM, "out of memory allocating precomputed contribution "
          "LUT in loadContribPhotonMap()"
       );
    memcpy(pmap -> preCompContribTab, &lutInit, sizeof(lutInit));
 
    /* NOTE: filename already set in loadPhotonMap()
    pmap -> fileName = (char*)fname; */
 
    /* Init contrib photon map for light source contributions */
    initPmapContrib(pmap);
      
    /* Load child contribution photon maps for each modifier in contribTab */
    lu_doall(pmapContribTab, loadPreCompContrib, pmap);
 }
 
 
 
 static int decodeContribs (PreComputedContrib *preCompContrib)
 /* Decode and decompress mRGBE-encoded wavelet coefficients in
    preCompContrib -> mrgbeCoeffs, apply inverse wavelet transform and 
    return decoded contributions in the wavelet coefficient matrix 
    preCompContrib -> waveletMatrix.
    NOTE: THE WAVELET COEFFICIENT MATRIX IS ASSUMED TO BE ZEROED, with 
    the wavelet approximation coefficients in the upper left of the matrix 
    (i.e. preCompContrib -> waveletMatrix [0..approxDim-1][0..approxDim-1],
    where approxDim = WAVELET_PADD2_APPROXDIM).
    Returns 0 on success, else -1. */
 {
    unsigned          c, i, j, coeffIdx, scDim, coeffDim, nCoeffs, 
                      nCompressedCoeffs;
    WaveletMatrix2    waveletMatrix;
    mRGBERange        *mrgbeRange;
    mRGBE             *mrgbeCoeffs;
    DCOLOR            fCoeff;
    
    if (!preCompContrib || !preCompContrib -> mrgbeCoeffs ||
       !(waveletMatrix = preCompContrib -> waveletMatrix) || 
       !preCompContrib -> tWaveletMatrix
    )
       /* Should be initialised by caller! */
       return -1;
    
    scDim             = preCompContrib -> scDim;
    coeffDim          = preCompContrib -> coeffDim;
    nCoeffs           = preCompContrib -> nCoeffs;
    nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
    mrgbeCoeffs       = preCompContrib -> mrgbeCoeffs;
    mrgbeRange        = &preCompContrib -> mrgbeRange;
    
    /* Init mRGBE coefficient normalisation from range */
    if (!mRGBEinit(mrgbeRange, mrgbeRange -> min, mrgbeRange -> max))
       return -1;
 
    /* Decode mRGBE detail coeffs and populate wavelet coefficient matrix,
       based on the encoded incremental coefficient index. This omits the
       approximation coefficients in the upper left of the coefficient matrix
       (these should have already been set by the caller).
       NOTE: The detail coefficients in the matrix are assumed to be
       initialised to zero to account for those that were dropped by
       thresholding. */
    for (c = coeffIdx = 0; c < nCompressedCoeffs; c++) {
       coeffIdx += mRGBEdecode(mrgbeCoeffs [c], mrgbeRange, fCoeff);
       i = PMAP_CONTRIB_LIN2X(coeffDim, coeffIdx);
       j = PMAP_CONTRIB_LIN2Y(coeffDim, coeffIdx);
 #ifdef PMAP_CONTRIB_DBG
       /* Check for invalid decoded coefficients */
       if (i < WAVELET_PADD2_APPROXDIM && j < WAVELET_PADD2_APPROXDIM ||
          coeffIdx >= nCoeffs
       )
          error(CONSISTENCY, 
             "wavelet coefficient index out of range in decodeContribs()"
          );
 #endif
       copycolor(waveletMatrix [i] [j], fCoeff);
    }
 
    /* Do inverse 2D wavelet transform on preCompContrib -> waveletMatrix */
    if (!padWaveletInvXform2(waveletMatrix, preCompContrib -> tWaveletMatrix,
          coeffDim, scDim
       )
    )
       error(INTERNAL, 
          "failed inverse wavelet transform in decodeContribs()"
       );
 
 #if 0
 #ifdef PMAP_CONTRIB_DBG
    for (i = 0; i < scDim; i++) {
       for (j = 0; j < scDim; j++) {
    #if 0
          /* Dump decoded contribs for debugging */
          printf("% 7.3g\t", colorAvg(waveletMatrix [i][j]));
    #endif
          /* Warn if coefficient is negative; this _can_ happen due to loss
             of precision by the mRGBE encoding */
          if (min(min(waveletMatrix [i][j][0], waveletMatrix [i][j][1]),
                waveletMatrix [i][j][2]
             ) < 0
          )
             error(WARNING, "negative contributions in getPreCompContrib()");
       }
    #if 0
       putchar('\n');
    }
    putchar('\n');
    #else
    }
    #endif
 #endif
 #endif
 
    return 0;
 }
 
 
 
 static void getPreCompContribByPhoton (const Photon *photon, 
    OOC_DataIdx photonIdx, PreComputedContrib *preCompContrib, 
    DCOLOR *binnedContribs 
 )
 /* Fetch and decode mRGBE-encoded wavelet coefficients for given photon 
    from wavelet file at offset photonIdx, perform inverse wavelet xform,
    and return the reconstructed binned contributions in binnedContribs
    (which is assumed to be variable). Returns 0 on success, else -1. */
 /* NOTE: binnedContribs IS ASSUMED TO POINT TO CACHE PAGE DATA! */
 {
    unsigned             i, j, k, coeffIdx, scDim, coeffDim, 
                         nCompressedCoeffs;
    COLOR                fCoeff;
    COLR                 rgbe32 [WAVELET_PADD2_NUMAPPROX + 2];
    WaveletMatrix2       waveletMatrix = NULL;
    WaveletCoeff3        *coeffPtr;
    
    if (!binnedContribs || !preCompContrib)
       /* Shouldn't happen */
       error(INTERNAL, 
          "contributions not initialised in getPreCompContrib()"
       );
 
    if (preCompContrib -> nBins <= 1)
       /* No binning, so nothing to decode */
       return;
 
    scDim             = preCompContrib -> scDim;
    coeffDim          = preCompContrib -> coeffDim;
    nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
 
    /* Lazily initialise preCompContrib */
    if (!preCompContrib -> waveletFile) {
       /* Lazily open wavelet coefficient file */
       preCompContrib -> waveletFile = fopen(
          preCompContrib -> waveletFname, "rb"
       );
       if (!preCompContrib -> waveletFile) {
          sprintf(errmsg, "can't open wavelet coefficient file %s "
             "in getPreCompContrib()", preCompContrib -> waveletFname
          );
          error(SYSTEM, errmsg);
       }
 
       /* Set record size of encoded coeffs in wavelet file */
       preCompContrib -> contribSize = PMAP_CONTRIB_ENCSIZE(
          nCompressedCoeffs
       );
 
       /* Lazily allocate buffer for mRGBE wavelet coeffs */
       preCompContrib -> mrgbeCoeffs = calloc(nCompressedCoeffs, 
          sizeof(mRGBE)
       );
       if (!preCompContrib -> mrgbeCoeffs)
          error(SYSTEM, "out of memory allocating decoded wavelet "
             "coefficients in getPreCompContrib()"
          );
          
       /* Lazily allocate primary and transposed wavelet matrices */
       preCompContrib -> waveletMatrix = allocWaveletMatrix2(coeffDim);
       preCompContrib -> tWaveletMatrix = allocWaveletMatrix2(coeffDim);
       if (!preCompContrib -> waveletMatrix || 
          !preCompContrib -> tWaveletMatrix
       )
          error(SYSTEM, "out of memory allocating wavelet coefficient "
             "matrix in getPreCompContrib()"
          );
    }
 
    /* Seek to photon index in wavelet file and read its associated
       coefficients */ 
    if (fseek(preCompContrib -> waveletFile,
          photonIdx * preCompContrib -> contribSize, SEEK_SET
       ) < 0
    )
       error(SYSTEM, "can't seek in wavelet coefficient file "
          "in getPreCompContrib()"
       );
 
    /* Read 32-bit encoded wavelet approximation coefficients and mRGBE
     * range; omit mRGBE minimum if only 1 compressed bin (=maximum
     * compression), since it's implicitly zero in this case. */
    if (getbinary(rgbe32, sizeof(COLR), 
          WAVELET_PADD2_NUMAPPROX + 1 + (nCompressedCoeffs > 1),
          preCompContrib -> waveletFile
       ) != WAVELET_PADD2_NUMAPPROX + 1 + (nCompressedCoeffs > 1)
    )
       error(SYSTEM, "can't read approximation coefficients from "
          "wavelet coefficient file in getPreCompContrib()"
       );
 
    if (getbinary(preCompContrib -> mrgbeCoeffs, sizeof(mRGBE),
          nCompressedCoeffs, preCompContrib -> waveletFile
       ) != nCompressedCoeffs
    )
       error(SYSTEM, "can't read detail coefficients from "
          "wavelet coefficient file in getPreCompContrib()"
       );
 
    /* Get mRGBE range (NOTE min/max are reversed in the coeff file)
       Note that direct assignment isn't possible as colr_color() returns
       float, not double. */
    colr_color(fCoeff, rgbe32 [WAVELET_PADD2_NUMAPPROX]);
    copycolor(preCompContrib -> mrgbeRange.max, fCoeff);
    
    if (nCompressedCoeffs > 1) {
       colr_color(fCoeff, rgbe32 [WAVELET_PADD2_NUMAPPROX + 1]);
       copycolor(preCompContrib -> mrgbeRange.min, fCoeff);
    }
    else {
       /* Single compressed bin; mRGBE minimum is implicitly zero */
       setcolor(preCompContrib -> mrgbeRange.min, 0, 0, 0);
    }
 
    /* Zero wavelet coefficient matrix and set approximation coefficients in
     * the upper left of the matrix, as expected by the decodeContribs(). */
    waveletMatrix = preCompContrib -> waveletMatrix;
    zeroCoeffs2(waveletMatrix, coeffDim);
    for (i = 0; i < WAVELET_PADD2_APPROXDIM; i++) {
       /* Calc row index (=mult) in outer loop, increment in inner */
       coeffIdx = PMAP_CONTRIB_XY2LIN(WAVELET_PADD2_APPROXDIM, i, 0);
       
       for (j = 0; j < WAVELET_PADD2_APPROXDIM; j++, coeffIdx++) {
          /* Direct assignment to wavelet matrix via colr_color() isn't
           * possible as the latter returns float, not double */
          colr_color(fCoeff, rgbe32 [coeffIdx]);
          /* HACK: depending on the boundary extension mode, some approx
           * coeffs may be NEGATIVE (!), so set sign from extra bit in
           * 32-bit RGBE mantissa. */
          for (k = 0; k < 3; k++)
             waveletMatrix [i] [j] [k] = PMAP_CONTRIB_GET_RGBE32_SGN(
                rgbe32 [coeffIdx] [k], fCoeff [k]
             );
       }
    }
 
    /* All set, now decode mRGBE coeffs and invert wavelet transform */
    if (decodeContribs(preCompContrib))
       error(INTERNAL, "failed contribution decoding/decompression "
          "in getPreCompContrib()"
       );
       
    /* Copy decoded contributions from wavelet coefficient matrix */
    for (i = 0; i < scDim; i++) {
       /* Calc row pointer (=mult) in outer loop, increment in inner */
       coeffPtr = binnedContribs + PMAP_CONTRIB_XY2LIN(scDim, i, 0);
 
 #ifdef PMAP_CONTRIB_LOG
       /* Logarithmic contribs; copy & apply log per element */
       for (j = 0; j < scDim; j++, coeffPtr++)
          for (k = 0; k < 3; k++)
             (*coeffPtr) [k] = PMAP_CONTRIB_IVAL(waveletMatrix [i] [j] [k]);
 #else
       /* Linear contribs; copy matrix rows */
       memcpy(*coeffPtr, waveletMatrix [i], scDim * WAVELET_COEFFSIZE);
 #endif
    }
 }
 
 
 
 typedef struct {
    const RAY   *ray;
    RREAL       rayCoeff [3];
    COLORV      *totalContrib;
 } PreCompContribRCData;
 
 
 
 #if 1
 static int getPreCompContribByMod (const LUENT *preCompContribTabNode, 
    void *p
 )
 /* Fetch and decode precomputed contributions from closest photon in pmap
  * referenced by preCompContribTabNode, and accumulate in pmapContribTab
  * for the corresponding modifier.
  * THIS VERSION FETCHES THE SINGLE CLOSEST PRECOMP. PHOTON */
 {   
    PhotonMap                  *preCompContribPmap = (PhotonMap*)(
                                  preCompContribTabNode -> data
                               );
    PreComputedContrib         *preCompContrib;
    const char                 *modName = preCompContribTabNode -> key;
    MODCONT                    *modCont;
    PreCompContribRCData       *rcData = (PreCompContribRCData*)p;
    LUENT                      *contribTabNode;
    Photon                     photon;
    OOC_DataIdx                photonIdx;
    COLOR                      photonContrib;
    unsigned                   b, k;
 
    if (!preCompContribPmap || !rcData || 
       !(preCompContrib = (PreComputedContrib*)(
          preCompContribPmap -> preCompContrib
       ))
    ) {
       sprintf(errmsg, "missing precomputed contributions for modifier %s",
          modName
       );
       error(INTERNAL, errmsg);
    }
    
    /* Find rcontrib's LUT entry for this modifier */
    contribTabNode = lu_find(pmapContribTab, modName);
    if (!contribTabNode || !contribTabNode -> key || 
       !(modCont = (MODCONT*)contribTabNode -> data)
    ) {
       sprintf(errmsg, "missing contribution LUT entry for modifier %s",
          modName
       );
       error(INTERNAL, errmsg);
    }
    
-   /* Reallocate rcontrib bins if they don't match precomputed */
+   /* Check if rcontrib bins match precomputed
+      XXX: Currently we just bail out if this is not the case, as this
+      requires reallocating the bins in modCont (which is easy) _and_
+      reassigning the already opened output streams (which is much more
+      complicated and prone to disastrous side effects */
    if (modCont -> nbins != preCompContrib -> nBins) {
+   #if 0
       contribTabNode -> data = realloc(modCont,
          sizeof(MODCONT) + sizeof(DCOLOR) * (preCompContrib -> nBins - 1)
       );
 
       if (!contribTabNode -> data) {
          sprintf(errmsg, "out of memory reallocating contribution bins "
             "for modifier %s", modName
          );
          error(SYSTEM, errmsg);
       }
       else {
          sprintf(errmsg, "bin count mismatch for modifier %s, resizing",
             modName
          );
          error(WARNING, errmsg);
       }
       
       modCont = (MODCONT*)contribTabNode -> data;
       modCont -> nbins = preCompContrib -> nBins;
       memset(modCont -> cbin, 0, sizeof(DCOLOR) * modCont -> nbins);
+   #else
+      sprintf(errmsg, 
+         "bin count mismatch for modifier %s; please specify with -bn %d",
+         modName, preCompContrib -> nBins
+      );
+      error(USER, errmsg);
+   #endif
    }
  
    /* Fetch closest photon and its contributions */
    if (find1Photon(preCompContribPmap, rcData -> ray, &photon, 
          &photonIdx
       )
    ) {
       if (preCompContrib -> nBins > 1) {
          /* BINNING ENABLED; lazily init contrib cache if necessary */
          if (initContribCache(preCompContrib)) {
             /* CACHE ENABLED; fetch cache page for found photon */
             DCOLOR *cacheBins;
             
             if (!getContribCache(preCompContrib, photonIdx, &cacheBins)) {
                /* PAGE NOT CACHED; decode contribs into new cache page */
                getPreCompContribByPhoton(&photon, photonIdx, preCompContrib, 
                   cacheBins
                );
             } 
             else; /* PAGE CACHED; (re)use decoded contribs from cache! */
             
             for (b = 0; b < preCompContrib -> nBins; b++) {
                /* Accumulate binned contributions into rcontrib's LUT entry
                   for this modifier, weighted by ray coefficient.  Also sum
                   total contributions.  */
                /* NOTE: Using multcolor() on the decoded contribs would
                   modify them in the cache, so use a temp variable here. */
                for (k = 0; k < 3; k++)
                   photonContrib [k] = cacheBins [b] [k] * 
                      rcData -> rayCoeff [k];
 
                addcolor(modCont -> cbin [b], photonContrib);
                addcolor(rcData -> totalContrib, photonContrib);
             }
          }
          else {   
             /* CACHE DISABLED; decode contribs into temp array on stack */
             DCOLOR tempBins [preCompContrib -> nBins];
             
             getPreCompContribByPhoton(&photon, photonIdx, preCompContrib,
                tempBins
             );
          
             for (b = 0; b < preCompContrib -> nBins; b++) {
                /* Accumulate binned contributions into rcontrib's LUT entry
                   for this modifier, weighted by ray coefficient.  Also sum
                   total contributions.  */
                for (k = 0; k < 3; k++)
                   photonContrib [k] = tempBins [b] [k] * 
                      rcData -> rayCoeff [k];
 
                addcolor(modCont -> cbin [b], photonContrib);
                addcolor(rcData -> totalContrib, photonContrib);
             }
          }
       }
       else {
          /* NO BINNING; get single contribution directly from photon,
             scale by ray coefficient and sum to total contribs */
          getPhotonFlux(&photon, photonContrib);
          multcolor(photonContrib, rcData -> rayCoeff);
          addcolor(modCont -> cbin [0], photonContrib);
          addcolor(rcData -> totalContrib, photonContrib);
       }
    }
    
    return 0;
 }
 
 #else
 
 static int getPreCompContribByMod (const LUENT *preCompContribTabNode, 
    void *p
 )
 /* Fetch and decode precomputed contributions from closest photon in pmap
  * referenced by preCompContribTabNode, and accumulate in pmapContribTab
  * for the corresponding modifier.
  * HACK: THIS VERSION AVERAGES OVER SEVERAL PRECOMP. PHOTONS! */
 {   
    PhotonMap                  *preCompContribPmap = (PhotonMap*)(
                                  preCompContribTabNode -> data
                               );
    PreComputedContrib         *preCompContrib;
    const char                 *modName = preCompContribTabNode -> key;
    MODCONT                    *modCont;
    PreCompContribRCData       *rcData = (PreCompContribRCData*)p;
    LUENT                      *contribTabNode;
    Photon                     *photon;
    PhotonSearchQueue          *squeue;
    COLOR                      photonContrib;
    unsigned                   i, b, k;
 
    if (!preCompContribPmap || !rcData || 
       !(preCompContrib = (PreComputedContrib*)(
          preCompContribPmap -> preCompContrib
       ))
    ) {
       sprintf(errmsg, "missing precomputed contributions for modifier %s",
          modName
       );
       error(INTERNAL, errmsg);
    }
 
    /* Find rcontrib's LUT entry for this modifier */
    contribTabNode = lu_find(pmapContribTab, modName);
    if (!contribTabNode || !contribTabNode -> key || 
       !(modCont = (MODCONT*)contribTabNode -> data)
    ) {
       sprintf(errmsg, "missing contribution LUT entry for modifier %s",
          modName
       );
       error(INTERNAL, errmsg);
    }
    
    /* Reallocate rcontrib bins if they don't match precomputed */
    if (modCont -> nbins != preCompContrib -> nBins) {
       contribTabNode -> data = realloc(modCont,
          sizeof(MODCONT) + sizeof(DCOLOR) * (preCompContrib -> nBins - 1)
       );
 
       if (!contribTabNode -> data) {
          sprintf(errmsg, "out of memory reallocating contribution bins "
             "for modifier %s", modName
          );
          error(SYSTEM, errmsg);
       }
       else {
          sprintf(errmsg, "bin count mismatch for modifier %s, resizing",
             modName
          );
          error(WARNING, errmsg);
       }
       
       modCont = (MODCONT*)contribTabNode -> data;
       modCont -> nbins = preCompContrib -> nBins;
       memset(modCont -> cbin, 0, sizeof(DCOLOR) * modCont -> nbins);
    }
  
    if (!preCompContribPmap -> maxGather) 
      /* Zero bandwidth */
      return -1;
      
    /* Fetch closest photon and its contributions */
    findPhotons(preCompContribPmap, rcData -> ray);
    squeue = &preCompContribPmap -> squeue;
    for (i = 0; i < squeue -> tail; i++) {
       photon = getNearestPhoton(squeue, squeue -> node [i].idx);
       
       if (preCompContrib -> nBins > 1) {
          /* BINNING ENABLED; lazily init contrib cache if necessary */
    #if 0
          if (initContribCache(preCompContrib)) {
             /* CACHE ENABLED; fetch cache page for found photon */
             DCOLOR *cacheBins;
             
             if (!getContribCache(preCompContrib, photonIdx, &cacheBins)) {
                /* PAGE NOT CACHED; decode contribs into new cache page */
                getPreCompContribByPhoton(&photon, photonIdx, preCompContrib, 
                   cacheBins
                );
             } 
             else; /* PAGE CACHED; (re)use decoded contribs from cache! */
             
             for (b = 0; b < preCompContrib -> nBins; b++) {
                /* Accumulate binned contributions into rcontrib's LUT entry
                   for this modifier, weighted by ray coefficient.  Also sum
                   total contributions.  */
                /* NOTE: Using multcolor() on the decoded contribs would
                   modify them in the cache, so use a temp variable here. */
                for (k = 0; k < 3; k++)
                   photonContrib [k] = cacheBins [b] [k] * 
                      rcData -> rayCoeff [k];
 
                addcolor(modCont -> cbin [b], photonContrib);
                addcolor(rcData -> totalContrib, photonContrib);
             }
          }
          else 
    #endif
          {
             /* CACHE DISABLED; decode contribs into temp array on stack */
             DCOLOR tempBins [preCompContrib -> nBins];
             
             getPreCompContribByPhoton(photon, squeue -> node [i].recIdx, 
                preCompContrib, tempBins
             );
 
             for (b = 0; b < preCompContrib -> nBins; b++) {
                /* Accumulate binned contributions into rcontrib's LUT entry
                   for this modifier, weighted by ray coefficient.  Also sum
                   total contributions.  */
                for (k = 0; k < 3; k++)
                   photonContrib [k] = tempBins [b] [k] * 
                      rcData -> rayCoeff [k] / squeue -> tail;
 
                addcolor(modCont -> cbin [b], photonContrib);
                addcolor(rcData -> totalContrib, photonContrib);
             }
          }
       }
       else {
          /* NO BINNING; get single contribution directly from photon,
             scale by ray coefficient and sum to total contribs */
          getPhotonFlux(photon, photonContrib);\
          scalecolor(photonContrib, 1.0 / squeue -> tail);
          multcolor(photonContrib, rcData -> rayCoeff);
          addcolor(modCont -> cbin [0], photonContrib);
          addcolor(rcData -> totalContrib, photonContrib);
       }
    }
    
    return 0;
 }
 #endif
 
 
 
 void getPreCompPhotonContrib (PhotonMap *pmap, RAY *ray, 
    COLOR totalContrib
 )
 /* Get precomputed light source contributions at ray -> rop for all
    specified modifiers and accumulate them in pmapContribTab (which maps
    to rcontrib's binning LUT). Also return total precomputed contribs. */
 {
    PreCompContribRCData rcData;
 
    /* Ignore sources */
    if (ray -> ro && islight(objptr(ray -> ro -> omod) -> otype)) 
       return;
  
    /* Get cumulative path coefficient up to photon lookup point */
    raycontrib(rcData.rayCoeff, ray, PRIMARY);
    setcolor(totalContrib, 0, 0, 0);
    
    /* Rcontrib results passed to per-modifier child contrib pmaps */
    rcData.ray           = ray;
    rcData.totalContrib  = totalContrib;
 
    /* Iterate over child contribution photon maps for each modifier and
     * collect their contributions */
    lu_doall(pmap -> preCompContribTab, getPreCompContribByMod, &rcData);
 }
 
 #endif /* PMAP_CONTRIB */
diff --git a/wavelet2.c b/wavelet2.c
index fa60ecd..60380e8 100644
--- a/wavelet2.c
+++ b/wavelet2.c
@@ -1,878 +1,897 @@
 /* 
    =========================================================================
    2-dimensional Daubechies D2 and D1 (Haar) wavelet transforms on 
    (2^l) x (2^l) sized arrays of 3-tuples, where l > 1.  Array boundaries 
    are extended as periodic, which does not require padding coefficients.
    
    Compile with -DWAVELET_TEST_2D to build standalone unit tests.
    
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #include "wavelet2.h"
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
-#include <stdio.h>
 
 
 
 /* The following defs are const, but strict compilers pretend to be dumber
    than they are, and refuse to init from a func (in this case sqrt(2)
    and sqrt(3)), or other consts */
 
-/* Haar wavelet coeffs */
+/* 2-tap Haar wavelet/scaling func coeffs */
 const WAVELET_COEFF h1 = 1 / SQRT2;
 
-#if 1
-   /* Daubechies D2 wavelet coeffs */   
+
+/* 4-tap wavelet/scaling func coeffs; default is Daubechies D2 */
+#ifdef WAVELET_BIOR
+   /* Biorthogonal 3.1 */
+   const WAVELET_COEFF h2 [4] = {
+      -0.3535533905932738,   1.0606601717798214,
+      1.0606601717798214,   -0.3535533905932738
+   },
+   g2 [4] = {
+      -0.1767766952966369,   0.5303300858899107,
+      -0.5303300858899107,   0.1767766952966369
+   },
+   /* Inverse coeffs */
+   h2i [4] = { -g2 [0], g2 [1], -g2 [2], g2 [3] },
+   g2i [4] = { h2 [0], -h2 [1], h2 [2], -h2 [3] };
+#else
+   /* Daubechies D2 */
    const WAVELET_COEFF h2 [4] = {
       H2NORM * (1 + SQRT3), H2NORM * (3 + SQRT3),
       H2NORM * (3 - SQRT3), H2NORM * (1 - SQRT3)
-   };
-   const WAVELET_COEFF g2 [4] = {
+   },
+   g2 [4] = {
       H2NORM * (1 - SQRT3), -H2NORM * (3 - SQRT3),
       H2NORM * (3 + SQRT3), -H2NORM * (1 + SQRT3)
-   };
-   /* g2 [4] = { h2 [3], -h2 [2], h2 [1], -h2 [0] }; */
-#else
-   /* Daubechies D2 wavelet coeffs (REVERSED) */
-   const WAVELET_COEFF h2 [4] = {
-      H2NORM * (1 - SQRT3), H2NORM * (3 - SQRT3),
-      H2NORM * (3 + SQRT3), H2NORM * (1 + SQRT3)
-   };
-   const WAVELET_COEFF g2 [4] = {
-      H2NORM * (1 + SQRT3), -H2NORM * (3 + SQRT3),
-      H2NORM * (3 - SQRT3), -H2NORM * (1 - SQRT3)
-   };
-   /* g2 [4] = { h2 [3], -h2 [2], h2 [1], -h2 [0] }; */
+   },
+   h2i [4] = { h2 [3], h2 [2], h2 [1], h2 [0] },
+   g2i [4] = { g2 [3], g2 [2], g2 [1], g2 [0] };
 #endif
 
 
+
 WaveletMatrix2 allocWaveletMatrix2 (unsigned len)
 /* 
    Allocate and return a 2D coefficient array of size (len x len).  Returns
    NULL if allocation failed.  
 */
 {
    unsigned       i;
    WaveletMatrix2 y = NULL;
    
    if (len >= 2) {
       if (!(y = calloc(len, sizeof(WaveletCoeff3*))))
          return NULL;
       
       for (i = 0; i < len; i++)
          if (!(y [i] = calloc(len, WAVELET_COEFFSIZE)))
             return NULL;
    }
    
    return y;
 }
 
 
    
 void freeWaveletMatrix2 (WaveletMatrix2 y, unsigned len)
 /*
    Free previously allocated 2D coefficient array y of size (len x len)
 */  
 {
    unsigned i, j;
    
    if (y) {
       for (i = 0; i < len; i++)
          free(y [i]);
       
       free(y);
    }
 }
 
 
 
 void zeroCoeffs2 (WaveletMatrix2 y, unsigned len)
 /* Set 2D array coefficients to zero */
 {
    unsigned i;
    
    if (y)
       for (i = 0; i < len; i++)
          memset(y [i], 0, len * WAVELET_COEFFSIZE);
 }
 
 
 
 #ifdef WAVELET_DBG      
    void clearCoeffs2 (WaveletMatrix2 y, unsigned len)
    /* Clear 2D array to facilitate debugging */
    {
       unsigned i, j, k;
       
       if (y)
          for (i = 0; i < len; i++)
             for (j = 0; j < len; j++)
                for (k = 0; k < 3; k++)
                   y [i] [j] [k] = NAN;
    }
 
 
 
    static char *coeffStr (const WaveletCoeff3 coeff)
    /* Format coefficient in string for dump. Cleared coeffs are represented
     * as dots to facilitate debugging. */
    {
       static char str [9];
 
       if (coeffIsEmpty(coeff))
          /* Coeff is blank */
          return "  ..  ";
 
       snprintf(str, sizeof(str), "% 5.2f", coeffAvg(coeff));
       return str;
    }
    
    
 
    void dumpCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2,
-      unsigned y1Len, unsigned y2Len, float thresh 
+      unsigned y1Len, unsigned y2Len, float thresh, FILE *stream
    )
    /* 
       Multipurpose routine to dump 2D arrays y1 and y2 of dims y1Len resp
-      y2Len to stdout.  These are presented side-by-side, separated by an
-      arrow to indicate the input (y1) and output (y2) of a transform step.
+      y2Len.  These are presented side-by-side, separated by an arrow to
+      indicate the input (y1) and output (y2) of a transform step.
       
       If either y1 or y2 is NULL, it will be omitted from the output,
       enabling dumping a single array.
-      
+            
       If thresh > 0, coefficients with absolute magnitude below this value
       are marked with brackets ('[]') as thresholded.
+      
+      If stream is not NULL, the coefficients will be dumped to the
+      specified file, else to stdout.      
    */
    {
       unsigned       i, j, k;
+      FILE           *out = stream ? stream : stdout;
 
       for (i = 0; i < max(y1Len, y2Len); i++) {
          if (y1) {
             for (j = 0; j < y1Len; j++)
                if (i < y1Len)
-                  printf("%s\t", coeffThresh(y1, i, j, thresh)
+                  fprintf(out, "%s\t", 
+                     coeffThresh(y1, i, j, thresh)
                         ? "[ .. ]" : coeffStr(y1 [i][j])
                   );
-               else putchar('\t');
+               else fputc('\t', out);
          }
          
          if (y2) {
-            printf("  -->\t");
+            fprintf(out, "  -->\t");
             
             for (j = 0; j < y2Len; j++)
                if (i < y2Len)
-                  printf("%s\t", coeffThresh(y2, i, j, thresh)
+                  fprintf(out, "%s\t", 
+                     coeffThresh(y2, i, j, thresh)
                         ? "[ .. ]" : coeffStr(y2 [i][j])
                   );
-               else putchar('\t');
+               else fputc('\t', out);
          }
             
-         putchar('\n');
+         fputc('\n', out);
       }
    }
    
    
    
    float rmseCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2, 
       unsigned len
    ) 
    /* Calculate RMSE between 2D matrices y1 and y2 */
    {
       unsigned       i, j;
       float          d, rmse = 0;
 
       for (i = 0; i < len; i++)
          for (j = 0; j < len; j++) {
    #if 0
             d = (coeffAvg(y1 [i][j]) - coeffAvg(y2 [i][j])) /
                coeffAvg(y1 [i][j]);
    #else
             d = coeffAvg(y1 [i][j]) - coeffAvg(y2 [i][j]);
    #endif
             rmse += d * d;
          }
       
       return sqrt(rmse / ((float)len * len));
    }
 #endif
 
 
 
 static int d1Step2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l)
 /* 
    Single step of forward 2D Daubechies D1 (Haar) wavelet transform on array
    y of size (2^l) x (2^l) containing 3-tuples, where l >= 1.  The transform
    is performed over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).
    
    Note the triplets per array element are _not_ decorrelated, but
    transformed independently.
    
    The wavelet coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the generated
    coefficients vertically, and prepares the array for another transform
    over its 2nd axis in a subsequent call (this time as input y).
    
    The result of a subsequent transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    decorrelated.
    
    Returns 0 on success, else -1.
 */
 {
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
 #ifdef WAVELET_DBG
    static unsigned   axis = 0;
 #endif
    
    if (len < 2 || !y || !yt)
       /* Input shorter than wavelet support, or no input/output */
       return -1;
    
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif
 
    /* NOTE: yt is transposed on the fly such that the next function call
     * transforms over the alternate axis. This is done by simply swapping
     * the indices during assignment */
    for (i = 0; i < len; i++) {
       for (j = 0; j < len; j += 2) {
          h = j >> 1;
          for (k = 0; k < 3; k++) {
             /* Smooth/approx/avg/lowpass */
             yt [h       ] [i] [k] = h1 * (
                y [i] [j    ] [k] + 
                y [i] [j + 1] [k]
             );
 
             /* Detail/diff/highpass */
             yt [hlen + h] [i] [k] = h1 * (
                y [i] [j    ] [k] - 
                y [i] [j + 1] [k]
             );
          }
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s FWD D1 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
-   dumpCoeffs2(y, yt, len, len, 0);
+   dumpCoeffs2(y, yt, len, len, 0, NULL);
    putchar('\n');
    axis ^= 1;
 #endif
 
    return 0;
 }
 
 
 
 static int d1InvStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l)
 /* 
    Single step of inverse 2D Daubechies D1 (Haar) wavelet transform on
    coefficient array y of size (2^l) x (2^l) containing 3-tuples, where
    l>=1.  This reverses the forward transform above.  The transform is
    inverted over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the inverted
    coefficients vertically, and prepares the array for another inverse
    transform over its 2nd axis in a subsequent call (this time as input y).
 
    The result of a subsequent inverse transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    inversely transformed.
    
    Returns 0 on success, else -1.
 */
 {
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
 #ifdef WAVELET_DBG
    static unsigned   axis = 1;
 #endif
 
    if (len < 2 || !y || !yt)
       /* Too few coeffs for reconstruction, or no input/output */
       return -1;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif   
 
    /* NOTE: i, j are swapped relative to the forward transform, as axis
     * order is now reversed. */
    /* NOTE: yt is transposed on the fly such that the next function call
     * inverts over the alternate axis. This is done by simply swapping
     * the indices during assignment */    
    for (i = 0; i < len; i++) {
       for (j = 0; j < len; j += 2) {
          h = j >> 1;            
          for (k = 0; k < 3; k++) {
             yt [i] [j    ] [k] = h1 * (
                y [h       ] [i] [k] +   /* Avg */
                y [hlen + h] [i] [k]     /* Diff */
             );
                
             yt [i] [j + 1] [k] = h1 * (
                y [h       ] [i] [k] -   /* Avg */
                y [hlen + h] [i] [k]     /* Diff */
             );
          }
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s INV D1 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
-   dumpCoeffs2(y, yt, len, len, 0);
+   dumpCoeffs2(y, yt, len, len, 0, NULL);
    putchar('\n');
    axis ^= 1;
 #endif         
 
    return 0;
 }
 
 
 
 static int d4Step2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l)
 /* 
    Single step of forward 2D Daubechies D2 wavelet transform on array y of
    size (2^l) x (2^l) containing 3-tuples, where l >= 2.  The transform is
    performed over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).
    
    Note the triplets per array element are _not_ decorrelated, but
    transformed independently.
    
    The wavelet coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the generated
    coefficients vertically, and prepares the array for another transform
    over its 2nd axis in a subsequent call (this time as input y).
    
    The result of a subsequent transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    decorrelated.
    
    Returns 0 on success, else -1.
 */
 { 
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
 #ifdef WAVELET_DBG
    static unsigned   axis = 0;
 #endif
    
    if (len < 4 || !y || !yt)
       /* Input shorter than wavelet support, or no input/output */
       return -1;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif
 
    /* NOTE: yt is transposed on the fly such that the next function call
     * transforms over the alternate axis. This is done by simply swapping
     * the indices during assignment */
    for (i = 0; i < len; i++) {
       /* Transform until upper boundary */
       for (j = 0; j < len - 2; j += 2) {
          h = j >> 1;
          for (k = 0; k < 3; k++) {
             /* Smooth/approx/avg/lowpass */
             yt [h       ] [i] [k] =
                h2 [0] * y [i] [j    ] [k] + 
                h2 [1] * y [i] [j + 1] [k] +
                h2 [2] * y [i] [j + 2] [k] +
                h2 [3] * y [i] [j + 3] [k];
 
             /* Detail/diff/highpass */
             yt [hlen + h] [i] [k] =
                g2 [0] * y [i] [j    ] [k] +
                g2 [1] * y [i] [j + 1] [k] +
                g2 [2] * y [i] [j + 2] [k] +
                g2 [3] * y [i] [j + 3] [k];
          }
       }
 
       /* Transform at upper boundary with wraparound.
          Note j is set to last index from previous loop */
       h = j >> 1;
       for (k = 0; k < 3; k++) {
          /* Smooth/approx/avg/lowpass */
          yt [h       ] [i] [k] =
             h2 [0] * y [i] [j    ] [k] +
             h2 [1] * y [i] [j + 1] [k] +
             h2 [2] * y [i] [0    ] [k] +
             h2 [3] * y [i] [1    ] [k];
 
          /* Detail/diff/highpass */
          yt [hlen + h] [i] [k] =
             g2 [0] * y [i] [j    ] [k] +
             g2 [1] * y [i] [j + 1] [k] +
             g2 [2] * y [i] [0    ] [k] +
             g2 [3] * y [i] [1    ] [k];
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s FWD D2 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
-   dumpCoeffs2(y, yt, len, len, 0);
+   dumpCoeffs2(y, yt, len, len, 0, NULL);
    putchar('\n');
    axis ^= 1;
 #endif
 
    return 0;
 }
 
 
 
 
 
 
 static int d4InvStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l)
 /*
    Single step of inverse 2D Daubechies D2 wavelet transform on coefficient
    array y of size (2^l) x (2^l) containing 3-tuples, where l >= 2.  This
    reverses the forward transform above.  The transform is inverted over y's
    2nd axis (i.e.  horizontally, assuming row-major addressing as per C
    convention).
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the inverted
    coefficients vertically, and prepares the array for another inverse
    transform over its 2nd axis in a subsequent call (this time as input y).
 
    The result of a subsequent inverse transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    inversely transformed.
    
    Returns 0 on success, else -1.
 */
 {
    const unsigned    len = 1 << l, hlen = 1 << (l - 1);
    unsigned          h, i, j, k;
 #ifdef WAVELET_DBG
    static unsigned   axis = 1;
 #endif
    
    if (len < 4 || !y || !yt)
       /* Too few coeffs for reconstruction, or no input/output */
       return -1;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif
 
    /* NOTE: i, j are swapped relative to the forward transform, as axis
     * order is now reversed. */
    /* NOTE: yt is transposed on the fly such that the next function call
     * inverts over the alternate axis. This is done by simply swapping
     * the indices during assignment */
    for (i = 0; i < len; i++) {
       /* Invert at lower boundary with wraparound */
       for (k = 0; k < 3; k++) {
          yt [i] [0] [k] =
-            h2 [2] * y [hlen - 1] [i] [k] + /* Last avg */
-            g2 [2] * y [len  - 1] [i] [k] + /* Last diff */
-            h2 [0] * y [0       ] [i] [k] + /* First avg */
-            g2 [0] * y [hlen    ] [i] [k];  /* First diff */
+            h2i [1] * y [hlen - 1] [i] [k] + /* Last avg */
+            g2i [1] * y [len  - 1] [i] [k] + /* Last diff */
+            h2i [3] * y [0       ] [i] [k] + /* First avg */
+            g2i [3] * y [hlen    ] [i] [k];  /* First diff */
          
          yt [i] [1] [k] =
-            h2 [3] * y [hlen - 1] [i] [k] +
-            g2 [3] * y [len  - 1] [i] [k] +
-            h2 [1] * y [0       ] [i] [k] +
-            g2 [1] * y [hlen    ] [i] [k];
+            h2i [0] * y [hlen - 1] [i] [k] +
+            g2i [0] * y [len  - 1] [i] [k] +
+            h2i [2] * y [0       ] [i] [k] +
+            g2i [2] * y [hlen    ] [i] [k];
       }
 
       /* Invert until upper boundary */
       for (j = 2; j < len; j += 2) {
          h = (j >> 1) - 1;
          for (k = 0; k < 3; k++) {
             yt [i] [j    ] [k] =
-               h2 [2] * y [h           ] [i] [k] + /* Avg */
-               g2 [2] * y [hlen + h    ] [i] [k] + /* Diff */
-               h2 [0] * y [h + 1       ] [i] [k] + /* Next avg */
-               g2 [0] * y [hlen + h + 1] [i] [k];  /* Next diff */
+               h2i [1] * y [h           ] [i] [k] + /* Avg */
+               g2i [1] * y [hlen + h    ] [i] [k] + /* Diff */
+               h2i [3] * y [h + 1       ] [i] [k] + /* Next avg */
+               g2i [3] * y [hlen + h + 1] [i] [k];  /* Next diff */
                
             yt [i] [j + 1] [k] =
-               h2 [3] * y [h           ] [i] [k] +
-               g2 [3] * y [hlen + h    ] [i] [k] +
-               h2 [1] * y [h + 1       ] [i] [k] +
-               g2 [1] * y [hlen + h + 1] [i] [k];
+               h2i [0] * y [h           ] [i] [k] +
+               g2i [0] * y [hlen + h    ] [i] [k] +
+               h2i [2] * y [h + 1       ] [i] [k] +
+               g2i [2] * y [hlen + h + 1] [i] [k];
          }
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s INV D2 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
-   dumpCoeffs2(y, yt, len, len, 0);
+   dumpCoeffs2(y, yt, len, len, 0, NULL);
    putchar('\n');
    axis ^= 1;
 #endif         
 
    return 0;
 }
 
 
 
 int waveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l)
 /*
    Perform full 2D multiresolution forward wavelet transform on array y of
    size (2^l) x (2^l) containing original signal as 3-tuples, where l >= 1. 
    Note no intra-tuple transform occurs.
    
    The wavelet coefficients are returned in array y, containing the coarsest
    approximation in y [0][0] followed by horizontal/vertical details in
    order of increasing resolution/frequency.
    
    A preallocated array yt of identical dimensions to y can be supplied as
    buffer for intermediate results.  If yt == NULL, a buffer is
    automatically allocated and freed on demand, but this is inefficient for
    frequent calls.  It is recommended to preallocate yt to the maximum
    expected size.  The dimensions of yt are not checked; this is the
    caller's responsibility.
       
    Returns 0 on success, else -1.
 */
 {
    const unsigned len = 1 << l;
    unsigned       li;
    WaveletMatrix2 ytloc = NULL;
 
    /* Skip transform if input too short or missing */
    if (l < 1 || !y)
       return -1;
       
    if (!yt)
       /* No buffer supplied; allocate one on demand */
       if (!(yt = ytloc = allocWaveletMatrix2(len))) {
          fprintf(stderr, "ERROR - Failed allocating %dx%d buffer array"
             " in WaveletXform2()", len, len);
          return -1;
       }
 
    for (li = l; li > 1; li--) {
       /* Apply horizontal & vertical Daubechies D2 transform, swapping input
          and transposed output array */
       if (d4Step2(y, yt, li) || d4Step2(yt, y, li))
          return -1;
    }
 #ifdef WAVELET_FINAL_HAAR
    /* Apply horizontal & vertical Daubechies D1 (Haar) transform at coarsest
       resolution (li==1) to obtain single approximation coefficient at 
       y [0][0]; all other coeffs are details.  */
    if (d1Step2(y, yt, li) || d1Step2(yt, y, li))
       return -1;
 #endif
    /* NOTE: All coefficients now in y */
    
    if (ytloc)
       /* Free yt if allocated on demand */
       freeWaveletMatrix2(ytloc, len);
    
    return 0;
 }
 
 
 
 int waveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l)
 /* 
    Perform full 2D multiresolution inverse wavelet transform on array y of
    size (2^l) x (2^l) containing wavelet coefficients as 3-tuples, where
    l >= 1.  Note no intra-tuple transform occurs.
 
    A preallocated array yt of identical dimensions to y can be supplied as
    buffer for intermediate results.  If yt == NULL, a buffer is
    automatically allocated and freed on demand, but this is inefficient for
    frequent calls.  It is recommended to preallocate yt to the maximum
    expected size.  The dimensions of yt are not checked; this is the
    caller's responsibility.
    
    The reconstructed signal is returned in array y. 
    Returns 0 on success, else -1.
 */
 {
    const unsigned len = 1 << l;
    unsigned       li;
    WaveletMatrix2 ytloc = NULL;
 
    /* Skip inverse transform if input too short or missing */
    if (l < 1 || !y)
       return -1;
 
    if (!yt)
       /* No buffer supplied; allocate one on demand */
       if (!(yt = ytloc = allocWaveletMatrix2(len))) {
          fprintf(stderr, "ERROR - Failed allocating %dx%d buffer array"
             " in WaveletInvXform2()", len, len);
          return -1;
       }
 #ifdef WAVELET_FINAL_HAAR
    /* Invert horizontal & vertical Haar transform at coarsest level (li==1),
       swapping input and transposed output array */
    if (d1InvStep2(y, yt, 1) || d1InvStep2(yt, y, 1))
       return -1;
 #endif
    for (li = 2; li <= l; li++) {
       /* Invert horizontal & vertical Daubechies D2 transform, swapping 
          input and transposed output arrays */
       if (d4InvStep2(y, yt, li) || d4InvStep2(yt, y, li))
          return -1;
    }
    
    /* NOTE: Reconstructed signal now in y */
 
    if (ytloc)
       /* Free yt if allocated on demand */
       freeWaveletMatrix2(ytloc, len);
    
    return 0;
 }
 
 
 
 #ifdef WAVELET_TEST_2D
-   #include <stdio.h>
-   
    #ifdef WAVELET_TEST_mRGBE
       #include "mrgbe.h"
    #endif
+
+
+   #ifndef WAVELET_TEST_INIT
+      /* Default test data initialisation */
+      #define WAVELET_TEST_INIT 0
+   #endif
    
-   #define WAVELET_TEST_INIT 0
-   
+   #if (WAVELET_TEST_INIT < 0 || WAVELET_TEST_INIT > 5)
+      #error Invalid wavelet unit test data initialiation option
+   #endif
    
+      
+      
    int main (int argc, char *argv [])
    {
       int            i, j, k, l;
       unsigned       len, numThresh = 0;
       WaveletMatrix2 y0 = NULL, y = NULL;
       FILE           *dataFile = NULL;
       WAVELET_COEFF  inData, thresh = 0;
    #ifdef WAVELET_TEST_mRGBE
-      #define        HUGE 1e10
+      #define        TINY  1e-6
+      #define        HUGE  1e10
       mRGBE          mrgbeCoeff;
       mRGBERange     mrgbeRange;
       WaveletMatrix2 ymrgbe = NULL;
    #endif
       
       if (argc < 2) {
          fprintf(stderr, "%s <l> [threshold] [dataFile]\n", argv [0]);
          fputs("Missing array resolution l > 1, "
             "compression threshold >= 0\n", stderr
          );
          return -1;
       }
       if (!(l = atoi(argv [1])) || l < 1) {
          fputs("Invalid array resolution l\n", stderr);
          return -1;
       }
       else len = 1 << l;
       if (argc > 2 && (thresh = atof(argv [2])) < 0) {
          fprintf(stderr, "Invalid threshold %.3f\n", thresh);
          return -1;
       }
       
       /* Allocate arrays for original and reconstruction */
       if (!(y0 = allocWaveletMatrix2(len)) || 
          !(y = allocWaveletMatrix2(len))
    #ifdef WAVELET_TEST_mRGBE
          || !(ymrgbe = allocWaveletMatrix2(len))
    #endif
       ) {
          fprintf(stderr, "Failed allocating %dx%d array\n", len, len);
          return -1;
       }
 
       if (argc > 3) {
          /* Load data from file; length must not exceed allocated */
          if (!(dataFile = fopen(argv [3], "r"))) {
             fprintf(stderr, "Failed opening data file %s\n", argv [3]);
             return -1;
          }
       
          for (i = 0; i < len; i++) {
             for (j = 0; j < len; j++) {
                if (feof(dataFile)) {
                   fprintf(stderr, 
                      "Premature end of file reading data from %s\n",
                      argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
             
                /* Read next float, skipping any leading whitespace */
                if (fscanf(dataFile, " %lf", &inData)) {
                   y0 [i][j][0] = y0 [i][j][1] = y0 [i][j][2] = 
                      y [i][j][0] = y [i][j][1] = y [i][j][2] = inData;
                }
                else {
                   fprintf(stderr, 
                      "Error reading from data file %s\n", argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
             }
          }
          
          fclose(dataFile);
       }
       else {
          /* Init input */
          srand48(111);
          for (i = 0; i < len; i++) {
             for (j = 0; j < len; j++) {
                for (k = 0; k < 3; k++) {
                   y0 [i][j][k] = y [i][j][k] =                  
    #if WAVELET_TEST_INIT == 0
                      /* Random data, channel-independent */
                      drand48();
    #elif WAVELET_TEST_INIT == 1
                      /* Random data, indentical for all channels */
                      k ? y [i][j][k - 1] : drand48();
    #elif WAVELET_TEST_INIT == 2
                      /* Monotonically increasing along axis 0 */
                      i;
    #elif WAVELET_TEST_INIT == 3
                      /* Monotonically increasing along axis 1 */
                      j;
    #elif WAVELET_TEST_INIT == 4
                      /* Monotonically increasing along both axes */
                      i * j;
    #elif WAVELET_TEST_INIT == 5
                      /* Monotonically increasing by linear index */
                      i * len + j;         
    #endif
                }
             }
          }
       }
 
    #ifdef WAVELET_DBG
       puts("----------------------- FWD XFORM -------------------------\n");
    #endif
       
       /* Forward xform */
       if (waveletXform2(y, NULL, l)) {
          fputs("Forward xform failed\n", stderr);
          return -1;
       }
 
       /* Threshold coefficients; we use hard thresholding as it's easier to
        * implement than soft thresholding, which requires sorting the
        * coefficients.  NOTE: y [0][0] is omitted it contains the coarsest
        * approximation coefficient, which is essential for the
        * reconstruction!  */
       for (i = 0; i < len; i++)
          for (j = 0; j < len; j++) {
             if (coeffThresh(y, i, j, thresh)) {
                y [i][j][0] = y [i][j][1] = y [i][j][2] = 0;
                numThresh++;
    #if 0
                /* Replace thresholded values with random noise in range
                   [-threshold, threshold] */
                y [i][j][0] = y [i][j][1] = y [i][j][2] = thresh * (
                   2 * drand48() - 1
                );
    #endif
             }
          }
 
    #ifdef WAVELET_DBG
       /* Dump coefficients */
       puts("----------------------- COEFFICIENTS -------------------------\n");
-      dumpCoeffs2(y, NULL, len, 0, thresh);
+      dumpCoeffs2(y, NULL, len, 0, thresh, NULL);
       
       if (numThresh)
          printf("\n%d/%d coefficients thresholded = %.1f%% compression", 
             numThresh, len * len, 100. * numThresh / (len * len)
          );      
    #endif
    
    #ifdef WAVELET_TEST_mRGBE
       /* Test mRGBE coefficient encoding */
       mrgbeRange.min [0] = mrgbeRange.min [1] = mrgbeRange.min [2] = HUGE;
       mrgbeRange.max [0] = mrgbeRange.max [1] = mrgbeRange.max [2] = 0;
       
       /* Find min/max coeff and init mRGBE range */
       for (i = 0; i < len; i++)
          for (j = 0; j < len; j++)
             for (k = 0; k < 3; k++) {
                inData = fabs(y [i][j][k]);
                if (inData < mrgbeRange.min [k])
                   mrgbeRange.min [k] = inData;
                if (inData > mrgbeRange.max [k])
                   mrgbeRange.max [k] = inData;
             };
       mRGBEinit(&mrgbeRange, mrgbeRange.min, mrgbeRange.max);
       
       /* Encode coeffs to mRGBE and back, but preserve approximation
        * coefficient at y [0][0] */
       for (i = 0; i < len; i++)
          for (j = 0; j < len; j++)
             if (i | j) {
                mrgbeCoeff = mRGBEencode(y [i][j], &mrgbeRange, 0);
                mRGBEdecode(mrgbeCoeff, &mrgbeRange, ymrgbe [i][j]);
             }
             else for (k = 0; k < 3; k++)
                ymrgbe [i][j][k] = y [i][j][k];
          
       #ifdef WAVELET_DBG
          /* Dump mRGBE-decoded coefficients */
          puts("\n\n-------------------- mRGBE COEFFICIENTS ----------------------\n");           
-         dumpCoeffs2(y, ymrgbe, len, len, 0);
+         dumpCoeffs2(y, ymrgbe, len, len, 0, NULL);
       #endif
    #endif
 
    #ifdef WAVELET_DBG
       puts("\n----------------------- INV XFORM -------------------------\n");
    #endif
    
       /* Inverse xform (also using mRGBE coeffs if enabled) */
       if (waveletInvXform2(y, NULL, l)
    #ifdef WAVELET_TEST_mRGBE
          || waveletInvXform2(ymrgbe, NULL, l)
    #endif
       ) {
          fputs("\nInverse xform failed\n", stderr);
          return -1;
       }
    
    #ifdef WAVELET_DBG
       puts("\n--------------------- ORIG vs. INV XFORM ------------------------\n");
-      dumpCoeffs2(y0, y, len, len, 0);
+      dumpCoeffs2(y0, y, len, len, 0, NULL);
    #endif
       printf("\nAvg RMSE = %.2f\n", rmseCoeffs2(y0, y, len));
       
    #ifdef WAVELET_TEST_mRGBE
       #ifdef WAVELET_DBG
          puts("\n------------------ ORIG vs. INV XFORM + mRGBE -------------------\n");
-         dumpCoeffs2(y0, ymrgbe, len, len, 0);
+         dumpCoeffs2(y0, ymrgbe, len, len, 0, NULL);
       #endif
       printf("\nAvg RMSE with mRGBE enc = %.2f\n", 
          rmseCoeffs2(y0, ymrgbe, len)
       );
    #endif
       
       freeWaveletMatrix2(y0, len);
       freeWaveletMatrix2(y, len);
    #ifdef WAVELET_TEST_mRGBE
       freeWaveletMatrix2(ymrgbe, len);
    #endif
       return 0;
    }   
 #endif /* WAVELET_TEST_2D */
diff --git a/wavelet2.h b/wavelet2.h
index 6b2feb5..32137f5 100644
--- a/wavelet2.h
+++ b/wavelet2.h
@@ -1,261 +1,290 @@
 /* 
    =========================================================================
    Definitions for 2D wavelet transforms on arrays of 3-tuples.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #ifndef _WAVELET2_H
    #define _WAVELET2_H
+   
+   #include <stdio.h>
+   
 
    /* Wavelet coefficient type; defaults to double if not already defined.
      (may be overridden in compiler command line, for example).
       NOTE: single precision float may improve cache performance during 
       2D transform with large arrays. */
    #ifndef WAVELET_COEFF
       #define WAVELET_COEFF   double
    #endif
 
+
+   /* Compile 2D (padded) wavelet unit tests */
+   /* #define WAVELET_TEST_2D
+   /* #define WAVELET_TEST_2DPAD */
+   
+   /* Data initialisation option for wavelet unit test */
+   #define WAVELET_TEST_INIT        1
+   
+   /* Logarithmic input encoding for wavelet unit test (undef for linear) */
+   #define WAVELET_TEST_LOG
+
+
    /* Perform final Daubechies D1 (Haar) wavelet transform on coarsest two
     * detail levels.  This has the advantage of resulting in a single
     * coarsest level approximation coefficient, possibly at the expense of
     * poorer compression */
    /* #define WAVELET_FINAL_HAAR */
+   
+   /* Minimum input length for full wavelet transform; a value of 3
+      performs all steps up to the coarsest level.  Doubling this to 6
+      significantly increases the number of approximation coefficients (see
+      WAVELET_PADD2_APPROXDIM below), but may reduce artefacts at high
+      compression rates */
+   #define WAVELET_PADD2_MINLEN     3
+   /* #define WAVELET_PADD2_MINLEN  6 */
+
+   /* Number of approximation coefficients per dimension in upper left of
+    * the wavelet matrix after a full 4-tap transform with boundary padding */
+   #define WAVELET_PADD2_APPROXDIM     WAVELET_PADD2_MINLEN
+   #define WAVELET_PADD2_NUMAPPROX     ( \
+      (WAVELET_PADD2_APPROXDIM) * (WAVELET_PADD2_APPROXDIM) \
+   )
+   /* Number of 4-tap detail coefficients resulting from the above
+    * based on the number of total number of coefficients */
+   #define WAVELET_PADD2_NUMDETAIL(n)  ((n) - WAVELET_PADD2_NUMAPPROX)
+
 
-   /* Boundary extension modes for padded Daubechies D2 wavelet transform */
+   /* Boundary extension modes for padded 4-tap wavelet transform */
+   /* !!! WARNING: GRAD1, GRAD2 assume a LINEAR input signal and can cause
+    * !!! severe artefacts if the input is logarithmic! */
    #define WAVELET_EXTEND_CONST     0     /* Constant (default if undef) */
-   #define WAVELET_EXTEND_GRAD1     1     /* 1st order gradient */
-   #define WAVELET_EXTEND_GRAD2     2     /* 2nd order gradient */
+   #define WAVELET_EXTEND_GRAD1     1     /* 1st order gradient (linear!) */
+   #define WAVELET_EXTEND_GRAD2     2     /* 2nd order gradient (linear!) */
    #define WAVELET_EXTEND_CYCL      3     /* Periodic (wraparound) */
    #define WAVELET_EXTEND_SYMM      4     /* Symmetric (reflection) */
    #define WAVELET_EXTEND_ZERO      5     /* Constant zero */
-   #define WAVELET_EXTEND_INV       6     /* Negative symmetric/refl. */
+   #define WAVELET_EXTEND_INV       6     /* Inverted (antisymmetric) */
 
    #ifndef WAVELET_EXTEND
-      #define WAVELET_EXTEND_MODE   WAVELET_EXTEND_GRAD1
+      #define WAVELET_EXTEND_MODE   WAVELET_EXTEND_CONST
    #endif
 
    
    /* Convenience macros for handling coefficients */
    #define min(a, b)          ((a) < (b) ? (a) : (b))
    #define max(a, b)          ((a) > (b) ? (a) : (b))
    #define coeffAvg(a)        (((a) [0] + (a) [1] + (a) [2]) / 3)
    
    #define coeffIsEmpty(c) ( \
       isnan((c) [0]) || isnan((c) [1]) || isnan((c) [2]) \
    )
    #define coeffIsZero(c)  ((c) [0] == 0 && (c) [1] == 0 && (c) [2] == 0)
 
-   /* Number of approximation coefficients per dimension in upper left of
-    * the wavelet matrix after a full Daubechies D2 transform with boundary
-    * padding */
-   #define WAVELET_PADD2_APPROXDIM     3
-   #define WAVELET_PADD2_NUMAPPROX     ( \
-      (WAVELET_PADD2_APPROXDIM) * (WAVELET_PADD2_APPROXDIM) \
-   )
-   /* Number of Daubechies D2 detail coefficients resulting from the above
-    * based on the number of total number of coefficients */
-   #define WAVELET_PADD2_NUMDETAIL(n)  ((n) - WAVELET_PADD2_NUMAPPROX)
    
    /* Determine if coeff tuple c is thresholded, also including empty coeffs
     * in debugging mode, since these will be unconditionally dropped. Note
     * that the 3x3 coarsest approximation coefficients in the upper left of
     * matrix c are not thresholded, since they are essential for a
     * reasonably accurate reconstruction! */
    /* #ifdef WAVELET_FINAL_HAAR */
    #if 0
       #define coeffThresh(c, i, j, t)  ( \
          fabs(t) > 0 && ((i) || (j)) && ( \
             coeffIsEmpty((ca) [i][j]) || fabs((c) [i][j][0]) < (t) || \
             fabs((c) [i][j][1]) < (t) || fabs((c) [i][j][2]) < (t) \
          ) \
       )
    #else
       #define coeffThresh(c, i, j, t)  ( \
          fabs(t) > 0 && ( \
             (i) >= WAVELET_PADD2_APPROXDIM || \
             (j) >= WAVELET_PADD2_APPROXDIM \
          ) && (\
             coeffIsEmpty((c) [i][j]) || fabs((c) [i][j][0]) < (t) || \
             fabs((c) [i][j][1]) < (t) || fabs((c) [i][j][2]) < (t) \
          ) \
       )
    #endif
    
-
-
-   /* Daubechies D1 (Haar) / Daubechies D2 coefficients */
+   
+   /* 4-tap wavelet function: use biorthogonal instead of Daubechies */
+   /* #define WAVELET_BIOR */
+   
+   /* 2-tap / 4-tap coefficients */
    #define SQRT2        1.41421356
    #define SQRT3        1.73205081
    #define H2NORM       (0.25 / SQRT2)
 
-   /* Daubechies D1 (Haar) wavelet coeffs */
+
+
+   /* 2-tap wavelet coeffs */
    extern const WAVELET_COEFF h1;
 
-   /* Daubechies D2 wavelet coeffs */   
-   extern const WAVELET_COEFF h2 [4];
-   extern const WAVELET_COEFF g2 [4];
+   /* 4-tap wavelet coeffs */
+   extern const WAVELET_COEFF h2 [], g2 [], h2i [], g2i [];
 
 
 
    /* Wavelet matrix defs; these are stored as arrays of pointers (a.k.a.
     * Iliffe vectors), as this proved to be more performant than a flattened
     * representation. Encapsulating the coefficient's triplets in a struct
     * prevents the compiler from introducing another layer of indirection.
     * */
    typedef WAVELET_COEFF      WaveletCoeff3 [3];
    
    #define WAVELET_COEFFSIZE  (sizeof(WaveletCoeff3))
    typedef WaveletCoeff3      **WaveletMatrix2;
 
    
    
    /* 
       ----------- 2D COEFFICIENT MATRIX ALLOC/INIT ROUTINES -----------
    */
 
    WaveletMatrix2 allocWaveletMatrix2 (unsigned len);
    /* Allocate and return a 2D coefficient array of size (len x len).
       Returns NULL if allocation failed. */
 
    
    void freeWaveletMatrix2 (WaveletMatrix2 y, unsigned len);
    /* Free previously allocated 2D coeff array y of size (len x len) */
 
 
    void zeroCoeffs2 (WaveletMatrix2 y, unsigned len);
    /* Set 2D array coefficients to zero */
 
 
    /* 
       ---------- 2D WAVELET TRANSFORM OPTIMISED FOR SIZES 2^l ----------
       ------ THESE USE PERIODIC (WRAP-AROUND) BOUNDARY EXTENSION -------
    */
    
    int waveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l);
    /*
       Perform full 2D multiresolution forward wavelet transform on array y
       of size (2^l) x (2^l) containing original signal as 3-tuples, where 
       l >= 1.  Note no intra-tuple transform occurs.
       
       The wavelet coefficients are returned in array y, containing the
       coarsest approximation in y [0][0] followed by horizontal/vertical
       details in order of increasing resolution/frequency.
       
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  It is recommended to preallocate yt to the
       maximum expected size.  The dimensions of yt are not checked; this is
       the caller's responsibility.
          
       Returns 0 on success, else -1.
    */
    
 
    int waveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l);
    /* 
       Perform full 2D multiresolution inverse wavelet transform on array y
       of size (2^l) x (2^l) containing wavelet coefficients as 3-tuples,
       where l >= 1.  Note no intra-tuple transform occurs.
 
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  It is recommended to preallocate yt to the
       maximum expected size.  The dimensions of yt are not checked; this is
       the caller's responsibility.
       
       The reconstructed signal is returned in array y.  Returns 0 on
       success, else -1.
    */
 
 
 
    /* 
       ------------ 2D WAVELET TRANSFORM FOR ARBITRARY SIZES -------------
       ---- THESE USE CONSTANT BOUNDARY EXTENSION WITH PADDING COEFFS-----
    */
    
    unsigned padWaveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
       unsigned yLen0, unsigned *numNonZero
    );
    /* Perform full 2D multiresolution forward wavelet transform with padded
       boundary coefficients on array y containing (yLen0 x yLen0) samples of
       3-tuples.  Note no intra-tuple transform occurs.  An additional array
       yt of identical dimension to y is required as buffer for intermediate
       results.
       
       The wavelet coefficients are returned in array y, containing the
       coarsest approximation coefficients in y [0][0] followed by
       horizontal/vertical detail coefficients in order of increasing
       resolution/frequency.
 
       NOTE: Both y and yt must be large enough to accommodate the additional
       padding coefficients introduced at the array boundaries during the
       transform.  It is the caller's responsibility to preallocate and
       dimension these arrays accordingly with allocWaveletMatrix2().  The
       dimension of the output array (which generally exceeds the input
       dimension yLen0) must be interrogated beforehand by calling the
       function with y or yt set to NULL.
       
       The returned value is the output dimension, or 0 if the transform
       failed.  In addition, the number of nonzero coefficients is returned
       in numNonZero (if not NULL), which is a subset of the number of output
       coefficients, i.e.  (output dimension)^2.
    */
    
    
    unsigned padWaveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt,
       unsigned yLen0, unsigned yLenOrg
    );
    /* Perform full 2D multiresolution inverse wavelet transform on array y
       containing (yLen0 x yLen0) boundary padded wavelet coefficients as
       3-tuples.  The original number of samples (yLenOrg x yLenOrg) is
       required for the reconstruction.  An additional array yt of identical
       dimension to y is required as buffer for intermediate results.
     
       The reconstructed samples are returned in y.  This is smaller than
       (yLen0 x yLen0) since the boundary coefficients from the forward
       transform are collapsed.
     
       The returned value is the reconstructed number of samples per
       dimension (yLenOrg), or 0 if the inverse transform failed.
     */ 
    
   
    
    #ifdef WAVELET_DBG
       /* 
          ----------------- SUPPORT ROUTINES FOR DEBUGGING -----------------
       */
       
       void clearCoeffs2 (WaveletMatrix2 y, unsigned len);
       /* Clear 2D array to facilitate debugging */
       
       
       void dumpCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2,
-         unsigned y1Len, unsigned y2Len, float thresh 
+         unsigned y1Len, unsigned y2Len, float thresh, FILE *stream 
       );
       /* 
          Dump 2D arrays y1 and y2 of dims y1Len resp.  y2Len side-by-side to
          stdout (skip y2 if NULL).  Coefficients with absolute magnitude
          less than thresh are marked with brackets ('[]') as thresholded.
       */
       
       
       float rmseCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2, 
          unsigned len
       );
       /* Calculate RMSE between 2D matrices y1 and y2 */
    #endif
 #endif
 
diff --git a/wavelet3.c b/wavelet3.c
index 3510671..aa6916e 100644
--- a/wavelet3.c
+++ b/wavelet3.c
@@ -1,1170 +1,1204 @@
 /* 
    =========================================================================
    2-dimensional Daubechies D2 and D1 (Haar) wavelet transforms on arbitrary
    sized arrays of 3-tuples.  Array boundaries are extended as defined by
    WAVELET_EXTEND_MODE, which introduces padding coefficients.
 
    Compile with -DWAVELET_TEST_2DPAD to build standalone unit tests.
    
    The implementation here is mainly distilled from the following sources:
    -- Filip Wasilewski's pywavelets:
       https://github.com/PyWavelets/pywt/tree/master/pywt/_extensions/c
       [D2 with boundary condition handling via padding, arbitrary sizes]
    -- Ingrid Daubechies and Wim Sweldens, "Factoring Wavelet Tranforms into 
       Lifting Steps", section 7.5:
       https://services.math.duke.edu/~ingrid/publications/J_Four_Anal_Appl_4_p247.pdf
       [Lifting scheme on D2]
    -- Ian Kaplan's webpages on wavelets and signal processing::
       http://bearcave.com/misl/misl_tech/wavelets/daubechies/index.html
       http://bearcave.com/misl/misl_tech/wavelets/lifting/basiclift.html
       [Lifting scheme on D2]
    -- Stefan Moebius' TurboWavelets.Net:
       https://github.com/codeprof/TurboWavelets.Net/tree/master/TurboWavelets
       [Boundary condition handling with arbitrary sizes, backreferencing]
    -- "Numerical Recipes in C", chapter 13.10:
       http://cacs.usc.edu/education/cs653/NRC-c13.10-Wavelets.pdf
       [Linear algebra (matrix mult.) implementation of D2]
    
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 #include "wavelet2.h"
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
-#include <stdio.h>
 
 
 
 static unsigned padD1Step2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned len
 )
 /* 
    Single iteration of forward 2D Daubechies D1 (Haar) wavelet transform
    with padding on array y of arbitrary size (len x len) containing
    3-tuples.  This variant uses constant boundary extension.  The transform
    is performed over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).  Note the triplets per array element are
    _not_ decorrelated, but transformed independently.
    
    The wavelet coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the generated
    coefficients vertically, and prepares the array for another transform
    over its 2nd axis in a subsequent call (this time as input y).  The
    result of a subsequent transform restores yt to y's original orientation,
    in which case both horizontal and vertical axes have been decorrelated.
    
    Returns array dimension len on success, else 0.
 */
 {
    static unsigned   axis = 0;
    const unsigned    hLen = -(-(signed)len >> 1);
    int               h, i, j, k;
 
    if (len < 2 || !y || !yt)
       /* Input shorter than wavelet support, or no input/output */
       return 0;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif      
 
    /* NOTE: yt is transposed on the fly such that the next function call
     * transforms over the alternate axis. This is done by simply swapping
     * the indices during assignment */    
    for (i = 0; i < len; i++) {
       for (j = 0; j <= len - 2; j += 2) {
          h = j >> 1;
          
          for (k = 0; k < 3; k++) {
             /* Smooth/approx/avg/lowpass */
             yt [h       ] [i] [k] = h1 * (
                y [i] [j    ] [k] + 
                y [i] [j + 1] [k]
             );
             
             /* Detail/diff/highpass */
             yt [h + hLen] [i] [k] = h1 * (
                y [i] [j    ] [k] - 
                y [i] [j + 1] [k]
             );
          }
       }
       
       if (len & 1) {
          /* Odd length; pad with last value as additional approx term */
          for (k = 0; k < 3; k++)
             /* Smooth/approx/avg/lowpass */
             yt [hLen - 1] [i] [k] = y [i] [len - 1] [k];
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s FWD D1 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
-   dumpCoeffs2(y, yt, len, len, 0);
+   dumpCoeffs2(y, yt, len, len, 0, NULL);
    putchar('\n');
 #endif
 
    axis ^= 1;
    return len;
 }
 
 
 
 static unsigned padD1InvStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned len
 )
 /* 
    Single iteration of inverse 2D Daubechies D1 (Haar) wavelet transform
    with padding on array y of arbitrary size (len x len) containing
    3-tuples.  This reverses the forward transform above.  The transform is
    inverted over y's 2nd axis (i.e.  horizontally, assuming row-major
    addressing as per C convention).
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the inverted
    coefficients vertically, and prepares the array for another inverse
    transform over its 2nd axis in a subsequent call (this time as input y). 
    The result of a subsequent inverse transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    inversely transformed.
    
    Returns array dimension len on success, else 0.
 */
 {
    static unsigned   axis = 1;
    const unsigned    hLen = -(-(signed)len >> 1);
    int               h, i, j, k;
 
    if (len < 2 || !y || !yt)
       /* Too few coeffs for reconstruction, or no input/output */
       return 0;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif   
 
    /* NOTE: i, j are swapped relative to the forward transform, as axis
     * order is now reversed. */
    /* NOTE: yt is transposed on the fly such that the next function call
     * inverts over the alternate axis. This is done by simply swapping
     * the indices during assignment */    
    for (i = 0; i < len; i++) {
       for (j = 0; j <= len - 2; j += 2) {      
          h = j >> 1;
       
          for (k = 0; k < 3; k++) {
             yt [i] [j    ] [k] = h1 * (
                y [h       ] [i] [k] +  /* Approx */
                y [h + hLen] [i] [k]    /* Detail */
             );
             
             yt [i] [j + 1] [k] = h1 * (
                y [h       ] [i] [k] -  /* Approx */
                y [h + hLen] [i] [k]    /* Detail */
             );
          }
       }
 
       if (len & 1) {
          /* Odd length; pad additional value from last approximation term */
          for (k = 0; k < 3; k++)
             yt [i] [len - 1] [k] = y [hLen - 1] [i] [k];
       }   
    }
    
 #ifdef WAVELET_DBG
    printf("%s INV D1 (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
-   dumpCoeffs2(y, yt, len, len, 0);
+   dumpCoeffs2(y, yt, len, len, 0, NULL);
    putchar('\n');
 #endif
 
    axis ^= 1;
    return len;
 }
 
 
 
 static unsigned padD2Step2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned yLen, unsigned ytOffset
 )
 /* 
    Single iteration of forward 2D Daubechies D2 wavelet transform with
    boundary padding on array y containing (yLen x yLen) 3-tuples.  This
    variant extends the input array beyond its boundary as defined by
    WAVELET_EXTEND_MODE, which results in padding coefficients.  The
    transform is performed over y's 2nd axis (i.e.  horizontally, assuming
    row-major addressing as per C convention).  Note the triplets per array
    element are _not_ decorrelated, but transformed independently.
    
    The output wavelet coefficients are returned in the *TRANSPOSED* array
    yt.  The transpose arranges the generated coefficients vertically,
    and prepares the array for another transform over its 2nd axis in a
    subsequent call (this time as input y).  The result of a subsequent
    transform restores yt to y's original orientation, in which case both
    horizontal and vertical axes have been decorrelated.
    
    Both the input array y and output array yt must be large enough to also
    accommodate the additional padding coefficients introduced at the
    boundaries.  The number of output coefficient pairs ytLen returned in yt
    is obtained as return value, and can be interrogated beforehand without
    performing a transform by passing either y or yt as NULL.
    
    The approximation coefficients are returned in yt [0..ytLen-1] [i], while
    the details are returned in yt [ytOffset..ytOffset+ytLen-1] [i].  The
    offset ytOffset must be supplied by the caller to accommodate the detail
    coefficients (plus padding!) without clobbering the approximation
    coefficients (again, plus padding!).  Consequently, there is a gap
    between the approximation coefficients in the upper left of the matrix,
    and the detail coefficients in the lower right.  This gap shrinks and
    shifts toward the upper left with each iteration.
    
    The following schema indicates the matrix contents during the transform
    (see the unit test output when compiled with WAVELET_TEST_2DPAD):
 
       ----> j
      +----------+--------------+                 +-----------+-------------+
    | |   y_ij   |              |                 |  S(y_ji)  |             |
    | +----------+              |  (H_k xform)^T  +-----------+             |
    v |                         |   ---------->   |                         |
    i |                         |                 +-----------+             |
      |                         |                 |  D(y_ji)  |             |
      +-------------------------+                 +-------------------------+
 
 
       ----> i
      +----------+--------------+                 +----------+---+----------+
    | | S(y_ji)  |              |                 |S(S(y_ij))|   |S(D(y_ij))|
    | +----------+              |  (V_k xform)^T  +----------+   +----------+
    v |                         |   ---------->   |                         |
    j +----------+              |                 +----------+   +----------+
      | D(y_ji)  |              |                 |D(S(y_ij))|   |D(D(y_ij))|
      +-------------------------+                 +----------+---+----------+
    
                  ...                    ...                  ...
                  ...                    ...                  ...
                
                      +------+------+--------+--------+----------------+
                      |SS_n-1|SD_n-1|        |        |                |
                      |------+------+ SD_n-2 |        |                |
                      +DS_n-1|DD_n-1+--------+ . . .  |      SD_0      |
                      +------+----+-+--------+        |                |
                      |  DS_n-2   | | DD_n-2 |        +----------------+
     (V_n-1 xform)^T  +-----------+ +--------+ .           .     .     |
       ---------->    |    .      .               .        .     .     |
                      |    .      .                  .     .     .     |
                      +----------------+              +----------------+    
                      |                |              |                |
                      |     DS_0       | . . . . . .  |      DD_0      |
                      |                |              |                |
                      |                |              |                |
                      +----------------+--------------+----------------+
                                                  
     where i, j are the array indices of the outer resp.  inner loop, S(y_ij)
     D(y_ij) are approximation resp.  details coefficients of input y_ij
     (itself either the original signal or approximation/detail coefficients
     from previous iterations), and H_k, V_k are the horizontal resp.
     vertical transforms of the k-th (out of the maximum n) iteration.
 */
 { 
    static unsigned   axis = 0;
    unsigned          h, i, j, k, l, i0 = 0, iN, wIdx, dExt;
    /* Number of output coeff pairs ytLen, incl. boundary padding, and
       max dimension of entire array */
    const unsigned    ytLen = (yLen + 3) >> 1, ytTotalLen = ytOffset + ytLen;
    WAVELET_COEFF     yExt;
 #if (WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD1 || \
     WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD2 \
 )   
    /* Gradient boundary extension stuff */
    WAVELET_COEFF     dy1_0, dy1_1, dy2_0;
 #endif
 
    if (y && yt && yLen >= 2) {
 #ifdef WAVELET_DBG
       /* Clear output array yt to facilitate debugging */
       clearCoeffs2(yt, ytTotalLen);
 #else
       /* Zero output (sub)array yt; note this will not affect accumulated
        * coefficients from previous iterations, as they lie outside the
        * array bounds */
       zeroCoeffs2(yt, ytTotalLen);
 #endif
       /* FWD TRANSFORM GOTCHAS:
        *
        * Each full iteration of the forward transform consists of a
        * horizontal step followed by a vertical one.  yt is transposed on
        * the fly such that the next function call transforms over the
        * alternate axis. This is done by simply swapping the indices during
        * assignment.
        *
        * In the initial horizontal transform, the input array only contains
        * a set of approximation coefficients (or the original signal) at 
        * y [0..yLen-1] [0..yLen-1].       
        *
        * In the subsequent vertical transform, the input array then contains
        * two sets of coefficients from the preceding horizontal transform:
        * approximation coeffs y [0..ytLen-1] [0..ytLen-1] and detail coeffs
        * y [ytOffset..ytTotalLen-1] [ytOffset..ytTotalLen-1].  In this case,
        * the loop below is iterated twice with an offset yOffset to select
        * both sets of coefficients to transform.  Note the detail coeffs are
        * selected first to facilitate the looping logic; if i0==0 at the end
        * of the loop (i.e.  approx coeffs were selected), it terminates. 
        */
       do {
          i0 ^= ytOffset * axis;
          iN = i0 ? ytTotalLen : yLen;
          
          for (i = i0; i < iN; i++) {
             /* First iteration (j = -2); pad coefficients at left edge and
              * extend input beyond boundary according to extension mode */
             for (k = 0; k < 3; k++) {
 #if (WAVELET_EXTEND_MODE == WAVELET_EXTEND_CONST || \
    !defined(WAVELET_EXTEND_MODE) \
 )
                /* Constant boundary extension (=default) */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   (h2 [0] + h2 [1] + h2 [2]) * y [i] [0] [k] +
                   (h2 [3]                  ) * y [i] [1] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   (g2 [0] + g2 [1] + g2 [2]) * y [i] [0] [k] +
                   (g2 [3]                  ) * y [i] [1] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD1
                /* 1st order gradient boundary extension */
                dy1_0 = y [i] [0] [k] - y [i] [1] [k];
 
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   h2 [0]   * (y [i] [0] [k] + 2 * dy1_0) +
                   h2 [1]   * (y [i] [0] [k] +     dy1_0) +
                   h2 [2]   * (y [i] [0] [k]            ) +
                   h2 [3]   * (y [i] [1] [k]            );
 
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   g2 [0]   * (y [i] [0] [k] + 2 * dy1_0) +
                   g2 [1]   * (y [i] [0] [k] +     dy1_0) +
                   g2 [2]   * (y [i] [0] [k]            ) +
                   g2 [3]   * (y [i] [1] [k]            );
                   
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD2
                /* 2nd order gradient boundary extension */
                dy1_0 = y [i] [0] [k] - y [i] [1] [k];
                dy1_1 = y [i] [1] [k] - y [i] [2] [k];
                dy2_0 = dy1_0 - dy1_1;
                
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
 /*                h2 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + 1 * dy2_0)) +
                   h2 [1]   * (y [i] [0] [k] + 1 * (dy1_0 + 0 * dy2_0)) + 
                   h2 [2]   * (y [i] [0] [k] + 0 * (dy1_0            )) +
                   h2 [3]   * (y [i] [1] [k]                          ); */
                   h2 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + dy2_0)) +
                   h2 [1]   * (y [i] [0] [k] +      dy1_0         ) +
                   h2 [2]   * (y [i] [0] [k]                      ) +
                   h2 [3]   * (y [i] [1] [k]                      );
 
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
 /*                g2 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + 1 * dy2_0)) +
                   g2 [1]   * (y [i] [0] [k] + 1 * (dy1_0 + 0 * dy2_0)) + 
                   g2 [2]   * (y [i] [0] [k] + 0 * (dy1_0            )) + 
                   g2 [3]   * (y [i] [1] [k]                          ); */
                   g2 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + dy2_0)) +
                   g2 [1]   * (y [i] [0] [k] +      dy1_0         ) +
                   g2 [2]   * (y [i] [0] [k]                      ) +
                   g2 [3]   * (y [i] [1] [k]                      );
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_CYCL
                /* Cyclic (perdiodic / wraparound) boundary extension */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   h2 [0] * y [i] [yLen - 2] [k] + 
                   h2 [1] * y [i] [yLen - 1] [k] + 
                   h2 [2] * y [i] [0       ] [k] +
                   h2 [3] * y [i] [1       ] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   g2 [0] * y [i] [yLen - 2] [k] + 
                   g2 [1] * y [i] [yLen - 1] [k] + 
                   g2 [0] * y [i] [0       ] [k] +
                   g2 [3] * y [i] [1       ] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_SYMM
                /* Symmetric (reflected) boundary extension */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   h2 [0] * y [i] [1] [k] + 
                   h2 [1] * y [i] [0] [k] + 
                   h2 [2] * y [i] [0] [k] +
                   h2 [3] * y [i] [1] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   g2 [0] * y [i] [1] [k] + 
                   g2 [1] * y [i] [0] [k] + 
                   g2 [2] * y [i] [0] [k] +
                   g2 [3] * y [i] [1] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_ZERO
                /* Constant zero boundary extension */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] =
                   h2 [2] * y [i] [0] [k] + h2 [3] * y [i] [1] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] =
                   g2 [2] * y [i] [0] [k] + g2 [3] * y [i] [1] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_INV
                /* Inverted (negative reflected) boundary extension */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   -h2 [0] * y [i] [1] [k] +
                   -h2 [1] * y [i] [0] [k] + 
                   h2 [2] * y [i] [0] [k] +
                   h2 [3] * y [i] [1] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   -g2 [0] * y [i] [1] [k] + 
                   -g2 [1] * y [i] [0] [k] + 
                   g2 [2] * y [i] [0] [k] +
                   g2 [3] * y [i] [1] [k];
 #endif
             } 
          
             /* Intermediate and last iterations (j >= 0) */
             for (j = 0; j < yLen; j += 2) {
                h = (j >> 1) + 1;
             
                for (k = 0; k < 3; k++) {
 #ifdef WAVELET_DBG
                   /* Only necessary in debugging mode, since otherwise
                      already cleared by zeroCoeffs2() */
                   yt [h] [i] [k] = yt [h + ytOffset] [i] [k] = 0;
 #endif
                   /* Convolve up to boundary of wavelet or input signal,
                    * whichever comes first */
                   for (l = j; l < min(j + 4, yLen); l++) {
                      /* Smooth/approx/avg/lowpass */
                      yt [h           ] [i] [k] += h2 [l - j] * y [i] [l] [k];
                      
                      /* Detail/diff/highpass */
                      yt [h + ytOffset] [i] [k] += g2 [l - j] * y [i] [l] [k];
                   }
 
                   /* If necessary, convolve beyond input signal boundary
                    * with extension. This is skipped if the wavelet boundary
                    * was already reached */
                   for (; l < j + 4; l++) {
                      wIdx = l - j;
                      
 #if (WAVELET_EXTEND_MODE == WAVELET_EXTEND_CONST || \
    !defined(WAVELET_EXTEND_MODE) \
 )
                      /* Constant boundary extension (=default) */
                      yExt = y [i] [yLen - 1] [k];
                         
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD1
                      /* 1st order gradient boundary extension */
                      dy1_0 = y [i] [yLen - 1] [k] - y [i] [yLen - 2] [k];
                      dExt  = l - yLen + 1;
                      yExt  = y [i] [yLen - 1] [k] + dExt * dy1_0;
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD2
                      /* 2nd order gradient boundary extension */
                      dy1_0 = y [i] [yLen - 1] [k] - y [i] [yLen - 2] [k];
                      dy1_1 = y [i] [yLen - 2] [k] - y [i] [yLen - 3] [k];
                      dy2_0 = dy1_0 - dy1_1;
                      dExt  = l - yLen + 1;
                      yExt  = y [i] [yLen - 1] [k] + 
                         dExt * (dy1_0 + (dExt ? dExt - 1 : 0) * dy2_0);
                      
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_CYCL
                      /* Cyclic (perdiodic/wraparound) boundary extension */
                      yExt = y [i] [l - yLen] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_SYMM
                      /* Symmetric (reflected) boundary extension */
                      yExt = y [i] [((yLen - 1) << 1) - l + 1] [k];
                      /* = (yLen - 1) - (l - (yLen - 1)) + 1 */
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_ZERO
                      /* Constant zero boundary extension */
                      yExt = 0;
                      
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_INV
                      /* Inverted (negative reflected) boundary extension */
                      yExt = -y [i] [((yLen - 1) << 1) - l + 1] [k];
 #endif
 
                      /* Smooth/approx/avg/lowpass */
                      yt [h           ] [i] [k] += h2 [wIdx] * yExt;
                         
                      /* Detail/diff/highpass */
                      yt [h + ytOffset] [i] [k] += g2 [wIdx] * yExt;
                   }
                }
             }   
          }
       } while (i0);
       
 #ifdef WAVELET_DBG
       if (axis)
          printf("VERT FWD D2: (%dx%d) H-approx, (%dx%d) H-detail\n"
             "-> (%dx%d) V-approx of H-approx, (%dx%d) V-approx of H-detail,\n"
             "   (%dx%d) V-detail of H-approx, (%dx%d) V-detail of V-detail\n",
             ytLen, yLen, ytLen, yLen,
             ytLen, ytLen, ytLen, ytLen, ytLen, ytLen, ytLen, ytLen
          );
       else 
          printf(
             "HORIZ FWD D2: (%dx%d) -> (%dx%d) H-approx, (%dx%d) H-detail\n",
             yLen, yLen,
             ytLen, yLen, ytLen, yLen
          );
          
-      dumpCoeffs2(y, yt, ytTotalLen, ytTotalLen, 0);
+      dumpCoeffs2(y, yt, ytTotalLen, ytTotalLen, 0, NULL);
       putchar('\n');
 #endif
 
       axis ^= 1;
    }
    
    return ytLen;
 }
 
 
 
 static unsigned padD2InvStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned yLen, unsigned yOffset
 )
 /*
    Single iteration of inverse 2D Daubechies D2 wavelet transform with
    boundary padding on array y containing pairs of (yLen x yLen)
    approximation and detail coefficients in 3-tuples.  This reverses the
    forward transform above.  The transform is inverted over y's 2nd axis
    (i.e.  horizontally, assuming row-major addressing as per C convention).
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt. The transpose arranges the inverted coefficients vertically, and
    prepares the array for another inverse transform over its 2nd axis in a
    subsequent call (this time as input y).  The result of a subsequent
    inverse transform restores yt to y's original orientation, in which case
    both horizontal and vertical axes have been inversely transformed.
    
    The number of output coefficients is smaller than the input as padding
    coefficients are collapsed at the boundaries during the inverse
    transform.  The number of output coefficients per dimension ytLen is
    obtained as returned value, and can be interrogated beforehand without
    performing a transform by passing either y or yt as NULL.
    
    The inversion expects approximation coeffs y [0..yLen-1][0..yLen-1]
    and y [0..yLen-1] [yOffset..yOffset+yLen-1], and detail coefficients
    y [yOffset..yOffset+yLen-1] [0..yLen-1] and 
    y [yOffset..yOffset+yLen-1] [yOffset..yOffset+yLen-1].  
    
    The offset yOffset must be supplied by the caller to account for the fact
    that the number of output coefficients ytLen is smaller than yLen due to
    the collapsed padding coefficients.  Consequently there is a gap between
    the inverted coefficients located in the corners of the matrix.  This gap
    increases with each iteration, until all coefficients have been consumed
    by the inversion.
 */
 {
    static unsigned   axis = 1;
    unsigned          h, i, j, k, l, i0 = 0, iN;
    /* Number of output coeff (pairs) ytLen, max dimension of entire array */
    unsigned          ytLen = (yLen - 1) << 1;
    const unsigned    yTotalLen = yOffset + yLen;
 
    if (y && yt && ytLen >= 2) {
 #ifdef WAVELET_DBG
       /* Clear output array yt to facilitate debugging */
       clearCoeffs2(yt, yTotalLen);
 #else
       /* Zero output (sub)array yt; note this will not affect accumulated
        * coefficients from previous iterations, as they lie outside the
        * array bounds */
       zeroCoeffs2(yt, yTotalLen);
 #endif
       /* INV TRANSFORM GOTCHAS:
        *
        * i, j are swapped relative to the forward transform, as axis order
        * is now reversed (vertical before horizontal xform).  yt is
        * transposed on the fly such that the next function call transforms
        * over the alternate axis.  This is done by simply swapping the
        * indices during assignment.
        *
        * In the inverse vertical transform, the input array contains two
        * sets of coefficients:
        * (a) approx of approx coeffs y [0..yLen-1] [0..yLen-1] in the
        *     upper left, and details of approximation coefficients 
        *     y [yOffset..yTotalLen-1] [0..yLen-1] in the lower left.
        * (b) approx of detail coeffs y [0..yLen-1] [yOffset..yTotalLen-1]
        *     in the upper right, and details of detail coefficients 
        *     y [yOffset..yTotalLen-1] [yOffset..yTotalLen-1] in lower right.
        *
        * The loop below is iterated twice with an offset to select both sets
        * of coefficients to inverse the transform.  Note set (b) is selected
        * first to facilitate the looping logic; if i0==0 at the end of the
        * loop (i.e.  set (a) was selected), it terminates.
        */
       do {
          i0 ^= yOffset * (axis);
          iN = i0 ? yTotalLen : ytLen;
          
          for (i = i0; i < iN; i++) {
             for (j = 0; j < ytLen; j += 2) {
                h = j >> 1;
                
                for (k = 0; k < 3; k++) {
                   yt [i] [j    ] [k] = 
-                     h2 [2] * y [h              ] [i] [k] +   /* Approx */
-                     g2 [2] * y [h + yOffset    ] [i] [k] +   /* Detail */
-                     h2 [0] * y [h + 1          ] [i] [k] +   /* Approx */
-                     g2 [0] * y [h + 1 + yOffset] [i] [k];    /* Detail */
+                     h2i [1] * y [h              ] [i] [k] +   /* Approx */
+                     g2i [1] * y [h + yOffset    ] [i] [k] +   /* Detail */
+                     h2i [3] * y [h + 1          ] [i] [k] +   /* Approx */
+                     g2i [3] * y [h + 1 + yOffset] [i] [k];    /* Detail */
                      
                   yt [i] [j + 1] [k] =
-                     h2 [3] * y [h              ] [i] [k] +   /* Approx */
-                     g2 [3] * y [h + yOffset    ] [i] [k] +   /* Detail */
-                     h2 [1] * y [h + 1          ] [i] [k] +   /* Approx */
-                     g2 [1] * y [h + 1 + yOffset] [i] [k];    /* Detail */
+                     h2i [0] * y [h              ] [i] [k] +   /* Approx */
+                     g2i [0] * y [h + yOffset    ] [i] [k] +   /* Detail */
+                     h2i [2] * y [h + 1          ] [i] [k] +   /* Approx */
+                     g2i [2] * y [h + 1 + yOffset] [i] [k];    /* Detail */
                }
             }
          }  
       } while (i0);
 
 #ifdef WAVELET_DBG
       if (axis)
          printf("VERT INV D2: "
             "(%dx%d) V-approx of H-approx, (%dx%d) V-approx of H-detail,\n"
             "(%dx%d) V-detail of H-approx, (%dx%d) V-detail of V-detail\n"
             "-> (%dx%d) H-approx, (%dx%d) H-detail\n",
             yLen, yLen, yLen, yLen, yLen, yLen, yLen, yLen, 
             yLen, ytLen, yLen, ytLen
          );
       else 
          printf(
             "HORIZ INV D2: (%dx%d) H-approx, (%dx%d) H-detail -> (%dx%d)\n",
             yLen, ytLen, yLen, ytLen,
             ytLen, ytLen
          );
 
-      dumpCoeffs2(y, yt, yTotalLen, yTotalLen, 0);
+      dumpCoeffs2(y, yt, yTotalLen, yTotalLen, 0, NULL);
       putchar('\n');
 #endif
 
       axis ^= 1;
    }
 
    return ytLen;
 }
 
 
 
 unsigned padWaveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned yLen0, unsigned *numNonZero
 )
 /* Perform full 2D multiresolution forward wavelet transform with padded
    boundary coefficients on array y containing (yLen0 x yLen0) samples of
    3-tuples.  Note no intra-tuple transform occurs.  An additional array yt
    of identical dimension to y is required as buffer for intermediate
    results.
    
    The wavelet coefficients are returned in array y, containing the coarsest
    approximation coefficients in y [0][0] followed by horizontal/vertical
    detail coefficients in order of increasing resolution/frequency.
    
    NOTE: Both y and yt must be large enough to accommodate the additional
    padding coefficients introduced at the array boundaries during the
    transform.  It is the caller's responsibility to preallocate and
    dimension these arrays accordingly with allocWaveletMatrix2().  The
    dimension of the output array (which generally exceeds the input
    dimension yLen0) must be interrogated beforehand by calling the function
    with y or yt set to NULL.
    
    The returned value is the output dimension, or 0 if the transform failed. 
    In addition, the number of nonzero coefficients is returned in
    numNonZero (if not NULL), which is a subset of the number of output
    coefficients, i.e. (output dimension)^2.
 */
 {
    /* Output size (number of coeffs, including boundary padding coeffs),
       output array offset and pointers */
    unsigned       yLen, ytLen, ytOffset, totalLen = 0, totalNZ = 0;
 
    /* Transform if enough samples, else just return output size.
       NOTE: the D2 transform will be skipped for input sizes yLen0 < 4. */
    if (yLen0 >= 2) {
       /* Figure out total number of coefficients (= maximum size of output
          array including boundary padding) by summing the number of detail
          coeffs returned per dimension by each xform iteration.  Also sum
          number of nonzero detail coefficients.  */
       for (yLen = yLen0, totalLen = 0;
-         yLen > 3; 
+         yLen > WAVELET_PADD2_MINLEN; 
          totalLen += yLen = padD2Step2(NULL, NULL, yLen, 0),
          totalNZ += yLen * yLen
       );
       /* Finally add number of approx coeffs at coarsest resolution */
       totalLen += yLen;
       totalNZ = 3 * totalNZ + yLen * yLen;
 
       if (y && yt) {
          /* Apply Daubechies D2 transform, accounting for boundary padding */
          for (yLen = yLen0, ytOffset = totalLen; 
-            yLen > 3;
+            yLen > WAVELET_PADD2_MINLEN;
             yLen = ytLen
          ) {
             /* Get num of coeff pairs returned by the next xform iteration,
                and calculate the offset of detail coeffs in output array */
             ytOffset -= ytLen = padD2Step2(NULL, NULL, yLen, 0);
             /* Apply horizontal & vertical Daubechies D2 transform, swapping
                input and transposed output array.  The resulting detail
                coefficients are prepended to those from the previous
                iteration from position ytOffset; consequently, the returned
                number of coefficient pairs is the input size for next
                iteration */
             if (!padD2Step2(y, yt, yLen, ytOffset) || 
                !padD2Step2(yt, y, yLen, ytOffset)
             )
                return 0;
          }
 #ifdef WAVELET_FINAL_HAAR
          /* Apply horizontal & vertical Daubechies D1 (Haar) transform in
             penultimate and final iterations (since yLen is odd) to
             decompose to a single approximation coefficient y [0] [0] at the
             coarsest resolution; all other coeffs are details.  Obviously no
             boundary padding is necessary here.  */
          for (; yLen >= 2; yLen--) {
             if (!padD1Step2(y, yt, yLen) || !padD1Step2(yt, y, yLen))
                return 0;
          }
 #endif
       }
    }
 
    /* Optionally return number of nonzero coeffs */
    if (numNonZero)
       *numNonZero = totalNZ;
       
    /* All coeffs now in y; return output size */
    return totalLen;
 }
 
 
 
 unsigned padWaveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt,
    unsigned yLen0, unsigned yLenOrg
 )
 /* Perform full 2D multiresolution inverse wavelet transform on array y
    containing (yLen0 x yLen0) boundary padded wavelet coefficients as
    3-tuples.  The original number of samples (yLenOrg x yLenOrg) is required
    for the reconstruction.  An additional array yt of identical dimension to
    y is required as buffer for intermediate results.
  
    The reconstructed samples are returned in y.  This is smaller than (yLen0
    x yLen0) since the boundary coefficients from the forward transform are
    collapsed.
  
    The returned value is the reconstructed number of samples per dimension
    (yLenOrg), or 0 if the inverse transform failed.
  */
 {
    unsigned    yLen = 0, nextyLen, ytLen, ytOffset, i, j, d2Iter;
 
    if (yLenOrg < 2)
       /* Too few coefficients for reconstruction */
       return 0;
       
    /* Figure out num Daubechies iterations based on original num coeffs */
    for (d2Iter = 0, yLen = yLenOrg; 
-      yLen > 3; 
+      yLen > WAVELET_PADD2_MINLEN; 
       d2Iter++, yLen = padD2Step2(NULL, NULL, yLen, 0)
    );
 #ifdef WAVELET_FINAL_HAAR
    /* Invert horizontal & vertical Daubechies D1 (Haar) transform at the
       two coarsest levels */ 
    for (yLen = 2; yLen <= min(3, yLen0); yLen++)
       if (!padD1InvStep2(y, yt, yLen) || !padD1InvStep2(yt, y, yLen))
          return 0;
 #endif
    /* Invert horiz & vert Daubechies D2 transform on subsequent levels */
    for (i = d2Iter; i; i--) {
       /* Get num detail coeffs and their offset for current iteration */
       for (j = 0, ytOffset = yLen0, yLen = nextyLen = yLenOrg; j < i; j++) {
          nextyLen = yLen;
          ytOffset -= yLen = padD2Step2(NULL, NULL, yLen, 0);
       }
 
       if (!(ytLen = padD2InvStep2(y, yt, yLen, ytOffset)) ||
          !(ytLen = padD2InvStep2(yt, y, yLen, ytOffset))
       )
          return 0;
    
       /* XXX: Drop excess reconstructed approximation coeffs to match number
          of detail coeffs in next iteration. */
       yLen = min(ytLen, nextyLen);
    }
    
    return yLenOrg;
 }
 
 
 
 #ifdef WAVELET_TEST_2DPAD
-   #include <stdio.h>
-   #include "pmapcontrib.h"
+   #ifdef WAVELET_TEST_mRGBE
+      #include "mrgbe.h"
+      #define  TINY  1e-6
+      #define  HUGE  1e10
+   #endif
+
+
+   #ifndef WAVELET_TEST_INIT
+      /* Default test data initialisation */
+      #define WAVELET_TEST_INIT 0
+   #endif
+   
+   #if (WAVELET_TEST_INIT < 0 || WAVELET_TEST_INIT > 7)
+      #error Invalid wavelet unit test data initialiation option
+   #endif
+
+   
+   /* Define to save inv. xformed data to file */
+   #define WAVELET_TEST_OUT               "wavelet3-test-xform.dat"
+   #define WAVELET_TEST_OUT_mRGBE         "wavelet3-test-xform-mrgbe.dat"
+   
    
-   #define WAVELET_TEST_INIT 6
+   #ifdef WAVELET_TEST_LOG
+      #define WAVELET_TEST_VAL(c)         ((c) > TINY ? log(c) : 0)
+      #define WAVELET_TEST_IVAL(c)        (fabs(c) > TINY ? exp(c) : 0)
+   #else
+      #define WAVELET_TEST_VAL(c)         (c)
+      #define WAVELET_TEST_IVAL(c)        (c)
+   #endif    
 
    
       
    static void avgDetailCoeffs (const WaveletMatrix2 y, unsigned yLen, 
       WaveletCoeff3 avg
    )
    {
       unsigned i, j, k, numCoeffs = 0;
       
       avg [0] = avg [1] = avg [2] = 0;
       for (i = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++)
             /* Skip zero and cleared coeffs */
             if ((i > 2 || j > 2) && !coeffIsZero(y [i][j]) && 
                !coeffIsEmpty(y [i][j])
             ) {
                for (k = 0; k < 3; k++)
                   avg [k] += y [i][j][k];
                numCoeffs++;
             }
       
       for (k = 0; k < 3; k++)
          avg [k] /= numCoeffs;
    }
          
    
    
    int main (int argc, char *argv [])
    {
       int            i, j, k;
       unsigned       y0Len, yLen, numThresh = 0, numNonzero = 0;
-      WaveletMatrix2  y0 = NULL, y = NULL, yt = NULL;
+      WaveletMatrix2 y0 = NULL, y = NULL, yt = NULL;
       FILE           *dataFile = NULL;
       WAVELET_COEFF  inData, thresh = 0;
       WaveletCoeff3  avgDetail;
    #ifdef WAVELET_TEST_mRGBE
-      #define        HUGE 1e10
       mRGBE          mrgbeCoeff;
       mRGBERange     mrgbeRange;
       WaveletMatrix2 ymrgbe = NULL;
    #endif
       
       if (argc < 2) {
          fprintf(stderr, "%s <dim> [threshold] [dataFile]\n", argv [0]);
-         fputs("Missing array dimension dim >= 2, "
-            "compression threshold >= 0\n", stderr
-         );
+         fputs("Missing array dimension dim >= 2\n", stderr);
          return -1;
       }
       if (!(y0Len = atoi(argv [1])) || y0Len < 2) {
          fprintf(stderr, "Invalid array dimension %d\n", y0Len);
          return -1;
       }
       if (argc > 2 && (thresh = atof(argv [2])) < 0) {
          fprintf(stderr, "Invalid threshold %.3f\n", thresh);
          return -1;
       }
       
       /* Get size of (padded) output array */
       if (!(yLen = padWaveletXform2(NULL, NULL, y0Len, &numNonzero))) {
          fputs("Can't determine number of coefficients "
             "(input too short?)\n", stderr
          );
          return -1;
       }
       
       /* Allocate & clear arrays for original and reconstruction */
       if (!(y0 = allocWaveletMatrix2(y0Len)) || 
          !(y = allocWaveletMatrix2(yLen)) ||
          !(yt = allocWaveletMatrix2(yLen))
    #ifdef WAVELET_TEST_mRGBE
          || !(ymrgbe = allocWaveletMatrix2(yLen))
    #endif
       ) {
          fprintf(stderr, "Failed allocating %dx%d array\n", yLen, yLen);
          return -1;
       }
       clearCoeffs2(y0, y0Len);
       clearCoeffs2(y, yLen);
       clearCoeffs2(yt, yLen);
       
       if (argc > 3) {
          /* Load data from file; length must not exceed allocated */
          if (!(dataFile = fopen(argv [3], "r"))) {
             fprintf(stderr, "Failed opening data file %s\n", argv [3]);
             return -1;
          }
       
          for (i = 0; i < y0Len; i++) {
             for (j = 0; j < y0Len; j++) {
                if (feof(dataFile)) {
                   fprintf(stderr, 
                      "Premature end of file reading data from %s\n",
-                     argv [2]
+                     argv [3]
                   );
                   fclose(dataFile);
                   return -1;
                }
             
                /* Read next float, skipping any leading whitespace */
                if (fscanf(dataFile, " %lf", &inData)) {
                   y0 [i][j][0] = y0 [i][j][1] = y0 [i][j][2] = 
                      y [i][j][0] = y [i][j][1] = y [i][j][2] = inData;
                }
                else {
                   fprintf(stderr, 
                      "Error reading from data file %s\n", argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
             }
          }
          
          fclose(dataFile);
       }
       else {
          /* Init input */
          srand48(111);
+
          for (i = 0; i < y0Len; i++) {
             for (j = 0; j < y0Len; j++) {
                for (k = 0; k < 3; k++) {
                   y0 [i] [j] [k] = y [i] [j] [k] =                  
    #if WAVELET_TEST_INIT == 0
                      /* Random data, channel-independent */
                      drand48();
    #elif WAVELET_TEST_INIT == 1
-                     /* Random data, indentical for all channels */
-                     k ? y [i] [j] [k - 1] : drand48();
+                     /* Random data, but correlated channels */
+                     k  ? (0.1 + 0.9 * drand48()) * y [i] [j] [k - 1] 
+                        : drand48();
    #elif WAVELET_TEST_INIT == 2
+                     /* Random data, but identical channels */
+                     k ? y [i] [j] [k - 1] : drand48();
+   #elif WAVELET_TEST_INIT == 3
                      /* Monotonically increasing along axis 0 (avoiding
                       * zero in case of log encoding) */
                      i + 1;
-   #elif WAVELET_TEST_INIT == 3
+   #elif WAVELET_TEST_INIT == 4
                      /* Monotonically increasing along axis 1 (avoiding zero
                       * in case of log encoding) */
                      j + 1;
-   #elif WAVELET_TEST_INIT == 4
+   #elif WAVELET_TEST_INIT == 5
                      /* Monotonically increasing along both axes (avoiding
                       * zero in case of log encoding) */
                      i * j + 1;
-   #elif WAVELET_TEST_INIT == 5
+   #elif WAVELET_TEST_INIT == 6
                      /* Monotonically increasing by linear index (avoiding
                       * zero in case of log encoding) */
                      i * y0Len + j + 1;
-   #elif WAVELET_TEST_INIT == 6
+   #elif WAVELET_TEST_INIT == 7
                      /* Symmetric "bump" function (avoiding zero in case of
                       * log encoding) */
                      (1.1 - fabs(j - y0Len/2. + 0.5) / (y0Len/2. - 0.5)) *
                      (1.1 - fabs(i - y0Len/2. + 0.5) / (y0Len/2. - 0.5));
-   #elif WAVELET_TEST_INIT == 7
-                     0;
-                  {
-                     /* Reference contribs from rcontrib classic */
-                     #include "rc-ref.c"
-                     
-                     if (y0Len * y0Len != PMAP_CONTRIB_REFSIZE) {
-                        fprintf(stderr, 
-                           "reference contribs require dim %d\n",
-                           (int)sqrt(PMAP_CONTRIB_REFSIZE)
-                        );
-                        return -1;
-                     }
-                     
-                     y0 [i] [j] [k] = y [i] [j] [k] =
-                        refContrib [PMAP_CONTRIB_XY2LIN(y0Len, i, j)] [k];
-                  }
    #endif
                }
             }
          }
       }
 
    #ifdef WAVELET_DBG
       puts("----------------------- FWD XFORM -------------------------\n");
    #endif
       
-   #ifdef PMAP_CONTRIB_LOG
-      /* Optionally apply logarithm before xform */
+   #ifdef WAVELET_TEST_LOG
+      /* Apply logarithm to input before xform */
       for (i = 0; i < y0Len; i++)
          for (j = 0; j < y0Len; j++)
             for (k = 0; k < 3; k++)
-               y [i] [j] [k] = PMAP_CONTRIB_VAL(y [i] [j] [k]);
+               y [i] [j] [k] = WAVELET_TEST_VAL(y [i] [j] [k]);
    #endif
       
       /* Forward xform */
       if (!padWaveletXform2(y, yt, y0Len, NULL)) {
          fputs("Forward xform failed\n", stderr);
          return -1;
       }
 
       /* Threshold coefficients; we use hard thresholding as it's easier to
        * implement than soft thresholding, which requires sorting the
        * coefficients.         
        * NOTE: the coarsest approximation coefficients in the upper left of
        * y are omitted as they are essential for reconstruction!  */
       for (i = numThresh = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++) {
             if (coeffThresh(y, i, j, thresh)) {
                y [i][j][0] = y [i][j][1] = y [i][j][2] = 0;
                numThresh++;
    #if 0
                /* Replace thresholded values with random noise in range
                   [-threshold, threshold] */
                y [i][j][0] = y [i][j][1] = y [i][j][2] = thresh * (
                   2 * drand48() - 1
                );
    #endif
             }
          }
 
    #ifdef WAVELET_DBG
       /* Dump coefficients */
       puts("----------------------- COEFFICIENTS -------------------------\n");
-      dumpCoeffs2(y, NULL, yLen, 0, thresh);
+      dumpCoeffs2(y, NULL, yLen, 0, thresh, NULL);
       printf("\n%d/%d nonzero coefficients", numNonzero, yLen * yLen);
       
       avgDetailCoeffs(y, yLen, avgDetail);
       printf("\nAvg = [%.3g, %.3g, %.3g]", 
          avgDetail [0], avgDetail [1], avgDetail [2]
       );
       
       if (numThresh)
          printf("\n%d/%d coefficients thresholded = %.1f%% compression", 
             numThresh, yLen * yLen, 100. * numThresh / (yLen * yLen)
          );      
    #endif
    
    #ifdef WAVELET_TEST_mRGBE
       /* Test mRGBE coefficient encoding */
       mrgbeRange.min [0] = mrgbeRange.min [1] = mrgbeRange.min [2] = HUGE;
       mrgbeRange.max [0] = mrgbeRange.max [1] = mrgbeRange.max [2] = 0;
       clearCoeffs2(ymrgbe, yLen);   
       
       /* Find min/max coeff and init mRGBE range, omitting upper left 3x3
        * approximation coefficients, since these aren't encoded */
       for (i = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++)
             if ((i >= WAVELET_PADD2_APPROXDIM || 
                   j >= WAVELET_PADD2_APPROXDIM
                ) && !coeffIsEmpty(y [i][j]) && !coeffIsZero(y [i][j])
             )
                for (k = 0; k < 3; k++) {
                   /* Skip empty (NaN) coeffs */
                   inData = fabs(y [i][j][k]);
                   if (inData < mrgbeRange.min [k])
                      mrgbeRange.min [k] = inData;
                   if (inData > mrgbeRange.max [k])
                      mrgbeRange.max [k] = inData;
                }
 
       mRGBEinit(&mrgbeRange, mrgbeRange.min, mrgbeRange.max);      
       
       /* Encode coeffs to mRGBE and back, but preserve upper 3x3
        * approximation coefficients */
       for (i = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++)
             if ((i >= WAVELET_PADD2_APPROXDIM || 
                   j >= WAVELET_PADD2_APPROXDIM
                ) && !coeffIsEmpty(y [i][j]) && !coeffIsZero(y [i][j])
             ) {
                mrgbeCoeff = mRGBEencode(y [i][j], &mrgbeRange, 0);
                mRGBEdecode(mrgbeCoeff, &mrgbeRange, ymrgbe [i][j]);
             }
             else for (k = 0; k < 3; k++)
                ymrgbe [i][j][k] = y [i][j][k];
          
       #ifdef WAVELET_DBG
          /* Dump mRGBE-decoded coefficients */
          puts("\n\n-------------------- mRGBE COEFFICIENTS ----------------------\n");           
-         dumpCoeffs2(ymrgbe, NULL, yLen, yLen, 0);
+         dumpCoeffs2(ymrgbe, NULL, yLen, yLen, 0, NULL);
          
          avgDetailCoeffs(y, yLen, avgDetail);
          printf("\nAvg = [%.3g, %.3g, %.3g]", 
             avgDetail [0], avgDetail [1], avgDetail [2]
          );
       #endif
    #endif
 
    #ifdef WAVELET_DBG
       puts("\n----------------------- INV XFORM -------------------------\n");
    #endif
    
       /* Inverse xform */
       if (!padWaveletInvXform2(y, yt, yLen, y0Len)) {
          fputs("\nInverse xform failed\n", stderr);
          return -1;
       }
 
    #ifdef WAVELET_TEST_mRGBE
       #ifdef WAVELET_DBG
          puts("\n------------------- INV XFORM (mRGBE) ---------------------\n");
       #endif
       
       /* Inverse xform using mRGBE coeffs if enabled */
       if (!padWaveletInvXform2(ymrgbe, yt, yLen, y0Len)) {
          fputs("\nInverse xform failed\n", stderr);
          return -1;
       }
    #endif
    
-   #ifdef PMAP_CONTRIB_LOG
-      /* Optionally invert logarithm after xform */
+   #ifdef WAVELET_TEST_LOG
+      /* Invert logarithm after xform */
       for (i = 0; i < y0Len; i++)
          for (j = 0; j < y0Len; j++)
             for (k = 0; k < 3; k++) {
-               y [i] [j] [k] = PMAP_CONTRIB_IVAL(y [i] [j] [k]);
+               y [i] [j] [k] = WAVELET_TEST_IVAL(y [i] [j] [k]);
       #ifdef WAVELET_TEST_mRGBE
-               ymrgbe [i] [j] [k] = PMAP_CONTRIB_IVAL(ymrgbe [i] [j] [k]);
+               ymrgbe [i] [j] [k] = WAVELET_TEST_IVAL(ymrgbe [i] [j] [k]);
       #endif
             }
    #endif
    
    #ifdef WAVELET_DBG
       puts("\n--------------------- ORIG vs. INV XFORM ------------------------\n");
-      dumpCoeffs2(y0, y, y0Len, y0Len, 0);
+      dumpCoeffs2(y0, y, y0Len, y0Len, 0, NULL);
+
+      #ifdef WAVELET_TEST_OUT
+         if (!(dataFile = fopen(WAVELET_TEST_OUT, "w"))) {
+            fprintf(stderr, "Cannot open output file %s\n",
+               WAVELET_TEST_OUT
+            );
+            return -1;
+         }
+         dumpCoeffs2(y, NULL, y0Len, 0, 0, dataFile);
+         fclose(dataFile);
+         fprintf(stderr, "\nDumped to %s\n", WAVELET_TEST_OUT);
+      #endif
    #endif
       printf("\nAvg RMSE = %.2f\n", rmseCoeffs2(y0, y, y0Len));
       
    #ifdef WAVELET_TEST_mRGBE
       #ifdef WAVELET_DBG
          puts("\n------------------ ORIG vs. INV XFORM + mRGBE -------------------\n");
-         dumpCoeffs2(y0, ymrgbe, y0Len, y0Len, 0);
+         dumpCoeffs2(y0, ymrgbe, y0Len, y0Len, 0, NULL);
+
+         #ifdef WAVELET_TEST_OUT_mRGBE
+            if (!(dataFile = fopen(WAVELET_TEST_OUT_mRGBE, "w"))) {
+               fprintf(stderr, "Cannot open output file %s\n",
+                  WAVELET_TEST_OUT_mRGBE
+               );
+               return -1;
+            }
+            dumpCoeffs2(ymrgbe, NULL, y0Len, 0, 0, dataFile);
+            fclose(dataFile);
+            fprintf(stderr, "\nDumped to %s\n", WAVELET_TEST_OUT_mRGBE);
+         #endif         
       #endif
+            
       printf("\nAvg RMSE with mRGBE enc = %.2f\n", 
          rmseCoeffs2(y0, ymrgbe, y0Len)
       );
    #endif
       
       freeWaveletMatrix2(y0, y0Len);
       freeWaveletMatrix2(y, yLen);
       freeWaveletMatrix2(yt, yLen);
    #ifdef WAVELET_TEST_mRGBE
       freeWaveletMatrix2(ymrgbe, yLen);
    #endif
       return 0;
    }
 #endif /* WAVELET_TEST_2DPAD */