diff --git a/pmapcontrib.c b/pmapcontrib.c
index 32df644..a8609c2 100644
--- a/pmapcontrib.c
+++ b/pmapcontrib.c
@@ -1,1165 +1,1219 @@
 #ifndef lint
 static const char RCSid[] = "$Id: pmapcontrib.c,v 2.20 2021/02/16 20:06:06 greg Exp $";
 #endif
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions. 
    These routines handle contribution binning, compression and encoding,
    and are used by mkpmap.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id: pmapcontrib.c,v 2.20 2021/02/16 20:06:06 greg Exp $
 */
 
 
 
 #include "pmapcontrib.h"
 #include "pmapdiag.h"
 #include "pmaprand.h"
 #include "pmapmat.h"
 #include "pmapsrc.h"
 #include "pmutil.h"
 #include "otspecial.h"
 #include "otypes.h"
 #include "lookup.h"
 
 
 
 #ifdef PMAP_CONTRIB
 /* The following are convenient placeholders interfacing to mkpmap 
    and rcontrib. They can be externally set via initPmapContribTab()
    and referenced within the contrib pmap modules. These variables 
    can then be safely ignored by rtrace/rpict/rvu, without annoying
    linking errors. */
 /* Global pointer to rcontrib's contribution binning LUT */
 LUTAB *pmapContribTab = NULL;
 /* Contribution/coefficient mode flag */
 int *pmapContribMode;
 
 
 
 extern void SDdisk2square(double sq[2], double diskx, double disky);
 
 
 
 #if 0
    int xy2lin (unsigned scDim, int x, int y)
    /* Serialise 2D contribution coords in (scDim x scDim) Shirley-Chiu square
       to 1D index.  Returns -1 if coordinates invalid */
    {
       return x < 0 || y < 0 
          ? -1
          : (x * scDim) + y;
    }
 
 
 
    void lin2xy (unsigned scDim, int bin, int *x, int *y)
    /* Deserialise 1D contribution index to 2D coordinates in (scDim x scDim)
     * Shirley-Chiu square.  Returns -1 if bin invalid */
    {
       *x = bin < 0 ? -1 : bin / scDim;
       *y = bin < 0 ? -1 : bin % scDim;
    }
 #endif
 
 
 
 static int ray2bin (const RAY *ray, unsigned scDim)
 /* Map ray dir (pointing away from origin) to its 1D bin in an
    (scDim x scDim) Shirley-Chiu square.
    Returns -1 if mapped bin is invalid (e.g. behind plane) */
 {
    static int        scRHS, varInit = 0;
    static FVECT      scNorm, scUp;
    unsigned          scBin [2];
    FVECT             diskPlane;
    RREAL             dz, diskx, disky, rad, diskd2, scCoord [2];
 
    if (!varInit) {
       /* Lazy init shirley-Chiu mapping orientation from function variables */
       scRHS       = varvalue(PMAP_CONTRIB_SCRHS);
       scNorm [0]  = varvalue(PMAP_CONTRIB_SCNORMX);
       scNorm [1]  = varvalue(PMAP_CONTRIB_SCNORMY);
       scNorm [2]  = varvalue(PMAP_CONTRIB_SCNORMZ);
       scUp [0]    = varvalue(PMAP_CONTRIB_SCUPX);
       scUp [1]    = varvalue(PMAP_CONTRIB_SCUPY);
       scUp [2]    = varvalue(PMAP_CONTRIB_SCUPZ);
       varInit ^= 1;
    }
 
    /* Map incident direction to disk */
    dz = DOT(ray -> rdir, scNorm);                         /* normal proj */
    if (dz > 0) {
       fcross(diskPlane, scUp, scNorm);    /* y-axis in plane, perp to up */
       diskPlane [0] *= scRHS; 
       diskx = DOT(ray -> rdir, diskPlane);
       disky = DOT(ray -> rdir, scUp) - dz * DOT(ray -> rdir, scUp);
       diskd2 = diskx * diskx + disky * disky;       /* in-plane length^2 */
       rad = dz>FTINY ? sqrt((1 - dz*dz) / diskd2) : 0;  /* radial factor */
       disky *= rad;                 /* diskx, disky now in range [-1, 1] */
 
       /* Apply Shirley-Chiu mapping of (diskx, disky) to square */
       SDdisk2square(scCoord, diskx, disky);
       /* Map Shirley-Chiu square coords to 1D bin */
       scBin [0] = scCoord [0] * scDim;
       scBin [1] = scCoord [1] * scDim;
       return PMAP_CONTRIB_XY2LIN(scDim, scBin [0], scBin [1]);
    }
    else return -1;
 }
 
 
 /* ------------------ CONTRIBSRC  STUFF --------------------- */
 
 
 #ifndef PMAP_CONTRIB_TEST
    MODCONT *addContribModifier (LUTAB *contribTab, unsigned *numContribs, 
       char *mod, char *binParm, int binCnt
    )
    /* Add light source modifier mod to contribution lookup table contribTab,
       and update numContribs.  Return initialised contribution data for this
       modifier. This code adapted from rcontrib.c:addmodifier(). */
    {
       LUENT    *lutEntry = lu_find(contribTab, mod);
       MODCONT  *contrib;
       EPNODE   *eBinVal;
       unsigned numCoeffs;
 
       /* Check for overflow in mRGBE encoding BEFORE photon distrib!
          This requires getting the number of wavelet coefficients generated
          by the transform a priori. */
       if (!(numCoeffs = padWaveletXform2(NULL, NULL, sqrt(binCnt), NULL))) {
          sprintf(errmsg, "can't determine number of wavelet coefficients "
             "for modifier %s", mod
          );
          error(INTERNAL, errmsg);
       }
          
       if (numCoeffs * numCoeffs > PMAP_CONTRIB_MAXCOEFFS) {
          sprintf(errmsg, "too many coefficients (max %d) for modifier %s; "
             "reduce -bn", PMAP_CONTRIB_MAXCOEFFS, mod
          );
          error(USER, errmsg);
       }
        
       if (lutEntry -> data) {
          /* Reject duplicate modifiers */
          sprintf(errmsg, "duplicate light source modifier %s", mod);
          error(USER, errmsg);
       }
       
       if (*numContribs >= MAXMODLIST) {
          sprintf(errmsg, "too many modifiers (max. %d)", MAXMODLIST);
          error(INTERNAL, errmsg);
       }
       
       lutEntry -> key = mod;
       
       if (binCnt <= 0) {
          sprintf(errmsg, "undefined/invalid bin count for modifier %s", mod);
          error(USER, errmsg);
       }
       
       /* Allocate and init contributions */
       contrib = (MODCONT *)malloc(
          sizeof(MODCONT) + sizeof(DCOLOR) * (binCnt - 1)
       );
       if (!contrib)
          error(SYSTEM, "out of memory in addContribModifier()");
       contrib -> outspec   = NULL;
       contrib -> modname   = mod;
       contrib -> params    = binParm;
       contrib -> nbins     = binCnt;
       contrib -> binv      = eBinVal;
       contrib -> bin0      = 0;
       memset(contrib -> cbin, 0, sizeof(DCOLOR) * binCnt);
       lutEntry -> data = lutEntry -> data = (char *)contrib;
       ++(*numContribs);
 
       return(contrib);
    }
 
 
 
    void addContribModfile (LUTAB *contribTab, unsigned *numContribs, 
       char *modFile, char *binParm, int binCnt
    )
    /* Add light source modifiers from file modFile to contribution lookup table
     * contribTab, and update numContribs. 
     * NOTE: This code is adapted from rcontrib.c */
    {
       char  *mods [MAXMODLIST];
       int   i;
       
       /* Find file and store strings */
       i = wordfile(mods, MAXMODLIST, getpath(modFile, getrlibpath(), R_OK));
       if (i < 0) {
          sprintf(errmsg, "can't open modifier file %s", modFile);
          error(SYSTEM, errmsg);
       }
       if (*numContribs + i >= MAXMODLIST - 1) {
          sprintf(errmsg, "too many modifiers (max. %d) in file %s", 
             MAXMODLIST - 1, modFile
          );
          error(INTERNAL, errmsg);
       }
 
       for (i = 0; mods [i]; i++)
          /* Add each modifier */
          addContribModifier(contribTab, numContribs, 
             mods [i], binParm, binCnt
          );
    }
 
 
 
    static int contribSourceBin (LUTAB *contribs, const RAY *ray)
    /* Map contribution source ray to its bin for light source ray -> rsrc,
       using Shirley-Chiu disk-to-square mapping.
       Return invalid bin -1 if mapping failed. */
    {
       const SRCREC   *src;
       const OBJREC   *srcMod;
       const MODCONT  *srcCont;
       RAY            srcRay;
       int            bin, i;
 
       /* Check we have a valid ray and contribution LUT */
       if (!ray || !contribs)
          return -1;
 
       src = &source [ray -> rsrc];
       srcMod = findmaterial(src -> so);
       srcCont = (MODCONT*)lu_find(contribs, srcMod -> oname) -> data;
 
       if (!srcCont)
          /* Not interested in this source (modifier not in contrib LUT) */
          return -1;
 
       /* Set up shadow ray pointing to source for disk2square mapping */
       rayorigin(&srcRay, SHADOW, NULL, NULL);
       srcRay.rsrc = ray -> rsrc;
       VCOPY(srcRay.rorg, ray -> rop);
       for (i = 0; i < 3; i++)
          srcRay.rdir [i] = -ray -> rdir [i];
               
       if (!(src -> sflags & SDISTANT 
          ? sourcehit(&srcRay) 
          : (*ofun[srcMod -> otype].funp)(srcMod, &srcRay)
       ))
          /* (Redundant?) sanity check for valid source ray? */
          return -1;
    #if 0
       worldfunc(RCCONTEXT, &srcRay);
    #endif      
       set_eparams((char*)srcCont -> params);
 
       if ((bin = ray2bin(&srcRay, sqrt(srcCont -> nbins))) < 0)
          error(WARNING, "Ignoring invalid bin in contribSourceBin()");
 
       return bin;
    }
 
 
 
    PhotonContribSourceIdx newPhotonContribSource (PhotonMap *pmap, 
       const RAY *contribSrcRay, FILE *contribSrcHeap
    )
    /* Add contribution source ray for emitted contribution photon and save 
     * light source index and binned direction. The current contribution 
     * source is stored in pmap -> lastContribSrc. If the previous contrib
     * source spawned photons (i.e. has srcIdx >= 0), it's appended to 
     * contribSrcHeap. If contribSrcRay == NULL, the current contribution 
     * source is still flushed, but no new source is set. Returns updated 
     * contribution source counter pmap -> numContribSrc. */
    {
       if (!pmap || !contribSrcHeap)
          return 0;
          
       /* Check if last contribution source has spawned photons (srcIdx >= 0, 
        * see newPhoton()), in which case we save it to the heap file before 
        * clobbering it. (Note this is short-term storage, so we doan' need 
        * da portable I/O stuff here). */
       if (pmap -> lastContribSrc.srcIdx >= 0) {
          if (!fwrite(&pmap -> lastContribSrc, sizeof(PhotonContribSource), 
             1, contribSrcHeap
          ))
             error(SYSTEM, "failed writing photon contrib source in "
                "newPhotonContribSource()"
             );
             
          pmap -> numContribSrc++;
          if (!pmap -> numContribSrc || 
             pmap -> numContribSrc > PMAP_MAXCONTRIBSRC
          );
       }
 
       if (contribSrcRay) {
          /* Mark this contribution source unused with a negative source
             index until its path spawns a photon (see newPhoton()) */
          pmap -> lastContribSrc.srcIdx = -1;
 
          /* Map ray to bin in anticipation that this contrib source will be
             used, since the ray will be lost once a photon is spawned */
          pmap -> lastContribSrc.srcBin = contribSourceBin(
             pmapContribTab, contribSrcRay
          );
 
          if (pmap -> lastContribSrc.srcBin < 0) {
             /* Warn if invalid bin, but trace photon nonetheless. It will
                count as emitted to prevent bias, but will not be stored in
                newPhoton(), as it contributes zero flux */
             sprintf(errmsg, "invalid bin for light source %s, "
                "contribution photons will be discarded", 
                source [contribSrcRay -> rsrc].so -> oname
             );
             error(WARNING, errmsg);
          }
       }
 
       return pmap -> numContribSrc;
    }
 
 
 
    PhotonContribSourceIdx buildContribSources (PhotonMap *pmap, 
       FILE **contribSrcHeap, char **contribSrcHeapFname, 
       PhotonContribSourceIdx *contribSrcOfs, unsigned numHeaps
    )
    /* Consolidate per-subprocess contribution source heaps into array
     * pmap -> contribSrc. Returns offset for contribution source index 
     * linearisation in pmap -> numContribSrc. The heap files in 
     * contribSrcHeap are closed on return. */
    {
       PhotonContribSourceIdx  heapLen;
       unsigned                heap;
 
       if (!pmap || !contribSrcHeap || !contribSrcOfs || !numHeaps)
          return 0;
 
       pmap -> numContribSrc = 0;
 
       for (heap = 0; heap < numHeaps; heap++) {
          contribSrcOfs [heap] = pmap -> numContribSrc;
          
          if (fseek(contribSrcHeap [heap], 0, SEEK_END) < 0)
             error(SYSTEM, "failed photon contrib source seek "
                "in buildContribSources()"
             );
          pmap -> numContribSrc += heapLen = 
             ftell(contribSrcHeap [heap]) / sizeof(PhotonContribSource);
 
          if (!(pmap -> contribSrc = realloc(pmap -> contribSrc, 
             pmap -> numContribSrc * sizeof(PhotonContribSource)
          ))) 
             error(SYSTEM, "failed photon contrib source alloc "
                "in buildContribSources()"
             );
 
          rewind(contribSrcHeap [heap]);
          if (fread(pmap -> contribSrc + contribSrcOfs [heap], 
             sizeof(PhotonContribSource), heapLen, contribSrcHeap [heap]
          ) != heapLen)
             error(SYSTEM, "failed reading photon contrib source "
                "in buildContribSources()"
             );
          
          fclose(contribSrcHeap [heap]);
          unlink(contribSrcHeapFname [heap]);
       }
       
       return pmap -> numContribSrc;
    }
 
 
 
    /* ----------------- CONTRIB PMAP ENCODING STUFF -------------------- */
 
 
 
    static int coeffCompare (const void *c1, const void *c2)
    /* Comparison function to REVERSE sort thresholded coefficients */
    {
       const PreComputedContribCoeff *tcoeff1 = c1, *tcoeff2 = c2;
       /* Use dot product as magnitude to compare _absolute_ values */
       const WAVELET_COEFF  v1 = DOT(tcoeff1 -> coeff, tcoeff1 -> coeff),
                            v2 = DOT(tcoeff2 -> coeff, tcoeff2 -> coeff);
       
       if (v1 < v2)
          return 1;
       else if (v1 > v2)
          return -1;
       else
          return 0;
    }
 
 
 
    static int thresholdContribs (PreComputedContrib *preCompContrib)
-   /* Threshold wavelet detail coefficients in preCompContrib ->
-      waveletMatrix [1..coeffDim-1] [1..coeffDim-1] by keeping the
-      (preCompContrib -> nCompressedCoeffs) largest of these and returning them
-      in preCompContrib -> compressedCoeffs along with their original order. 
-      NOTE: preCompContrib -> waveletMatrix [0][0] is the average wavelet
-      coefficient and excluded from thresholding.  
-      Returns 0 on success. */
+   /* Threshold wavelet detail coefficients in preCompContrib -> 
+      waveletMatrix [approxDim..coeffDim-1] [approxDim..coeffDim-1] (where
+      approxDim = WAVELET_PADD4_APPROXDIM) by keeping the (preCompContrib -> 
+      nCompressedCoeffs) largest of these and returning them in
+      preCompContrib -> compressedCoeffs along with their original matrix
+      indices. 
+      NOTE: The wavelet approximation coefficients 
+      preCompContrib -> waveletMatrix [0..approxDim-1] [0..approxDim-1] 
+      and excluded from thresholding to minimise compression artefacts.
+      Returns 0 on success, else -1. */
    {
-      unsigned                i, j, coeffDim, coeffIdx;
+      unsigned                i, j, coeffDim, coeffIdx, nNzDetailCoeffs,
+                              numThresh;
       WaveletMatrix2          waveletMatrix;
-      PreComputedContribCoeff *threshCoeffs;
+      PreComputedContribCoeff *threshCoeffs, *threshCoeffPtr;
       
       if (!preCompContrib || !(coeffDim = preCompContrib -> coeffDim) ||
          !(threshCoeffs = preCompContrib -> compressedCoeffs) ||
          !(waveletMatrix = preCompContrib -> waveletMatrix)
       )
          /* Should be initialised by caller! */
          return -1;
 
       /* Set up coefficient buffer for compression (thresholding), skipping
-       * coefficient at waveletMatrix [0][0] since it's the average and
-       * therefore not thresholded.  The 2D coefficient indices (matrix
-       * coords) are linearised to 1D.  */
-      for (i = 0; i < coeffDim; i++)
-         for (j = 0; j < coeffDim; j++) {
-            coeffIdx = PMAP_CONTRIB_XY2LIN(coeffDim, i, j);
-            
-            if (coeffIdx) {
-               /* Set up pointers to coeffs (sorted faster than 3 doubles)
-                  and remember original (linearised) index prior to sort */
-               threshCoeffs [coeffIdx - 1].idx = coeffIdx;
-               threshCoeffs [coeffIdx - 1].coeff = (WAVELET_COEFF*)&(
+       * the approximation coefficients in the upper left of waveletMatrix,
+       * which are not thresholded.  Also skip zero coefficients (resulting
+       * from padding), since these are already implicitly thresholded.  The
+       * original 2D matrix indices are linearised to 1D and saved with each
+       * coefficient to restore the original sparse coefficient matrix.  */
+      for (i = 0, threshCoeffPtr = threshCoeffs; i < coeffDim; i++)
+         for (j = 0; j < coeffDim; j++)
+            if ((i >= WAVELET_PADD4_APPROXDIM || 
+                  j >= WAVELET_PADD4_APPROXDIM
+               ) && !coeffIsZero(waveletMatrix [i] [j])
+            ) {
+               /* Nonzero detail coefficient; set up pointer to coeff 
+                  (sorts faster than 3 doubles) and save original
+                  (linearised) matrix index */
+               threshCoeffPtr -> idx = PMAP_CONTRIB_XY2LIN(coeffDim, i, j);
+               threshCoeffPtr++ -> coeff = (WAVELET_COEFF*)&(
                   waveletMatrix [i] [j]
                );
             }
-         }
+      
+      /* Num of nonzero detail coeffs in buffer, number actually expected */
+      numThresh = threshCoeffPtr - threshCoeffs;
+      nNzDetailCoeffs = WAVELET_PADD4_NUMDETAIL(
+         preCompContrib -> nNonZeroCoeffs
+      );
+      
+      /* If fewer nonzero detail coeff are in the threshold buffer than
+       * anticipated, the loop below fills the remainder of the threshold
+       * buffer with duplicates of a coefficient in the lower right of the
+       * matrix, which is padding and guaranteed to be zero.  This condition
+       * can occur if the wavelet transform actually generated genuine zero
+       * detail coefficients.  Infact it's quite common if the wavelet
+       * transformed contributions are quite sparse.  */
+      for (i = j = coeffDim - 1; numThresh < nNzDetailCoeffs; numThresh++) {
+         threshCoeffPtr -> idx = PMAP_CONTRIB_XY2LIN(coeffDim, i, j);
+         threshCoeffPtr++ -> coeff = (WAVELET_COEFF*)(waveletMatrix [i][j]);
+      }
       
       /* REVERSE sort coeffs by magnitude; the non-thresholded coeffs
          will then start at threshCoeffs [0] (note nCoeffs == coeffDim^2) */
       /* XXX: PARTITIONING WOULD ACTUALLY BE SUFFICIENT AND FASTER! */
-      qsort(threshCoeffs, preCompContrib -> nCoeffs - 1,
-         sizeof(PreComputedContribCoeff), coeffCompare
+      qsort(threshCoeffs, nNzDetailCoeffs, sizeof(PreComputedContribCoeff), 
+         coeffCompare
       );
       
       return 0;
    }
 
 
 
    static int encodeContribs (PreComputedContrib *preCompContrib, 
       float compressRatio
    )
    /* Apply wavelet transform to input matrix preCompContrib ->
       waveletMatrix and compress according to compressRatio, storing
       thresholded and mRGBE-encoded coefficients in preCompContrib ->
-      mrgbeCoeffs.  Note that the average coefficient is not encoded, and
-      returned in preCompContrib -> waveletMatrix [0][0].  
+      mrgbeCoeffs.  Note that the approximation coefficients in the upper
+      left of the matrix are not encoded, and returned in 
+      preCompContrib -> waveletMatrix
+         [0..WAVELET_PADD4_APPROXDIM-1] [0..WAVELET_PADD4_APPROXDIM-1].
       Returns 0 on success.  */
    {
       unsigned                   i, j, k, scDim;
       WaveletMatrix2             waveletMatrix, tWaveletMatrix;
       PreComputedContribCoeff    *threshCoeffs;
       mRGBERange                 *mrgbeRange;
       mRGBE                      *mrgbeCoeffs;
       WAVELET_COEFF              absCoeff;
    #ifdef PMAP_CONTRIB_DBG
       WaveletCoeff3              decCoeff;
       unsigned                   decIdx;
    #endif 
       
       if (!preCompContrib || !preCompContrib -> mrgbeCoeffs ||
          !preCompContrib -> compressedCoeffs || 
          !(waveletMatrix = preCompContrib -> waveletMatrix) || 
          !(tWaveletMatrix = preCompContrib -> tWaveletMatrix) ||
          !(scDim = preCompContrib -> scDim)
       )
          /* Should be initialised by the caller! */
          return -1;
       
    #ifdef PMAP_CONTRIB_DBG
       for (i = 0; i < scDim; i++) {
          for (j = 0; j < scDim; j++) {
             for (k = 0; k < 3; k++) {
       #if 0
                /* Set contributions to bins for debugging */
                waveletMatrix [i] [j] [k] = PMAP_CONTRIB_XY2LIN(scDim, i, j);
       #else
                /* Replace contribs with "bump" function */
                waveletMatrix [i] [j] [k] =
                   (1. - fabs(i - scDim/2. + 0.5) / (scDim/2. - 0.5)) *
                   (1. - fabs(j - scDim/2. + 0.5) / (scDim/2. - 0.5));
       #endif
             }
                
       #if 0
             /* Dump contribs prior to encoding for debugging */         
             printf("% 7.3g\t", colorAvg(waveletMatrix [i] [j]));
          }
          putchar('\n');
       }
       putchar('\n');
       #else
          }
       }
       #endif
    #endif
 
       /* Do 2D wavelet transform on preCompContrib -> waveletMatrix */
       if (padWaveletXform2(waveletMatrix, tWaveletMatrix, scDim, NULL) !=
          preCompContrib -> coeffDim
       )
          error(INTERNAL, "failed wavelet transform in encodeContribs()");
 
       /* Compress wavelet detail coeffs by thresholding */
       if (thresholdContribs(preCompContrib) < 0)
          error(INTERNAL, "failed wavelet compression in encodeContribs()");
       threshCoeffs = preCompContrib -> compressedCoeffs;
 
       /* Init per-channel coefficient range for mRGBE encoding */
       mrgbeRange = &preCompContrib -> mrgbeRange;
       setcolor(mrgbeRange -> min, FHUGE, FHUGE, FHUGE);
       setcolor(mrgbeRange -> max, 0, 0, 0);
 
       /* Update per-channel coefficient range */
       for (i = 0; i < preCompContrib -> nCompressedCoeffs; i++) {
          for (k = 0; k < 3; k++) {
    #ifdef PMAP_CONTRIB_DBG
       #if 0
-            /* Replace wavelet coefficient with bin for debugging, ordering
-               bins linearly */ 
+            /* Replace wavelet coeff with its linear index for debugging */
             threshCoeffs [i].coeff [k] = threshCoeffs [i].idx = i + 1;
       #endif
    #endif
             absCoeff = fabs(threshCoeffs [i].coeff [k]);
             
             if (absCoeff < mrgbeRange -> min [k])
                mrgbeRange -> min [k] = absCoeff;
             if (absCoeff > mrgbeRange -> max [k])
                mrgbeRange -> max [k] = absCoeff;
          }
       }
       
       if (preCompContrib -> nCompressedCoeffs == 1)
          /* Maximum compression with just 1 (!) compressed detail coeff (is
           * this even useful?), so mRGBE range is undefined since min & max
           * coincide; set minimum to 0, maximum to the single remaining
           * coefficient */
          setcolor(mrgbeRange -> min, 0, 0, 0);
       
       /* Init mRGBE coefficient normalisation from range */
       if (!mRGBEinit(mrgbeRange, mrgbeRange -> min, mrgbeRange -> max))
          return -1;
          
       mrgbeCoeffs = preCompContrib -> mrgbeCoeffs;
 
       /* Encode wavelet detail coefficients to mRGBE */
       for (i = 0; i < preCompContrib -> nCompressedCoeffs; i++) {
          /* XXX: threshCoeffs [i].idx could overflow the mRGBE data field,
-          * since there are more padded coefficients than bins!!!
+          * since there are more padded coefficients than bins!!! (This is
+          * currently checked prior to precomp by addContribModifier()).
           * SORT BY IDX AND USE AN INCREMENTAL INDEXING HERE INSTEAD??? */
          mrgbeCoeffs [i] = mRGBEencode(threshCoeffs [i].coeff, 
             mrgbeRange, threshCoeffs [i].idx
          );
          if (!mrgbeCoeffs [i].all)
             error(INTERNAL, "failed mRGBE encoding in encodeContribs()");
 
    #ifdef PMAP_CONTRIB_DBG
          /* Encoding sanity check */
          decIdx = mRGBEdecode(mrgbeCoeffs [i], mrgbeRange, decCoeff);
       #if 0
          if (decCoeff != threshCoeffs [i].idx ||
             sqrt(dist2(decCoeff, threshCoeffs [i].coeff)) > 1.7
          )
             error(CONSISTENCY, "failed sanity check in encodeContribs()");
       #endif
          for (k = 0; k < 3; k++)
             if (decCoeff [k] / threshCoeffs [i].coeff [k] < 0)
                error(CONSISTENCY, 
                   "coefficient sign reversal in encodeContribs()"
                );
    #endif
       }
          
       return 0;
    }
 
 
 
    static void initContribHeap (PhotonMap *pmap)
    /* Initialise precomputed contribution heap */
    {
       int fdFlags;
       
       if (!pmap -> contribHeap) {
          /* Open heap file for precomputed contributions */
          mktemp(strcpy(pmap -> contribHeapFname, PMAP_TMPFNAME));
          if (!(pmap -> contribHeap = fopen(pmap -> contribHeapFname, "w+b")
             )
          )
             error(SYSTEM, "failed opening precomputed contribution "
                "heap file in initContribHeap()"
             );
    #ifdef F_SETFL /* XXX is there an alternative needed for Wind0z? */
          fdFlags = fcntl(fileno(pmap -> contribHeap), F_GETFL);
          fcntl(fileno(pmap -> contribHeap), F_SETFL, fdFlags | O_APPEND);
    #endif   /* ftruncate(fileno(pmap -> heap), 0); */
       }
    }
 
 
 
    static MODCONT *getPhotonContrib (PhotonMap *pmap, RAY *ray, COLOR irrad)
    /* Locate photons near ray -> rop which originated from the light source
       modifier indexed by ray -> rsrc, and accumulate their contributions 
       in pmap -> srcContrib. Return total contributions in irrad and
       pointer to binned contributions (or NULL if none were found). */
    {
       const OBJREC            *srcMod = findmaterial(source [ray->rsrc].so);
       MODCONT                 *contrib = (MODCONT*)lu_find(
                                  pmapContribTab, srcMod -> oname
                               ) -> data;
       Photon                  *photon;
       COLOR                   flux;
       PhotonSearchQueueNode   *sqn;
       float                   r2, norm;
       unsigned                i;
 
       /* Zero bins for this source modifier */     
       memset(contrib -> cbin, 0, sizeof(DCOLOR) * contrib -> nbins);
       setcolor(irrad, 0, 0, 0);
     
       if (!contrib || !pmap -> maxGather) 
          /* Modifier not in LUT or zero bandwidth */
          return NULL;
    
       /* Lookup photons */
       pmap -> squeue.tail = 0;
       /* Pass light source index to filter in findPhotons() via 
          pmap -> lastContribSrc */ 
       pmap -> lastContribSrc.srcIdx = ray -> rsrc;
       findPhotons(pmap, ray);
       
       /* Need at least 2 photons */
       if (pmap -> squeue.tail < 2) {
    #ifdef PMAP_NONEFOUND
          sprintf(errmsg, "no photons found on %s at (%.3f, %.3f, %.3f)", 
             ray -> ro ? ray -> ro -> oname : "<null>",
             ray -> rop [0], ray -> rop [1], ray -> rop [2]
          );
          error(WARNING, errmsg);
    #endif
          return NULL;
       }
 
       /* Avg radius^2 between furthest two photons to improve accuracy */ 
       sqn = pmap -> squeue.node + 1;
       r2 = max(sqn -> dist2, (sqn + 1) -> dist2);
       r2 = 0.25 * (pmap -> maxDist2 + r2 + 2 * sqrt(pmap -> maxDist2 * r2));
 
       /* XXX: OMIT extra normalisation factor 1/PI for ambient calc? */
    #ifdef PMAP_EPANECHNIKOV
       /* Normalise accumulated flux by Epanechnikov kernel integral in 2D
          (see Eq. 4.4, p.76 in Silverman, "Density Estimation for 
          Statistics and Data Analysis", 1st Ed., 1986, and
          Wann Jensen, "Realistic Image Synthesis using Photon Mapping"),
          include RADIANCE-specific lambertian factor PI */
       norm = 2 / (PI * PI * r2);
    #else
       /* Normalise accumulated flux by search area PI * r^2, including
          RADIANCE-specific lambertian factor PI */
       norm = 1 / (PI * PI * r2);
    #endif 
 
       /* Skip the extra photon */
       for (i = 1 ; i < pmap -> squeue.tail; i++, sqn++) {
          /* Get photon's contribution to density estimate */
          photon = getNearestPhoton(&pmap -> squeue, sqn -> idx);
          getPhotonFlux(photon, flux);
          scalecolor(flux, norm);
    #ifdef PMAP_EPANECHNIKOV
          /* Apply Epanechnikov kernel to photon flux based on distance */
          scalecolor(flux, 1 - sqn -> dist2 / r2);
    #endif
          addcolor(irrad, flux);
          addcolor(contrib -> cbin [photonSrcBin(pmap, photon)], flux);
       }
 
       return contrib;
    }
    
    
    
    void freePreCompContribNode (void *p)
    /* Clean up precomputed contribution LUT entry */   
    {
       PhotonMap            *preCompContribPmap = (PhotonMap*)p;
       PreComputedContrib   *preCompContrib;
       
       if (preCompContribPmap) {
          preCompContrib = (PreComputedContrib*)(
             preCompContribPmap -> preCompContrib
          );
          
          if (preCompContrib) {
             /* Free primary and transposed wavelet matrices */
             freeWaveletMatrix2(preCompContrib -> waveletMatrix,
                preCompContrib -> coeffDim
             );
             freeWaveletMatrix2(preCompContrib -> tWaveletMatrix, 
                preCompContrib -> coeffDim
             );
          
             /* Free thresholded coefficients */
             free(preCompContrib -> compressedCoeffs);
          
             /* Free mRGBE encoded coefficients */
             free(preCompContrib -> mrgbeCoeffs);
             free(preCompContrib -> waveletFname);
             
             if (preCompContrib -> cache) {
                /* Free contribution cache */
                OOC_DeleteCache(preCompContrib -> cache);
                free(preCompContrib -> cache);
             }
          }
          
          /* Clean up precomputed contrib photon map */
          deletePhotons(preCompContribPmap);
          free(preCompContribPmap);
       }
    }
 
 
 
    void initPreComputedContrib (PreComputedContrib *preCompContrib)
    /* Initialise precomputed contribution container in photon map */
    {
       preCompContrib -> waveletFname = NULL;
       preCompContrib -> waveletFile = NULL;
       preCompContrib -> waveletMatrix = 
          preCompContrib -> tWaveletMatrix = NULL;
       preCompContrib -> compressedCoeffs = NULL;
       setcolor(preCompContrib -> mrgbeRange.min, 0, 0, 0);
       setcolor(preCompContrib -> mrgbeRange.max, 0, 0, 0);
       setcolor(preCompContrib -> mrgbeRange.norm, 0, 0, 0);
       preCompContrib -> mrgbeCoeffs = NULL;
       preCompContrib -> scDim = 
          preCompContrib -> nBins = 
          preCompContrib -> coeffDim = 
          preCompContrib -> nCoeffs = 
          preCompContrib -> nNonZeroCoeffs = 
          preCompContrib -> nCompressedCoeffs = 
          preCompContrib -> contribSize = 0;
       preCompContrib -> cache = NULL;
    }
 
 
 
    void preComputeContrib (PhotonMap *pmap)
    /* Precompute contributions for a random subset of (finalGather * 
       pmap -> numPhotons) photons, and return the per-modifier precomputed
       contribution photon maps in LUT preCompContribTab, discarding the
       original photons.  */
    {
       unsigned long        p, numPreComp;
-      unsigned             i, scDim, nBins, 
-                           coeffDim, nCoeffs, nCompressedCoeffs;
+      unsigned             i, j, scDim, nBins, 
+                           coeffDim, nCoeffs, nCompressedCoeffs,
+                           nNzDetailCoeffs;
       PhotonIdx            pIdx;
       Photon               photon;
       RAY                  ray;
       MODCONT              *binnedContribs;
-      COLR                 mrgbeRange32 [2]; 
+      COLR                 rgbe32 [WAVELET_PADD4_NUMAPPROX + 2]; 
       LUENT                *preCompContribNode;
       PhotonMap            *preCompContribPmap, nuPmap;
       PreComputedContrib   *preCompContrib;
       LUTAB                lutInit = LU_SINIT(NULL, freePreCompContribNode);
       WaveletMatrix2       waveletMatrix;
    #ifdef PMAP_CONTRIB_DBG
       RAY                  dbgRay;
    #endif
       
       if (verbose) {
          sprintf(errmsg, 
             "\nPrecomputing contributions for %ld photons\n", numPreComp
          );
          eputs(errmsg);
    #if NIX
          fflush(stderr);
    #endif
       }
 
       /* Init new parent photon map and set output filename */
       initPhotonMap(&nuPmap, pmap -> type);
       nuPmap.fileName = pmap -> fileName;
       
       /* Set contrib/coeff mode in new parent photon map */
       nuPmap.contribMode = pmap -> contribMode;
       
       /* Allocate and init LUT containing per-modifier child photon maps */
       if (!(nuPmap.preCompContribTab = malloc(sizeof(LUTAB))))
          error(SYSTEM, 
             "out of memory allocating LUT in preComputeContrib()"
          );
       memcpy(nuPmap.preCompContribTab, &lutInit, sizeof(LUTAB));
       
       /* Record start time, baby */
       repStartTime = time(NULL);
    #ifdef SIGCONT
       signal(SIGCONT, pmapPreCompReport);
    #endif
       repComplete = numPreComp = finalGather * pmap -> numPhotons;
       repProgress = 0;
    
       photonRay(NULL, &ray, PRIMARY, NULL);
       ray.ro = NULL;
       
       for (p = 0; p < numPreComp; p++) {
          /* Get random photon from stratified distribution in source heap 
           * to avoid duplicates and clustering */
          pIdx = firstPhoton(pmap) + (unsigned long)(
             (p + pmapRandom(pmap -> randState)) / finalGather
          );
          getPhoton(pmap, pIdx, &photon);
 
          /* Init dummy photon ray with intersection and normal at photon
             position. Set emitting light source index from origin. */
          VCOPY(ray.rop, photon.pos);
          ray.rsrc = photonSrcIdx(pmap, &photon);
          for (i = 0; i < 3; i++)
             ray.ron [i] = photon.norm [i] / 127.0;
 
    #ifndef PMAP_CONTRIB_DBG
          /* Get contributions at photon position; retry with another 
             photon if no contribs found */
          binnedContribs = getPhotonContrib(pmap, &ray, ray.rcol);
    #else
          /* Get all contribs from same photon for debugging */
          /* Position must differ otherwise too many identical photon keys
           * will cause ooC octree to overflow! */ 
          VCOPY(dbgRay.rop, photon.pos);
          getPhoton(pmap, 0, &photon);
          memcpy(&dbgRay, &ray, sizeof(RAY));
          dbgRay.rsrc = photonSrcIdx(pmap, &photon);
          for (i = 0; i < 3; i++)
             dbgRay.ron [i] = photon.norm [i] / 127.0;
          binnedContribs = getPhotonContrib(pmap, &dbgRay, dbgRay.rcol);
    #endif
 
          if (binnedContribs) {
    #if 0
             if (!(p & PMAP_CNTUPDATE))
                printf("Precomputing contribution photon %lu / %lu\n", 
                   p, numPreComp
                );
    #endif
             /* Found contributions at photon position, so generate
              * precomputed photon */
             if (!(preCompContribNode = lu_find(nuPmap.preCompContribTab, 
                   binnedContribs -> modname)
                )
             )
                error(SYSTEM, "out of memory allocating LUT entry in "
                   "preComputeContrib()"
                );
 
             if (!preCompContribNode -> key) {
                /* New LUT node for precomputed contribs for this modifier */
                preCompContribNode -> key = (char*)binnedContribs -> modname;
                preCompContribNode -> data = (char*)(
                   preCompContribPmap = malloc(sizeof(PhotonMap))
                );
 
                if (preCompContribPmap) {
                   /* Init new child photon map and its contributions */
                   initPhotonMap(preCompContribPmap, 
                      PMAP_TYPE_CONTRIB_CHILD
                   );
                   initPhotonHeap(preCompContribPmap);
                   initContribHeap(preCompContribPmap);
                   preCompContrib = (PreComputedContrib*)(
                      preCompContribPmap -> preCompContrib = 
                         malloc(sizeof(PreComputedContrib))
                   );
                   initPreComputedContrib(preCompContrib);
                }
                if (!preCompContribPmap || !preCompContrib)
                   error(SYSTEM, "out of memory allocating new photon map "
                      "in preComputeContrib()"
                   );
 
                /* Set output filename from parent photon map */
                preCompContribPmap -> fileName = nuPmap.fileName;
                
                /* Set contrib/coeff mode from parent photon map */
                preCompContribPmap -> contribMode = nuPmap.contribMode;
 
                /* Set Shirley-Chiu square & wavelet matrix dimensions
                   (number of bins resp. coefficients). */
                preCompContrib -> scDim = scDim = sqrt(
                   preCompContrib -> nBins = nBins = binnedContribs -> nbins
                );
                
                if (scDim * scDim != nBins)
                   /* Mkpmap shoulda took care of dis */
                   error(INTERNAL, "contribution bin count not "
                      "integer square in preComputeContrib()"
                   );
                   
                if (nBins > 1) {
                   /* BINNING ENABLED; set up wavelet xform & compression.
                      Get dimensions of wavelet coefficient matrix after
                      boundary padding, and number of nonzero coefficients. 
-                     The number of compressed coefficients is fixed and
-                     based on the number of nonzero coefficients (minus one
-                     as the average coefficient is not thresholded), since
-                     zero coeffs are already thresholded by default.  */
+                     The number of compressed (detail) coefficients is fixed
+                     and based on the number of NONZERO coefficients (minus
+                     approximations, see NOTE below) since zero coeffs are
+                     considered thresholded by default.
+                     
+                     !!! NOTE: THE APPROXIMATION COEFFICIENTS IN THE UPPER
+                     !!! LEFT OF THE MATRIX ARE _NEVER_ THRESHOLDED TO
+                     !!! MINIMISE COMPRESSION ARTEFACTS!  THEY ARE ESSENTIAL
+                     !!! FOR RECONSTRUCTING THE ORIGINAL CONTRIBUTIONS.  */
                   preCompContrib -> coeffDim = coeffDim = padWaveletXform2(
                      NULL, NULL, scDim, &preCompContrib -> nNonZeroCoeffs 
                   );
                   preCompContrib -> nCoeffs = nCoeffs = coeffDim * coeffDim;
-                  nCompressedCoeffs = (1 - compressRatio) * 
-                     (preCompContrib -> nNonZeroCoeffs - 1);
+                  nNzDetailCoeffs = WAVELET_PADD4_NUMDETAIL(
+                     preCompContrib -> nNonZeroCoeffs
+                  );
+                  nCompressedCoeffs = (1 - compressRatio) * nNzDetailCoeffs;
+                     
                   if (nCompressedCoeffs < 1) {
                      error(WARNING, 
                         "maximum contribution compression, clamping number "
                         "of wavelet coefficients in preComputeContrib()"
                      );
                      nCompressedCoeffs = 1;
                   }
                   if (nCompressedCoeffs >= preCompContrib -> nCoeffs) {
                      error(WARNING, 
                         "minimum contribution compression, clamping number "
                         "of wavelet coefficients in preComputeContrib()"
                      );
-                     nCompressedCoeffs = nCoeffs - 1;
+                     nCompressedCoeffs = nNzDetailCoeffs;
                   } 
                   preCompContrib -> nCompressedCoeffs = nCompressedCoeffs;
 
                   /* Lazily allocate primary and transposed wavelet
                    * coefficient matrix */
                   preCompContrib -> waveletMatrix = waveletMatrix =
                      allocWaveletMatrix2(coeffDim);
-                  preCompContrib -> tWaveletMatrix = allocWaveletMatrix2(
-                     coeffDim
-                  );
+                  preCompContrib -> tWaveletMatrix = 
+                     allocWaveletMatrix2(coeffDim);
                   if (!waveletMatrix || !preCompContrib -> tWaveletMatrix)
                      error(SYSTEM, "out of memory allocating wavelet "
                         "coefficient matrix in preComputeContrib()"
                      );
                   
-                  /* Lazily allocate thresholded detail coefficients (minus
-                     the average coeff) */
+                  /* Lazily allocate thresholded detail coefficients */
                   preCompContrib -> compressedCoeffs = calloc(
-                     nCoeffs - 1, sizeof(PreComputedContribCoeff)
+                     nNzDetailCoeffs, sizeof(PreComputedContribCoeff)
                   );
                   
                   /* Lazily alloc mRGBE-encoded compressed wavelet coeffs */
                   preCompContrib -> mrgbeCoeffs = calloc(
                      nCompressedCoeffs, sizeof(mRGBE)
                   );
                   if (!preCompContrib -> compressedCoeffs || 
                      !preCompContrib -> mrgbeCoeffs
                   )
                      error(SYSTEM, "out of memory allocating compressed "
                         "contribution buffer in preComputeContrib()"
                      );
                   
                   /* Set size of compressed contributions, in bytes */
                   preCompContrib -> contribSize = PMAP_CONTRIB_ENCSIZE(
                      nCompressedCoeffs
                   );
                }
             }
             else {
                /* LUT node already exists for this modifier; use its data */
                preCompContribPmap = (PhotonMap*)preCompContribNode -> data;
                preCompContrib = (PreComputedContrib*)(
                   preCompContribPmap -> preCompContrib
                );
                scDim             = preCompContrib -> scDim;
                nBins             = preCompContrib -> nBins;
                nCoeffs           = preCompContrib -> nCoeffs;
                nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
                waveletMatrix     = preCompContrib -> waveletMatrix;
             }
             
             if (nBins > 1) {
-               /* Binning enabled; copy binned contribs to wavelet xform
-                * input matrix, then compress & encode */
+               /* Binning enabled; copy binned contribs row-by-row to
+                * wavelet xform input matrix, then compress & encode */
                for (i = 0; i < scDim; i++)
                   memcpy(waveletMatrix [i], 
                      binnedContribs->cbin + PMAP_CONTRIB_XY2LIN(scDim, i, 0),
                      scDim * WAVELET_COEFFSIZE
                   );
 
                if (encodeContribs(preCompContrib, compressRatio))
                   error(INTERNAL, "failed contribution "
                      "compression/encoding in preComputeContrib()"
                   );
 
-               /* Dump encoded wavelet coeffs to contrib heap file,
-                  prepended by mRGBE range in 32-bit RGBE.  NOTE: minimum
-                  and maximum are reversed in the file to facilitate
-                  handling the special (but pointless) case of a single
-                  compressed coeff (see below) */
-               setcolr(mrgbeRange32 [0], 
+               /* Encode wavelet approx coeffs in the upper left of the
+                * wavelet matrix to 32-bit RGBE.  These are not thresholded
+                * to minimise compression artefacts.  */
+               for (i = 0; i < WAVELET_PADD4_APPROXDIM; i++)
+                  for (j = 0; j < WAVELET_PADD4_APPROXDIM; j++)
+                     setcolr(
+                        rgbe32 [PMAP_CONTRIB_XY2LIN(
+                           WAVELET_PADD4_APPROXDIM, i, j
+                        )],
+                        waveletMatrix [i][j][0], waveletMatrix [i][j][1], 
+                        waveletMatrix [i][j][2]
+                     );
+
+               /* Encode wavelet detail coeff range to 32-bit RGBE */
+               setcolr(rgbe32 [WAVELET_PADD4_NUMAPPROX], 
                   preCompContrib -> mrgbeRange.max [0],
                   preCompContrib -> mrgbeRange.max [1],
                   preCompContrib -> mrgbeRange.max [2]
                );
-               setcolr(mrgbeRange32 [1], 
+               setcolr(rgbe32 [WAVELET_PADD4_NUMAPPROX + 1], 
                   preCompContrib -> mrgbeRange.min [0],
                   preCompContrib -> mrgbeRange.min [1], 
                   preCompContrib -> mrgbeRange.min [2]
                );
+
+               /* Dump 32-bit RGBE approx coeffs followed by mRGBE range to
+                * contribution heap file.  NOTE: mRGBE range minimum and
+                * maximum are reversed in the file to facilitate handling
+                * the special (but pointless) case of a single compressed
+                * coeff; if so, only the mRGBE maximum is dumped, since the
+                * minimum is implicitly zero and can be omitted to save
+                * space */
+               if (putbinary(rgbe32, sizeof(COLR), 
+                     WAVELET_PADD4_NUMAPPROX + 1 + (nCompressedCoeffs > 1),
+                     preCompContribPmap -> contribHeap
+                  ) != WAVELET_PADD4_NUMAPPROX + 1 + (nCompressedCoeffs > 1)
+               )
+                  error(SYSTEM, 
+                     "can't write wavelet approximation coefficients to "
+                     "contribution heap in preComputeContrib()"
+                  );
                
-               /* Dump only mRGBE maximum if 1 compressed coeff (maximum
-                * compression, pointless as it may be!), since minimum is
-                * implicitly zero and can be omitted to save space */
-               putbinary(mrgbeRange32, sizeof(COLR), 
-                  1 + (nCompressedCoeffs > 1), 
-                  preCompContribPmap -> contribHeap
-               );
-               
+               /* Now dump mRGBE encoded, compressed detail coefficients */
                if (putbinary(preCompContrib -> mrgbeCoeffs, sizeof(mRGBE),
                      nCompressedCoeffs, preCompContribPmap -> contribHeap
                   ) != nCompressedCoeffs
                )
-                  error(SYSTEM, "failed writing to contribution heap "
-                     "in preComputeContrib()"
+                  error(SYSTEM, 
+                     "can't write wavelet detail coefficients to "
+                     "contribution heap in preComputeContrib()"
                   );
             }
-            
+   #if 1
             /* HACK: signal newPhoton() to set precomputed photon's 
                contribution source from ray -> rsrc */
             /* XXX: DO WE STILL NEED THIS CRAP AFTER PRECOMP, 
                SINCE CHILD PMAPS ARE PER-MODIFIER ANYWAY? */
             preCompContribPmap -> lastContribSrc.srcIdx = -2;
-            
+   #endif    
             /* Append photon to new heap from ray and increment total 
                count for all modifiers in parent photon map */
             newPhoton(preCompContribPmap, &ray);
             nuPmap.numPhotons++;
          }
 
          /* Update progress */
          repProgress++;
 
          if (photonRepTime > 0 && time(NULL) >= repLastTime + photonRepTime)
             pmapPreCompReport();
    #ifdef SIGCONT
          else signal(SIGCONT, pmapPreCompReport);
    #endif
       }
 
       /* Trash original pmap and its leaf file, and replace with new parent,
          which now acts as container for the per-modifier child pmaps */
       unlink(pmap -> store.leafFname);
       deletePhotons(pmap);
       memcpy(pmap, &nuPmap, sizeof(PhotonMap));
 
    #ifdef SIGCONT
       signal(SIGCONT, SIG_DFL);
    #endif
 
       /* Bail out if no photons could be precomputed */
       if (!pmap -> numPhotons)
          error(USER, 
             "no contribution photons precomputed; try increasing -am"
          );
          
       /* Build per-modifier precomputed photon maps from their contribution
          heaps */
       lu_doall(pmap -> preCompContribTab, buildPreCompContribPmap, NULL);
    }
 
 
 
 #else
    /* -------------------------------------------------------------------
       U N I T   T E S T S
       ------------------------------------------------------------------- */
 
    #include <stdio.h>
    #include <random.h>
    #include "func.h"
    
    int main (int argc, char *argv [])
    {
       unsigned       i, scdim, nsamp, numTheta = 0, numPhi = 0;
       double         t, p;
       RAY            ray;
       int            bin;
 
       if (argc < 3) {
          fprintf(stderr, "%s <scdim> <nsamp> [<var>=<value>; ..]\n", 
             argv [0]
          );
          fprintf(stderr, "Default variable defs: %s\n", 
             PMAP_CONTRIB_SCDEFAULTS
          );
          fputs("\nMissing Shirley-Chiu dimension scDim>1, "
             "number of samples nsamp\n",
             stderr
          );
          return -1;
       }
       
       if (!(scdim = atoi(argv [1]))) {
          fputs("Invalid Shirley-Chiu dimension\n", stderr);
          return -1;
       }
       
       if (!(nsamp = atoi(argv [2]))) {
          fputs("Invalid num samples\n", stderr);
          return -1;
       }
       else {
          numTheta = (int)(sqrt(nsamp) / 2);
          numPhi = 4* numTheta;
       }
       
       /* (Doan' need to) Init cal func routines for binning */
    #if 0
       initfunc();
       setcontext(RCCONTEXT);
    #endif
    
       /* Compile default orientation variables for contribution binning */
       scompile(PMAP_CONTRIB_SCDEFAULTS, NULL, 0);
       /* Compile custom orientation variabls from command line */
       for (i = 3; i < argc; i++)
          scompile(argv [i], NULL, 0);
       
       for (i = 0; i < nsamp; i++) {
          #if 0
             /* Random */
             t = 0.5 * PI * drand48();
             p = 2 * PI * drand48();
          #else
             /* Stratified */
             t = 0.5 * PI * ((i % numTheta) + drand48()) / numTheta;
             p = 2.0 * PI * ((i / numTheta) + drand48()) / numPhi;
          #endif
          ray.rdir [0] = sin(t) * cos(p);
          ray.rdir [1] = sin(t) * sin(p);
          ray.rdir [2] = cos(t);
          bin = ray2bin(&ray, scdim);
          
          printf("%.3f\t%.3f\t%.3f\t-->\t%d\n", 
             ray.rdir [0], ray.rdir [1], ray.rdir [2], bin
          );
       }
       
       return 0;
    }
 #endif
 
 #endif /* PMAP_CONTRIB */
diff --git a/pmapcontrib.h b/pmapcontrib.h
index fa7aa36..600cbf9 100644
--- a/pmapcontrib.h
+++ b/pmapcontrib.h
@@ -1,224 +1,226 @@
 /* RCSid $Id: pmapcontrib.h,v 2.5 2016/05/17 17:39:47 rschregle Exp $ */
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions; these
    routines interface to mkpmap and rcontrib.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id: pmapcontrib.h,v 2.5 2016/05/17 17:39:47 rschregle Exp $
 */
 
 
 
 #ifndef _PMAPCONTRIB_H
    #define _PMAPCONTRIB_H
    
    #if defined(PMAP_OOC) && !defined(PMAP_CONTRIB)
       #define PMAP_CONTRIB
    #endif
       
    #include "pmapdata.h"
    #include "wavelet2.h"
    #include "mrgbe.h"
    #ifndef MAXPROCESS
       #include "rcontrib.h"
    #endif
    
    #ifndef MAXMODLIST 
       /* Max number of contributing sources */
       #define MAXMODLIST                  1024 
    #endif
    
    /* Filename templates for per-modifier contrib photon maps and wavelet
       coeffs; these are held in a separate subdirectory PMAP_CONTRIB_DIR */
    #define PMAP_CONTRIB_DIR               "%s.rc"
    #define PMAP_CONTRIB_FILE              "%s/%s.pm"
    #define PMAP_CONTRIB_WAVELETFILE       "%s/%s.wvt"
    
    /* The following variables can be specified to override the orientation
       of the Shirley-Chiu mapping (see also disk2square.cal).
       We use the built-in functions in disk2square.c for efficiency and
       in order to not depend on an external function file. These variables
       merely mimic the function file interace.
 
       RHS            : right-hand-coordinate system (-1 for left-hand)
       rNx, rNy, rNz  : surface normal
       Ux, Uy, Uz     : up vector (defines phi = 0) */
    #define PMAP_CONTRIB_SCRHS             "RHS"
    #define PMAP_CONTRIB_SCNORMX           "rNx"
    #define PMAP_CONTRIB_SCNORMY           "rNy"
    #define PMAP_CONTRIB_SCNORMZ           "rNz"
    #define PMAP_CONTRIB_SCUPX             "Ux"
    #define PMAP_CONTRIB_SCUPY             "Uy"
    #define PMAP_CONTRIB_SCUPZ             "Uz"
    #define PMAP_CONTRIB_SCDEFAULTS        ( \
       "RHS=1; rNx=0; rNy=0; rNz=1; Ux=0; Uy=1; Uz=0;" \
    )
    
    /* Maximum number of mRGBE encodable wavelet coefficients */
    #define PMAP_CONTRIB_MAXCOEFFS         (mRGBE_DATAMAX + 1)
    
-   /* Size of encoded coefficients in wavelet file (in bytes) as a function
-    * of the number of compressed coefficients nComp.  
+   /* Size of a set of encoded coefficients in wavelet file (in bytes) per
+    * photon.  This is a function of the number of (mRGBE encoded)
+    * compressed detail coefficients nComp, their mRGBE range, and the
+    * number of (uncompressed) approximation coefficients.      
     * NOTE: the coefficient range minimum is omitted from the mRGBE range if
     * nComp == 1, since it is implicitly zero!  */
-   #define PMAP_CONTRIB_ENCSIZE(nComp)    ( \
-      sizeof(COLR) * (1 + ((nComp) > 1)) + sizeof(mRGBE) * (nComp) \
+   #define PMAP_CONTRIB_ENCSIZE(nComp)    ((nComp) * sizeof(mRGBE) + \
+      (WAVELET_PADD4_NUMAPPROX + 1 + ((nComp) > 1)) * sizeof(COLR) \
    )
 
    /* Serialise 2D coordinates (x,y) in (dim x dim) Shirley-Chiu
       square to 1D index.  Returns -1 if coordinates invalid */
    #define PMAP_CONTRIB_XY2LIN(dim, x, y) ( \
       (x) < 0 || (y) < 0 ? -1 : (x) * (dim) + (y) \
    )
 
    /* Deserialise 1D index idx to 2D coordinates (x,y) in (dim x dim)
       Shirley-Chiu square.  Returns -1 if bin invalid */
    #define PMAP_CONTRIB_LIN2X(dim, idx)   ((idx) < 0 ? -1 : (idx) / (dim))
    #define PMAP_CONTRIB_LIN2Y(dim, idx)   ((idx) < 0 ? -1 : (idx) % (dim))
 
 
 
    /* Struct for wavelet coeff thresholding; saves original coeff index */
    typedef struct {
       WAVELET_COEFF  *coeff;
       unsigned       idx;
    } PreComputedContribCoeff;
    
    typedef struct {
       char                       *waveletFname;
       FILE                       *waveletFile;
       WaveletMatrix2             waveletMatrix, tWaveletMatrix;
       /* Wavelet coeff compression/encoding buffer */
       PreComputedContribCoeff    *compressedCoeffs;
       mRGBERange                 mrgbeRange;
       mRGBE                      *mrgbeCoeffs;
       unsigned                   scDim, nBins, coeffDim, nCoeffs, 
                                  nCompressedCoeffs, nNonZeroCoeffs;
       unsigned long              contribSize;
       OOC_Cache                  *cache;
    } PreComputedContrib;
 
 
 
    /* The following are convenient placeholders interfacing to mkpmap 
       and rcontrib. They can be externally set via initPmapContribTab()
       and referenced within the contrib pmap modules. These variables 
       can then be safely ignored by rtrace/rpict/rvu, without annoying
       linking errors. */
 
    /* Global pointer to rcontrib's contribution binning LUT */
    extern LUTAB *pmapContribTab;
    /* Contribution/coefficient mode flag */
    extern int *pmapContribMode;
 
 
    #if 0
       int xy2lin (unsigned scDim, int x, int y);
       /* Serialise 2D contribution coords in (scDim x scDim) Shirley-Chiu
          square to 1D index.  Returns -1 if coordinates invalid */
 
 
       void lin2xy (unsigned scDim, int bin, int *x, int *y); 
       /* Deserialise 1D contribution index to 2D coordinates in (scDim x
          scDim) Shirley-Chiu square.  Returns -1 if bin invalid */
    #endif
     
     
    MODCONT *addContribModifier(LUTAB *contribTab, unsigned *numContribs, 
       char *mod, char *binParm, int binCnt
    );
    /* Add light source modifier mod to contribution lookup table
       contribsTab, and update numContribs. Return initialised
       contribution data for this modifier. */
 
 
    void addContribModfile(LUTAB *contribTab, unsigned *numContribs, 
       char *modFile, char *binParm, int binCnt
    );
    /* Add light source modifiers from file modFile to contribution lookup
     * table contribTab, and update numContribs. */
 
 
    void freePreCompContribNode (void *p);
    /* Clean up precomputed contribution LUT entry */
 
 
    void initPmapContribTab (LUTAB *contribTab, int *contribMode);
    /* Set contribution table and contrib/coeff mode flag
       (interface to rcmain.c, see also pmapContribTab above) */
    
    
    void initPmapContrib (PhotonMap *pmap);
    /* Initialise contribution photon map and check modifiers. 
       NOTE: pmapContribTab must be set before via initPmapContribTab() */
    
    
    PhotonContribSourceIdx newPhotonContribSource (PhotonMap *pmap, 
       const RAY *contribSrcRay, FILE *contribSrcHeap
    );
    /* Add contribution source ray for emitted contribution photon and save
     * light source index and binned direction.  The current contribution
     * source is stored in pmap -> lastContribSrc.  If the previous
     * contribution source spawned photons (i.e.  has srcIdx >= 0), it's
     * appended to contribSrcHeap.  If contribSrcRay == NULL, the current
     * contribution source is still flushed, but no new source is set. 
     * Returns updated contribution source counter pmap -> numContribSrc. */
 
 
    PhotonContribSourceIdx buildContribSources (PhotonMap *pmap, 
       FILE **contribSrcHeap, char **contribSrcHeapFname, 
       PhotonContribSourceIdx *contribSrcOfs, unsigned numHeaps
    );
    /* Consolidate per-subprocess contribution sources heaps into array
     * pmap -> contribSrc. Returns offset for contribution source index 
     * linearisation in pmap -> numContribSrc. The heap files in 
     * contribSrcHeap are closed on return. */
 
 
    void initPreComputedContrib (PreComputedContrib *preCompContrib);
    /* Initialise precomputed contribution container in photon map */
    
    
    void preComputeContrib (PhotonMap *pmap);
    /* Precompute contributions for a random subset of (finalGather * 
       pmap -> numPhotons) photons, and init the per-modifier precomputed
       contribution photon maps in LUT pmap -> preCompContribTab, discarding
       the original photons. */
 
 
    void distribPhotonContrib (PhotonMap *pmap, LUTAB *contribTab, 
       unsigned numContribs, int *contribMode, unsigned numProc
    );
    /* Emit photons with binned light source contributions, precompute their
       contributions and build photon map */
 
 
    int buildPreCompContribPmap (const LUENT *preCompContribNode, void *p);
    /* Build per-modifier precomputed photon map. Returns 0 on success.  */
 
 
    void saveContribPhotonMap (const PhotonMap *pmap, const char *fname, 
       int argc, char **argv
    );
    /* Save contribution pmap and its per-modifier precomputed children */
 
 
    void loadContribPhotonMap (PhotonMap *pmap, const char *fname);
    /* Load contribution pmap and its per-modifier precomputed children */
 
     
    void getPreCompPhotonContrib (PhotonMap *pmap, RAY *ray, 
       COLOR totalContrib
    );
    /* Get precomputed light source contributions at ray -> rop for all
       specified modifiers and accumulate them in pmapContribTab (which maps
       to rcontrib's binning LUT). Also return total precomputed contribs. */
 #endif
diff --git a/pmapio.c b/pmapio.c
index 6b47224..d89b178 100644
--- a/pmapio.c
+++ b/pmapio.c
@@ -1,425 +1,426 @@
 #ifndef lint
 static const char RCSid[] = "$Id: pmapio.c,v 2.13 2021/02/18 17:08:50 rschregle Exp $";
 #endif
 
 /* 
    =========================================================================
    Photon map portable file I/O
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Fraunhofer Institute for Solar Energy Systems,
        supported by the German Research Foundation 
        (DFG LU-204/10-2, "Fassadenintegrierte Regelsysteme FARESYS")
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    (c) Tokyo University of Science,
        supported by the JSPS Grants-in-Aid for Scientific Research 
        (KAKENHI JP19KK0115, "Three-Dimensional Light Flow")
    =========================================================================
    
    $Id: pmapio.c,v 2.13 2021/02/18 17:08:50 rschregle Exp $
 */
 
 
 
 #include "pmapio.h"
 #include "pmapdiag.h"
 #include "pmapcontrib.h"
 #include "resolu.h"
 #include <sys/stat.h>
 
 
 
 extern char *octname;
 
 
 
 void savePhotonMap (const PhotonMap *pmap, const char *fname, 
    int argc, char **argv
 )
 {
    unsigned j;
    FILE     *file;
 
    if (!pmap || !pmap -> numPhotons || !validPmapType(pmap -> type)) {
       error(INTERNAL, "attempt to save empty or invalid photon map");
       return;
    }
       
    if (verbose) {
       sprintf(errmsg, "Saving %s (%ld photons)\n", 
          fname, pmap -> numPhotons
       );
       eputs(errmsg);
       fflush(stderr);
    }
    
    if (!(file = fopen(fname, "wb"))) {
       sprintf(errmsg, "can't open photon map file %s", fname);
       error(SYSTEM, errmsg);
    }
       
    /* Write header */
    newheader("RADIANCE", file);
    
    /* Write command line */
    printargs(argc, argv, file);
    
    /* Include statistics in info text */
    fprintf(file, 
       "NumPhotons\t= %ld\n"
       "AvgFlux\t\t= [%.3g, %.3g, %.3g]\n"
       "Bbox\t\t= [%.3g, %.3g, %.3g] [%.3g, %.3g, %.3g]\n"
       "CoG\t\t= [%.3g, %.3g, %.3g]\n"
       "MaxDist^2\t= %.3g\n"
       "Velocity\t= %g / sec\n"
       "Timespan\t= [%g, %g] sec\n"
       "AvgPathLen\t= %.3g\n",
       pmap -> numPhotons, pmap -> photonFlux [0], 
       pmap -> photonFlux [1], pmap -> photonFlux [2],
       pmap -> minPos [0], pmap -> minPos [1], pmap -> minPos [2],
       pmap -> maxPos [0], pmap -> maxPos [1], pmap -> maxPos [2],
       pmap -> CoG [0], pmap -> CoG [1], pmap -> CoG [2], 
       pmap -> CoGdist, pmap -> velocity, 
       pmap -> minPathLen / pmap -> velocity, 
       pmap -> maxPathLen / pmap -> velocity, 
       pmap -> avgPathLen
    );
    
 #ifdef PMAP_CONTRIB
    if (isContribChildPmap(pmap) && pmap -> preCompContrib) {
       /* Child contrib pmap containing precomputed contributions;
          append binning params to header */
       const PreComputedContrib *preCompContrib = (PreComputedContrib*)(
          pmap -> preCompContrib
       );
                                                       
       fprintf(file, "Photon Map contains precomputed %s\n",
          pmap -> contribMode ? "contributions" : "coefficients"
       );
       
       if (preCompContrib -> nBins > 1)
          fprintf(file,
             "Num bins\t= %u x %u = %u\n"
-            "Num coeffs\t= %u compressed / %u nonzero / %u total\n"
+            "Num coeffs\t= %u/%u compressed detail, %u approx, %u total\n"
             "Compression\t= %.1f%%\n",
             preCompContrib -> scDim, preCompContrib -> scDim, 
             preCompContrib -> nBins,
             preCompContrib -> nCompressedCoeffs, 
-            preCompContrib -> nNonZeroCoeffs, preCompContrib -> nCoeffs,
+            WAVELET_PADD4_NUMDETAIL(preCompContrib -> nNonZeroCoeffs),
+            WAVELET_PADD4_NUMAPPROX, preCompContrib -> nCoeffs,
             100 * (1 - 
                (float)preCompContrib -> nCompressedCoeffs / 
-               (preCompContrib -> nNonZeroCoeffs - 1)
+               WAVELET_PADD4_NUMDETAIL(preCompContrib -> nNonZeroCoeffs)
             )
          );
       else
          fputs("Binning disabled\n", file);
    }
 #endif
    
    /* Write format, including human-readable file version */
    fputformat((char *)pmapFormat [pmap -> type], file);
    fprintf(file, "VERSION=%s\n", PMAP_FILEVER);
    
    /* Empty line = end of header */
    putc('\n', file);
    
    /* Write machine-readable file format version */
    putstr(PMAP_FILEVER, file); 
    
    /* Write number of photons as fixed size, which possibly results in
     * padding of MSB with 0 on some platforms.  Unlike sizeof() however,
     * this ensures portability since this value may span 32 or 64 bits
     * depending on platform.  */
    putint(pmap -> numPhotons, PMAP_LONGSIZE, file);
    
    /* Write average photon flux */
    for (j = 0; j < 3; j++) 
       putflt(pmap -> photonFlux [j], file);
 
    /* Write max and min photon positions */
    for (j = 0; j < 3; j++) {
       putflt(pmap -> minPos [j], file);
       putflt(pmap -> maxPos [j], file);
    }
 
    /* Write centre of gravity */
    for (j = 0; j < 3; j++)
       putflt(pmap -> CoG [j], file);
       
    /* Write avg distance to centre of gravity */
    putflt(pmap -> CoGdist, file);
       
    /* Save photon's velocity (speed of light in units of scene geometry),
       min/max/avg photon path length (= timespan and temporal CoG) */
    putflt(pmap -> velocity, file);
    putflt(pmap -> minPathLen, file);
    putflt(pmap -> maxPathLen, file);
    putflt(pmap -> avgPathLen, file);
 
 #ifdef PMAP_CONTRIB
    if (isContribChildPmap(pmap) && pmap -> preCompContrib) {
       /* Child contrib photon map containing per-modifier precomputed 
          contributions; save mode, number of bins and number of wavelet
          coefficients */
       PreComputedContrib *preCompContrib = 
          (PreComputedContrib*)(pmap -> preCompContrib);
       
       putint(pmap -> contribMode, sizeof(pmap -> contribMode), file);
       putint(preCompContrib -> scDim, 
          sizeof(preCompContrib -> scDim), file
       );
       putint(preCompContrib -> coeffDim, 
          sizeof(preCompContrib -> coeffDim), file
       );
       putint(preCompContrib -> nNonZeroCoeffs,
          sizeof(preCompContrib -> nNonZeroCoeffs), file
       );
       putint(preCompContrib -> nCompressedCoeffs, 
          sizeof(preCompContrib -> nCompressedCoeffs), file
       );
    }
    else if (isContribPmap(pmap) && pmap -> preCompContribTab)
       /* Parent contrib photon map; save child photon maps containing
          per-modifier precomputed contributions */
       saveContribPhotonMap(pmap, fname, argc, argv);
 #endif
       
    /* Save photon storage (except if parent contribution photon map, which
     * just acts as a container for its children) */
 #ifdef PMAP_OOC
    if (
    #ifdef PMAP_CONTRIB
       !isContribPmap(pmap) && 
    #endif
       OOC_SavePhotons(pmap, file)
    ) {
 #else
    if (kdT_SavePhotons(pmap, file)) {
 #endif
       sprintf(errmsg, "error writing photon map file %s", fname);
       error(SYSTEM, errmsg);
    }
 
    fclose(file);
 }
 
 
 
 PhotonMapType loadPhotonMap (PhotonMap *pmap, const char *fname)
 {
    PhotonMapType  ptype = PMAP_TYPE_NONE;
    char           format [MAXFMTLEN];
    unsigned       j;
    FILE           *file;
 
    if (!pmap)
       return PMAP_TYPE_NONE;
    
    if ((file = fopen(fname, "rb")) == NULL) {
       sprintf(errmsg, "can't open photon map file %s", fname);
       error(SYSTEM, errmsg);
    }
 
    /* Get format string */
    strcpy(format, PMAP_FORMAT_GLOB);
    if (checkheader(file, format, NULL) != 1) {
       sprintf(errmsg, "photon map file %s has an unknown format", fname);
       error(USER, errmsg);
    }
    
    /* Identify photon map type from format string */
    for (ptype = 0; 
       ptype < NUM_PMAP_TYPES && strcmp(pmapFormat [ptype], format);
       ptype++
    );
       
    if (!validPmapType(ptype)) {
       sprintf(errmsg, "file %s contains an unknown photon map type", fname);
       error(USER, errmsg);
    }
 
    /* Init empty photon map of found type */
    initPhotonMap(pmap, ptype);
    
    /* Set filename (prefixed by contrib pmap for per-modifier files) */
    pmap -> fileName = (char*)fname;
    
    /* Get file format version and check for compatibility */
    if (strcmp(getstr(format, file), PMAP_FILEVER))
       error(USER, "incompatible photon map file format");
       
    /* Get number of photons as fixed size, which possibly results in
     * padding of MSB with 0 on some platforms.  Unlike sizeof() however,
     * this ensures portability since this value may span 32 or 64 bits
     * depending on platform.  */
    pmap -> numPhotons = getint(PMAP_LONGSIZE, file);
    
    /* Get average photon flux */
    for (j = 0; j < 3; j++) 
       pmap -> photonFlux [j] = getflt(file);
    
    /* Get max and min photon positions */
    for (j = 0; j < 3; j++) {
       pmap -> minPos [j] = getflt(file);
       pmap -> maxPos [j] = getflt(file);
    }
    
    /* Get centre of gravity */
    for (j = 0; j < 3; j++)
       pmap -> CoG [j] = getflt(file);
       
    /* Get avg distance to centre of gravity */
    pmap -> CoGdist = getflt(file);
 
    /* Get photon's velocity (speed of light in units of scene geometry),
       min/max/avg photon path length (= timespan) and temporal CoG */
    pmap -> velocity = getflt(file);
    pmap -> minPathLen = getflt(file);
    pmap -> maxPathLen = getflt(file);
    pmap -> avgPathLen = getflt(file);
 
 #ifdef PMAP_CONTRIB
    if (isContribChildPmap(pmap)) {
       /* Per-modifier precomputed contribution (child) photon map;
          allocate precomputed contributions, get mode, number of bins,
          and number of wavelet coefficients */
       PreComputedContrib   *preCompContrib = (PreComputedContrib*)(
          pmap -> preCompContrib = malloc(sizeof(PreComputedContrib))
       );
 
       if (!preCompContrib)
          error(SYSTEM, "out of memory allocating precomputed contributions"
             " in loadPhotonMap()"
          );
 
       initPreComputedContrib(preCompContrib);
       pmap -> contribMode = getint(sizeof(pmap -> contribMode), file);
       preCompContrib -> scDim = getint(
          sizeof(preCompContrib -> scDim), file
       );
       preCompContrib -> coeffDim = getint(
          sizeof(preCompContrib -> coeffDim), file
       );
       preCompContrib -> nNonZeroCoeffs = getint(
          sizeof(preCompContrib -> nNonZeroCoeffs), file
       );
       preCompContrib -> nCompressedCoeffs = getint(
          sizeof(preCompContrib -> nCompressedCoeffs), file
       );
    }
    else if (isContribPmap(pmap)) 
       /* Parent contrib photon map; load child photon maps containing
          per-modifier precomputed contributions */
       loadContribPhotonMap(pmap, fname);
 #endif
       
    /* Load photon storage (except if parent contribution photon map, which
     * just acts as a container for its children) */
 #ifdef PMAP_OOC
    if (
    #ifdef PMAP_CONTRIB
       !isContribPmap(pmap) && 
    #endif
       OOC_LoadPhotons(pmap, file)
    ) {
 #else
    if (kdT_LoadPhotons(pmap, file)) {
 #endif
       sprintf(errmsg, "error reading photon map file %s", fname);
       error(SYSTEM, errmsg);
    }
 
    fclose(file);
    return ptype;
 }
 
 
 
 void savePmaps (const PhotonMap **pmaps, int argc, char **argv)
 /* Wrapper to save all defined photon maps with specified command line */
 {
    unsigned t;
    
    for (t = 0; t < NUM_PMAP_TYPES; t++) {
       if (pmaps [t])
          savePhotonMap(pmaps [t], pmaps [t] -> fileName, argc, argv);
    }
 }
 
 
 
 void loadPmaps (PhotonMap **pmaps, const PhotonMapParams *parm)
 /* Wrapper to load all photon maps and set their parameters */
 {
    unsigned t;
    struct stat octstat, pmstat;
    PhotonMap *pm;
    PhotonMapType type;
    
    for (t = 0; t < NUM_PMAP_TYPES; t++)
       if (setPmapParam(&pm, parm + t)) {         
          /* Check if photon map newer than octree */
          if (pm -> fileName && octname &&
             !stat(pm -> fileName, &pmstat) && !stat(octname, &octstat) && 
             octstat.st_mtime > pmstat.st_mtime
          ) {
             sprintf(errmsg, "photon map in file %s may be stale", 
                pm -> fileName
             );
             error(USER, errmsg);
          }
          
          /* Load photon map from file and get its type */
          if ((type = loadPhotonMap(pm, pm -> fileName)) == PMAP_TYPE_NONE)
             error(USER, "failed loading photon map");
             
          /* Assign to appropriate photon map type (deleting previously
           * loaded photon map of same type if necessary) */
          if (pmaps [type]) {
             sprintf(errmsg, "multiple %s photon maps, dropping previous",
                pmapName [type]
             );
             error(WARNING, errmsg);
             deletePhotons(pmaps [type]);
             free(pmaps [type]);
          }
          pmaps [type] = pm;
          
          /* Check for valid density estimate bandwidths */
          if ((pm -> minGather > 1 || pm -> maxGather > 1) &&
             (type == PMAP_TYPE_PRECOMP)
          ) {
             /* Force bwidth to 1 for precomputed pmap */
             error(WARNING, "ignoring bandwidth for precomp photon map");
             pm -> minGather = pm -> maxGather = 1;
          }
                      
          if ((pm -> maxGather > pm -> minGather) && 
             (type == PMAP_TYPE_VOLUME)
          ) {
             /* Biascomp for volume pmaps (see volumePhotonDensity() below) 
                is considered redundant, and there's probably no point in 
                recovering by using the lower bandwidth, since it's probably
                not what the user wants, so bail out. */
             sprintf(errmsg,
                "bias compensation is not available with %s photon maps",
                pmapName [type]
             );
             error(USER, errmsg);
          }
          
          if (pm -> maxGather > pm -> numPhotons) {
             /* Clamp lookup bandwidth to total number of photons (minus one,
                since density estimate gets extra photon to obtain averaged 
                radius) */
             sprintf(errmsg, "clamping density estimate bandwidth to %ld",
                pm -> numPhotons
             );
             error(WARNING, errmsg);
             pm -> minGather = pm -> maxGather = pm -> numPhotons - 1;
          }
       }
 }
diff --git a/pmcontrib4.c b/pmcontrib4.c
index dae44b3..3f7fa6d 100644
--- a/pmcontrib4.c
+++ b/pmcontrib4.c
@@ -1,611 +1,628 @@
 #ifndef lint
 static const char RCSid[] = "$Id: pmcontrib2.c,v 2.5 2018/11/08 00:54:07 greg Exp $";
 #endif
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions. 
    These routines interface to rcontrib.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #include "pmcontribcache.h"
 #ifdef PMAP_CONTRIB
 #include "pmapmat.h"
 #include "pmapsrc.h"
 #include "pmaprand.h"
 #include "pmapio.h"
 #include "pmapdiag.h"
 #include "otypes.h"
 #include "otspecial.h"
 #include "oocnn.h"
 #include "random.h"
 
 
 
 void initPmapContribTab (LUTAB *contribTab, int *contribMode)
 /* Set contribution table (interface to rcmain.c) */
 {
    pmapContribTab = contribTab;
    pmapContribMode = contribMode;
 }
 
 
 
 /* ------------------ CONTRIB PMAP LOADING STUFF --------------------- */
 
 
 
 static int checkContribModifier (const LUENT *modContNode, void *p)
 /* Check for valid modifier in LUT entry */
 {
    MODCONT              *modCont = (MODCONT*)modContNode -> data;
    char                 *modName = (char*)modCont -> modname;
    OBJECT               modObj = modifier(modName);
    const PhotonMap      *pmap = (PhotonMap*)p;
    
    if (modObj == OVOID) {
       sprintf(errmsg, "invalid modifier %s", modName);
       error(USER, errmsg);
    }
    if (!islight(objptr(modObj) -> otype)) {
       sprintf(errmsg, "%s is not a light source modifier", modName);
       error(USER, errmsg);
    }
    
    /* Warn if number of bins exceeds lookup bandwidth, as this is
     * guaranteed to result in empty bins! */
    if (pmap -> maxGather < modCont -> nbins) {
       sprintf(errmsg, "contribution photon lookup bandwidth too low "
          "for modifier %s, some bins will be zero", modName
       );
       error(WARNING, errmsg);
    }
    
    return 0;
 }
 
 
 
 void initPmapContrib (PhotonMap *pmap)
 /* Set up photon map contributions and get binning parameters */ 
 {
    unsigned t;
 
    /* Avoid incompatible photon maps */
    for (t = 0; t < NUM_PMAP_TYPES; t++)
       if (photonMaps [t] && t != PMAP_TYPE_CONTRIB) {
          sprintf(errmsg, "%s photon map does not support contributions",
             pmapName [t]
          );
          error(USER, errmsg);
       }
 
    if (!pmapContribTab)
       error(INTERNAL, 
          "contribution LUT not initialised in initPmapContrib()"
       );
       
    /* Check for valid contribution modifiers */
    lu_doall(pmapContribTab, checkContribModifier, pmap);
 
    /* Set callback to collect contributions */
    if (pmap) {
       pmap -> lookup = getPreCompPhotonContrib;
       /* XXX: Need tighter tolerance for contribution photon lookups? */
       pmap -> gatherTolerance = 1.0;
    }
 }
 
 
 
 static int loadPreCompContrib (const LUENT *modContNode, void *p)
 /* Load child contribution photon map for current contrib modifier entry */
 {
    MODCONT              *modCont = (MODCONT*)modContNode -> data;
    char                 *modName = (char*)modCont -> modname;
    PhotonMap            *preCompContribPmap;
    const PhotonMap      *parentPmap = (const PhotonMap*)p;
    LUENT                *preCompContribNode;
    PreComputedContrib   *preCompContrib;
    char                 dirName [1024];
    
    /* Allocate new LUT entry for child precomputed contribution pmap */
    preCompContribNode = lu_find(parentPmap -> preCompContribTab, modName);
    if (!preCompContribNode)
       error(SYSTEM, 
          "out of memory allocating LUT entry in loadPreCompContrib()"
       );
    preCompContribNode -> key = modName;
 
    /* Allocate child precomputed contribution photon map */
    preCompContribNode -> data = (char*)(
       preCompContribPmap = malloc(sizeof(PhotonMap))
    );
    if (!preCompContribPmap)
       error(SYSTEM, "out of memory allocating precomputed contribution "
          "photon map in loadPreCompContrib()"
       );
    
    /* Set subdirectory from parent photon map's filename */
    snprintf(dirName, sizeof(dirName), PMAP_CONTRIB_DIR, 
       parentPmap -> fileName
    );
 
    /* Allocate & set child pmap's filename from subdir and modifier */
    preCompContribPmap -> fileName = malloc(strlen(parentPmap -> fileName) +
       strlen(modName) + strlen(PMAP_CONTRIB_FILE)
    );
    if (!preCompContribPmap -> fileName)
       error(SYSTEM, "out of memory allocating filename in "
          "loadPreCompContrib()"
       );   
    snprintf(preCompContribPmap -> fileName, sizeof(dirName) + 4, 
       PMAP_CONTRIB_FILE, dirName, modName
    );      
    
    /* Load child precomp contrib photon map for current modifier; 
       loadPhotonMap() will not recurse and call loadContribPhotonMap() 
       again because preCompContribPmap -> preCompContribTab == NULL */
    loadPhotonMap(preCompContribPmap, preCompContribPmap -> fileName);   
    
    /* Pass parent pmap's lookup parameters on to child */
    preCompContribPmap -> minGather        = parentPmap -> minGather;
    preCompContribPmap -> maxGather        = parentPmap -> maxGather;
    preCompContribPmap -> gatherTolerance  = parentPmap -> gatherTolerance;   
    
    /* Override contrib/coefficient mode if it doesn't match precomputed */
    if (*pmapContribMode != preCompContribPmap -> contribMode) {
       sprintf(errmsg, 
          "photon map contains precomputed %s, overriding rcontrib setting",
          preCompContribPmap -> contribMode ? "contributions" : "coefficients"
       );
       error(WARNING, errmsg);
       *pmapContribMode = preCompContribPmap -> contribMode;
    }
    
    /* NOTE: preCompContribPmap -> preCompContrib is initialised by
     * loadPhotonMap() */
    preCompContrib = (PreComputedContrib*)(
       preCompContribPmap -> preCompContrib
    );
    
    /* Set number of bins and wavelet coefficients */
    preCompContrib -> nBins = preCompContrib -> scDim * 
       preCompContrib -> scDim;
    preCompContrib -> nCoeffs = preCompContrib -> coeffDim *
       preCompContrib -> coeffDim;
    
    /* Set wavelet coefficient filename */
    preCompContrib -> waveletFname = malloc(strlen(dirName) + 
       strlen(modName) + strlen(PMAP_CONTRIB_WAVELETFILE)
    );
    if (!preCompContribPmap -> fileName)
       error(SYSTEM, "out of memory allocating filename in "
          "loadPreCompContrib()"
       );
    snprintf(preCompContrib -> waveletFname, sizeof(dirName) + 5, 
       PMAP_CONTRIB_WAVELETFILE, dirName, modName
    );
    
    return 0;
 }
 
 
 
 void loadContribPhotonMap (PhotonMap *pmap, const char *fname)
 /* Load contribution pmap and its per-modifier precomputed children */
 {
    LUTAB lutInit = LU_SINIT(NULL, freePreCompContribNode);
    
    /* Allocate & init LUT containing per-modifier child contrib pmaps */
    if (!(pmap -> preCompContribTab = malloc(sizeof(lutInit))))
       error(SYSTEM, "out of memory allocating precomputed contribution "
          "LUT in loadContribPhotonMap()"
       );
    memcpy(pmap -> preCompContribTab, &lutInit, sizeof(lutInit));
 
    /* NOTE: filename already set in loadPhotonMap()
    pmap -> fileName = (char*)fname; */
 
    /* Init contrib photon map for light source contributions */
    initPmapContrib(pmap);
      
    /* Load child contribution photon maps for each modifier in contribTab */
    lu_doall(pmapContribTab, loadPreCompContrib, pmap);
 }
 
 
 
 static int decodeContribs (PreComputedContrib *preCompContrib)
 /* Decode and decompress mRGBE-encoded wavelet coefficients in
    preCompContrib -> mrgbeCoeffs, apply inverse wavelet transform and 
    return decoded contributions in the wavelet coefficient matrix 
    preCompContrib -> waveletMatrix.
    NOTE: THE WAVELET COEFFICIENT MATRIX IS ASSUMED TO BE ZEROED, with 
-   the average wavelet coefficient in the upper left of the matrix 
-   (i.e. preCompContrib -> waveletMatrix [0][0]).
-   Returns 0 on success, else -1.  */
+   the wavelet approximation coefficients in the upper left of the matrix 
+   (i.e. preCompContrib -> waveletMatrix [0..approxDim-1][0..approxDim-1],
+   where approxDim = WAVELET_PADD4_APPROXDIM).
+   Returns 0 on success, else -1. */
 {
    unsigned          c, i, j, coeffIdx, scDim, 
                      coeffDim, nCoeffs, nCompressedCoeffs;
    WaveletMatrix2    waveletMatrix;
    mRGBERange        *mrgbeRange;
    mRGBE             *mrgbeCoeffs;
    DCOLOR            fCoeff;
    
    if (!preCompContrib || !preCompContrib -> mrgbeCoeffs ||
       !(waveletMatrix = preCompContrib -> waveletMatrix) || 
       !preCompContrib -> tWaveletMatrix
    )
       /* Should be initialised by caller! */
       return -1;
    
    scDim             = preCompContrib -> scDim;
    coeffDim          = preCompContrib -> coeffDim;
    nCoeffs           = preCompContrib -> nCoeffs;
    nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
    mrgbeCoeffs       = preCompContrib -> mrgbeCoeffs;
    mrgbeRange        = &preCompContrib -> mrgbeRange;
    
    /* Init mRGBE coefficient normalisation from range */
    if (!mRGBEinit(mrgbeRange, mrgbeRange -> min, mrgbeRange -> max))
       return -1;
 
-   /* Decode mRGBE coefficients and populate wavelet coefficient matrix, 
-      omitting average at waveletMatrix [0][0].
-      NOTE: The matrix should be initialised to zero so */
+   /* Decode mRGBE detail coeffs and populate wavelet coefficient matrix,
+      omitting approximation coeffs in the upper left (these should have
+      been set by the caller).
+      NOTE: The matrix should be initialised to zero to account for
+      coefficients that were omitted by thresholding. */
    for (c = 0; c < nCompressedCoeffs; c++) {
       coeffIdx = mRGBEdecode(mrgbeCoeffs [c], mrgbeRange, fCoeff);
+      i = PMAP_CONTRIB_LIN2X(coeffDim, coeffIdx);
+      j = PMAP_CONTRIB_LIN2Y(coeffDim, coeffIdx);
 #ifdef PMAP_CONTRIB_DBG
       /* Check for invalid decoded coefficients */
-      if (coeffIdx < 1 || coeffIdx >= nCoeffs)
+      if (i < WAVELET_PADD4_APPROXDIM && j < WAVELET_PADD4_APPROXDIM ||
+         coeffIdx >= nCoeffs
+      )
          error(CONSISTENCY, 
             "wavelet coefficient index out of range in decodeContribs()"
          );
 #endif
-      i = PMAP_CONTRIB_LIN2X(coeffDim, coeffIdx);
-      j = PMAP_CONTRIB_LIN2Y(coeffDim, coeffIdx);
       copycolor(waveletMatrix [i] [j], fCoeff);
    }
 
-   /* Do inverse 2D wavelet transform on preCompContrib -> waveletMatrix
-      (which actually maps to binnedContribs) */
-   if (!padWaveletInvXform2(waveletMatrix, 
-         preCompContrib -> tWaveletMatrix, coeffDim, scDim
+   /* Do inverse 2D wavelet transform on preCompContrib -> waveletMatrix */
+   if (!padWaveletInvXform2(waveletMatrix, preCompContrib -> tWaveletMatrix,
+         coeffDim, scDim
       )
    )
       error(INTERNAL, 
          "failed inverse wavelet transform in decodeContribs()"
       );
 
 #ifdef PMAP_CONTRIB_DBG
    for (i = 0; i < scDim; i++) {
       for (j = 0; j < scDim; j++) {
          /* Dump decoded contribs for debugging */
          printf("% 7.3g\t", colorAvg(waveletMatrix [i][j]));
          
          /* Warn if coefficient is negative; this _can_ happen due to loss
             of precision by the mRGBE encoding */
          if (min(min(waveletMatrix [i][j][0], waveletMatrix [i][j][1]),
                waveletMatrix [i][j][2]
             ) < 0
          )
             error(WARNING, "negative contributions in getPreCompContrib()");
       }
       putchar('\n');
    }
    putchar('\n');
 #endif
 
    return 0;
 }
 
 
 
 static void getPreCompContribByPhoton (const Photon *photon, 
    OOC_DataIdx photonIdx, PreComputedContrib *preCompContrib, 
    DCOLOR *binnedContribs 
 )
 /* Fetch and decode mRGBE-encoded wavelet coefficients for given photon 
    from wavelet file at offset photonIdx, perform inverse wavelet xform,
    and return the reconstructed binned contributions in binnedContribs
    (which is assumed to be variable). Returns 0 on success, else -1. */
-/* XXX: HERE binnedContribs POINTS TO CACHE PAGE DATA! */
+/* NOTE: binnedContribs IS ASSUMED TO POINT TO CACHE PAGE DATA! */
 {
    unsigned    i, j, scDim, coeffDim, nCompressedCoeffs;
-   COLR        mrgbeRange32 [2];
+   COLR        rgbe32 [WAVELET_PADD4_NUMAPPROX + 2];
    COLOR       fCoeff;
    
    if (!binnedContribs || !preCompContrib)
       /* Shouldn't happen */
       error(INTERNAL, 
          "contributions not initialised in getPreCompContrib()"
       );
 
    if (preCompContrib -> nBins <= 1)
       /* No binning, so nothing to decode */
       return;
 
    scDim             = preCompContrib -> scDim;
    coeffDim          = preCompContrib -> coeffDim;
    nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
 
    /* Lazily initialise preCompContrib */
    if (!preCompContrib -> waveletFile) {
       /* Lazily open wavelet coefficient file */
       preCompContrib -> waveletFile = fopen(
          preCompContrib -> waveletFname, "rb"
       );
       if (!preCompContrib -> waveletFile) {
-         sprintf(errmsg, "failed to open wavelet coefficient file %s "
+         sprintf(errmsg, "can't open wavelet coefficient file %s "
             "in getPreCompContrib()", preCompContrib -> waveletFname
          );
          error(SYSTEM, errmsg);
       }
 
       /* Set record size of encoded coeffs in wavelet file */
       preCompContrib -> contribSize = PMAP_CONTRIB_ENCSIZE(
          nCompressedCoeffs
       );
 
       /* Lazily allocate buffer for mRGBE wavelet coeffs */
-      preCompContrib -> mrgbeCoeffs = calloc(
-         nCompressedCoeffs, sizeof(mRGBE)
+      preCompContrib -> mrgbeCoeffs = calloc(nCompressedCoeffs, 
+         sizeof(mRGBE)
       );
       if (!preCompContrib -> mrgbeCoeffs)
          error(SYSTEM, "out of memory allocating decoded wavelet "
             "coefficients in getPreCompContrib()"
          );
          
       /* Lazily allocate primary and transposed wavelet matrices */
       preCompContrib -> waveletMatrix = allocWaveletMatrix2(coeffDim);
       preCompContrib -> tWaveletMatrix = allocWaveletMatrix2(coeffDim);
       if (!preCompContrib -> waveletMatrix || 
          !preCompContrib -> tWaveletMatrix
       )
          error(SYSTEM, "out of memory allocating wavelet coefficient "
             "matrix in getPreCompContrib()"
          );
    }
 
-   /* Seek to photon index in wavelet file and read mRGBE normalisation 
-      and encoded bins */
-   if (fseek(preCompContrib -> waveletFile, 
+   /* Seek to photon index in wavelet file and read its associated
+      coefficients */ 
+   if (fseek(preCompContrib -> waveletFile,
          photonIdx * preCompContrib -> contribSize, SEEK_SET
       ) < 0
    )
-      error(SYSTEM, "failed seek in wavelet coefficient file "
+      error(SYSTEM, "can't seek in wavelet coefficient file "
          "in getPreCompContrib()"
       );
 
-   /* Read only mRGBE maximum if 1 compressed bin (maximum compression),
-    * since minimum is implicitly zero and therefore omitted */
-   getbinary(mrgbeRange32, sizeof(COLR), 1 + (nCompressedCoeffs > 1),
-      preCompContrib -> waveletFile
-   );
+   /* Read 32-bit encoded wavelet approximation coefficients and mRGBE
+    * range; omit mRGBE minimum if only 1 compressed bin (=maximum
+    * compression), since it's implicitly zero in this case. */
+   if (getbinary(rgbe32, sizeof(COLR), 
+         WAVELET_PADD4_NUMAPPROX + 1 + (nCompressedCoeffs > 1),
+         preCompContrib -> waveletFile
+      ) != WAVELET_PADD4_NUMAPPROX + 1 + (nCompressedCoeffs > 1)
+   )
+      error(SYSTEM, "can't read approximation coefficients from "
+         "wavelet coefficient file in getPreCompContrib()"
+      );
 
    if (getbinary(preCompContrib -> mrgbeCoeffs, sizeof(mRGBE),
          nCompressedCoeffs, preCompContrib -> waveletFile
       ) != nCompressedCoeffs
    )
-      error(SYSTEM, "failed reading from wavelet coefficient file "
-         "in getPreCompContrib()"
+      error(SYSTEM, "can't read detail coefficients from "
+         "wavelet coefficient file in getPreCompContrib()"
       );
 
-   /* Get mRGBE range (NOTE min/max are reversed in the coeff file) */
-   colr_color(fCoeff, mrgbeRange32 [0]);
+   /* Get mRGBE range (NOTE min/max are reversed in the coeff file)
+      Note that direct assignment isn't possible as colr_color() returns
+      float, not double. */
+   colr_color(fCoeff, rgbe32 [WAVELET_PADD4_NUMAPPROX]);
    copycolor(preCompContrib -> mrgbeRange.max, fCoeff);
    
    if (nCompressedCoeffs > 1) {
-      colr_color(fCoeff, mrgbeRange32 [1]);
+      colr_color(fCoeff, rgbe32 [WAVELET_PADD4_NUMAPPROX + 1]);
       copycolor(preCompContrib -> mrgbeRange.min, fCoeff);
    }
    else {
       /* Single compressed bin; mRGBE minimum is implicitly zero */
       setcolor(preCompContrib -> mrgbeRange.min, 0, 0, 0);
    }
 
-   /* Zero wavelet coefficient matrix and set average coefficient from
-      photon flux; this coefficient is expected by the inverse transform
-      routine in the upper left of the matrix (i.e. preCompContrib ->
-      waveletMatrix [0] [0]) */
+   /* Zero wavelet coefficient matrix and set approximation coefficients in
+    * the upper left of the matrix, as expected by the decodeContribs(). */
    zeroCoeffs2(preCompContrib -> waveletMatrix, coeffDim);
-   getPhotonFlux((Photon*)photon, fCoeff);
-   copycolor(preCompContrib -> waveletMatrix [0] [0], fCoeff);
-   
+   for (i = 0; i < WAVELET_PADD4_APPROXDIM; i++)
+      for (j = 0; j < WAVELET_PADD4_APPROXDIM; j++) {
+         colr_color(fCoeff, 
+            rgbe32 [PMAP_CONTRIB_XY2LIN(WAVELET_PADD4_APPROXDIM, i, j)]
+         );
+         /* Direct assignment via colr_color() isn't possible as it returns
+          * float, not double */
+         copycolor(preCompContrib -> waveletMatrix [i] [j], fCoeff);
+      }
+
    /* All set, now decode mRGBE coeffs and invert wavelet transform */
    if (decodeContribs(preCompContrib))
       error(INTERNAL, "failed contribution decoding/decompression "
          "in getPreCompContrib()"
       );
       
    /* Copy decoded contributions from wavelet coefficient matrix rows */
    for (i = 0; i < scDim; i++)
       memcpy(binnedContribs + PMAP_CONTRIB_XY2LIN(scDim, i, 0), 
          preCompContrib -> waveletMatrix [i], scDim * WAVELET_COEFFSIZE
       );
 }
 
 
 
 typedef struct {
    const RAY   *ray;
    RREAL       rayCoeff [3];
    COLORV      *totalContrib;
 } PreCompContribRCData;
 
 
 
 static int getPreCompContribByMod (const LUENT *preCompContribTabNode, 
    void *p
 )
 /* Fetch and decode precomputed contributions from closest photon in pmap
  * referenced by preCompContribTabNode, and accumulate in pmapContribTab
  * for the corresponding modifier */
 {   
    PhotonMap                  *preCompContribPmap = (PhotonMap*)(
                                  preCompContribTabNode -> data
                               );
    PreComputedContrib         *preCompContrib;
    const char                 *modName = preCompContribTabNode -> key;
    MODCONT                    *modCont;
    PreCompContribRCData       *rcData = (PreCompContribRCData*)p;
    LUENT                      *contribTabNode;
    Photon                     photon;
    OOC_DataIdx                photonIdx;
    COLOR                      photonContrib;
    unsigned                   b, k;
 
    if (!preCompContribPmap || !rcData || 
       !(preCompContrib = (PreComputedContrib*)(
          preCompContribPmap -> preCompContrib
       ))
    ) {
       sprintf(errmsg, "missing precomputed contributions for modifier %s",
          modName
       );
       error(INTERNAL, errmsg);
    }
    
    /* Find rcontrib's LUT entry for this modifier */
    contribTabNode = lu_find(pmapContribTab, modName);
    if (!contribTabNode || !contribTabNode -> key || 
       !(modCont = (MODCONT*)contribTabNode -> data)
    ) {
       sprintf(errmsg, "missing contribution LUT entry for modifier %s",
          modName
       );
       error(INTERNAL, errmsg);
    }
    
    /* Reallocate rcontrib bins if they don't match precomputed */
    if (modCont -> nbins != preCompContrib -> nBins) {
       contribTabNode -> data = realloc(modCont,
          sizeof(MODCONT) + sizeof(DCOLOR) * (preCompContrib -> nBins - 1)
       );
 
       if (!contribTabNode -> data) {
          sprintf(errmsg, "out of memory reallocating contribution bins "
             "for modifier %s", modCont -> modname
          );
          error(SYSTEM, errmsg);
       }
       else {
          sprintf(errmsg, "bin count mismatch for modifier %s, resizing",
             modCont -> modname
          );
          error(WARNING, errmsg);
       }
       
       modCont = (MODCONT*)contribTabNode -> data;
       modCont -> nbins = preCompContrib -> nBins;
       memset(modCont -> cbin, 0, sizeof(DCOLOR) * modCont -> nbins);
    }
  
    /* Fetch closest photon and its contributions */
    if (find1Photon(preCompContribPmap, rcData -> ray, &photon, 
          &photonIdx
       )
    ) {
       if (preCompContrib -> nBins > 1) {
          /* BINNING ENABLED; lazily init contrib cache if necessary */
          if (initContribCache(preCompContrib)) {
             /* CACHE ENABLED; fetch cache page for found photon */
             DCOLOR *cacheBins;
             
             if (!getContribCache(preCompContrib, photonIdx, &cacheBins)) {
                /* PAGE NOT CACHED; decode contribs into new cache page */
                getPreCompContribByPhoton(&photon, photonIdx, preCompContrib, 
                   cacheBins
                );
             } 
             else; /* PAGE CACHED; (re)use decoded contribs from cache! */
             
             for (b = 0; b < preCompContrib -> nBins; b++) {
                /* Accumulate binned contributions into rcontrib's LUT entry
                   for this modifier, weighted by ray coefficient.  Also sum
                   total contributions.  */
                /* NOTE: Using multcolor() on the decoded contribs would
                   modify them in the cache, so use a temp variable here. */
                for (k = 0; k < 3; k++)
                   photonContrib [k] = cacheBins [b] [k] * 
                      rcData -> rayCoeff [k];
 
                addcolor(modCont -> cbin [b], photonContrib);
                addcolor(rcData -> totalContrib, photonContrib);
             }
          }
          else {   
             /* CACHE DISABLED; decode contribs into temp array on stack */
             DCOLOR tempBins [preCompContrib -> nBins];
             
             getPreCompContribByPhoton(&photon, photonIdx, preCompContrib,
                tempBins
             );
          
             for (b = 0; b < preCompContrib -> nBins; b++) {
                /* Accumulate binned contributions into rcontrib's LUT entry
                   for this modifier, weighted by ray coefficient.  Also sum
                   total contributions.  */
                for (k = 0; k < 3; k++)
                   photonContrib [k] = tempBins [b] [k] * 
                      rcData -> rayCoeff [k];
 
                addcolor(modCont -> cbin [b], photonContrib);
                addcolor(rcData -> totalContrib, photonContrib);
             }
          }
       }
       else {
          /* NO BINNING; get single contribution directly from photon,
             scale by ray coefficient and sum to total contribs */
          getPhotonFlux(&photon, photonContrib);
          multcolor(photonContrib, rcData -> rayCoeff);
          addcolor(modCont -> cbin [0], photonContrib);
          addcolor(rcData -> totalContrib, photonContrib);
       }
    }
    
    return 0;
 }
 
 
 
 void getPreCompPhotonContrib (PhotonMap *pmap, RAY *ray, 
    COLOR totalContrib
 )
 /* Get precomputed light source contributions at ray -> rop for all
    specified modifiers and accumulate them in pmapContribTab (which maps
    to rcontrib's binning LUT). Also return total precomputed contribs. */
 {
    PreCompContribRCData rcData;
 
    /* Ignore sources */
    if (ray -> ro && islight(objptr(ray -> ro -> omod) -> otype)) 
       return;
  
    /* Get cumulative path coefficient up to photon lookup point */
    raycontrib(rcData.rayCoeff, ray, PRIMARY);
    setcolor(totalContrib, 0, 0, 0);
    
    /* Rcontrib results passed to per-modifier child contrib pmaps */
    rcData.ray           = ray;
    rcData.totalContrib  = totalContrib;
 
    /* Iterate over child contribution photon maps for each modifier and
     * collect their contributions */
    lu_doall(pmap -> preCompContribTab, getPreCompContribByMod, &rcData);
 }
 
 #endif /* PMAP_CONTRIB */
diff --git a/pmcontribsort.c b/pmcontribsort.c
index 069ddb9..05d3bef 100644
--- a/pmcontribsort.c
+++ b/pmcontribsort.c
@@ -1,606 +1,610 @@
 #ifndef lint
 static const char RCSid[] = "$Id$";
 #endif
 
 
 /* 
    =========================================================================
    Support routines for N-way out-of-core merge sort of precomputed photon
    contributions. These routines are used adapted from OOC_Sort() to sort 
    binned contributions in the same order as their associated precomputed 
    photons.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    ==========================================================================
    
    $Id$
 */
 
 
 #if !defined(_WIN32) && !defined(_WIN64)
    /* No Windoze 'cos no fork() */
    #include "pmapcontrib.h"
    
    #ifdef PMAP_CONTRIB
       #include "oocsort.h"
       #include "pmutil.h"
       #include <stdio.h>
       #include <stdlib.h>
       #include <unistd.h>
       #include <fcntl.h>
       #include <sys/wait.h>
 
       #define PHOTONSIZE sizeof(Photon)
       #define PMAPCONTRIB_KEY(p, kdat) Key2Morton3D( \
          kdat -> key(p), kdat -> bbOrg, kdat -> mortonScale \
       )
       #define PMAPCONTRIB_KEYSWAP(i,j) (tKey = (i), (i) = (j), (j) = tKey)
 
       extern RREAL *OOC_PhotonKey (const void *p);
 
       /* Active subprocess counter updated by parent process in
        * contribPhotonSort(); must persist when recursing into
        * contribPhotonSortRecurse(), hence global */
       static unsigned procCnt = 0;
 
 
       
       static void contribPhotonSwap (Photon *photons, char *contribs,
          unsigned long i, unsigned long j, unsigned contribSize
       )
       {
          /* TODO: swap photon indices instead of photons themselves? */
          char  tmp [max(PHOTONSIZE, contribSize)],
                *ci = contribs + i * contribSize,
                *cj = contribs + j * contribSize;
 
          memcpy(tmp, photons + i, PHOTONSIZE);
          memcpy(photons + i, photons + j, PHOTONSIZE);
          memcpy(photons + j, tmp, PHOTONSIZE);
          memcpy(tmp, ci, contribSize);
          memcpy(ci, cj, contribSize);
          memcpy(cj, tmp, contribSize);
       }
 
 
 
       static void contribPhotonQuickSort (Photon *photons, char *contribs, 
          unsigned contribSize, const OOC_KeyData *keyData, 
          unsigned long left, unsigned long right
       )
       /* Partition photons and their binned contributions using a quicksort
        * algorithm. Returns index to pivot's position after partitioning, */
       {
          unsigned long        l, r, m;
          MortonIdx            lKey, rKey, mKey, tKey;
 
          if (left < right) {
             /* Select pivot from median-of-three and move to photons [right] 
                (a.k.a. Lomuto partitioning) */
             l = left;
             r = right;
             m = l + ((r - l) >> 1);   /* Avoids overflow vs. (l+r) >> 1 */
             lKey = PMAPCONTRIB_KEY(photons + l, keyData);
             rKey = PMAPCONTRIB_KEY(photons + r, keyData);
             mKey = PMAPCONTRIB_KEY(photons + m, keyData);
             if (mKey < lKey) {
                contribPhotonSwap(photons, contribs, l, m, contribSize);
                PMAPCONTRIB_KEYSWAP(lKey, mKey);
             }
             if (rKey < lKey) {
                contribPhotonSwap(photons, contribs, l, r, contribSize);
                PMAPCONTRIB_KEYSWAP(lKey, rKey);
             }
             if (mKey < rKey) {
                contribPhotonSwap(photons, contribs, m, r, contribSize);
                PMAPCONTRIB_KEYSWAP(mKey, rKey);
             }
             /* Pivot with key rKey is now in photons [right] */
 
             /* l & r converge, swapping photons (and their corresponding
                binned contributions) out of order with respect to pivot
                (a.k.a. Hoare partitioning). The convergence point l = r 
                is the pivot's final position */
             while (l < r) {
                while (l < r && PMAPCONTRIB_KEY(photons + l, keyData) < rKey)
                   l++;
                while (r > l && PMAPCONTRIB_KEY(photons + r, keyData) >= rKey)
                   r--;
                   
                /* Photons out of order; swap them and their contributions */
                if (l < r)
                   contribPhotonSwap(photons, contribs, l, r, contribSize);
             };
             
             /* Now l = r = pivot's final position; swap these photons */
             lKey = PMAPCONTRIB_KEY(photons + l, keyData);
             contribPhotonSwap(photons, contribs, l, right, contribSize);
             
             /* Recurse in partitions flanking pivot */
             if (l > left)
                contribPhotonQuickSort(photons, contribs, contribSize, 
                   keyData, left, l - 1
                );
             if (l < right)
                contribPhotonQuickSort(photons, contribs, contribSize, 
                   keyData, l + 1, right
                );
          }
       }
 
 
 
    #ifdef PMAP_CONTRIB_DBG
       static unsigned nCoeffs = 0, nCompressedCoeffs = 0;
    #endif
    
       static int contribPhotonSortRecurse (
          FILE *photonIn, FILE *contribIn, FILE* photonOut, FILE* contribOut,
          unsigned contribSize, unsigned numBlk, unsigned numProc, 
          Photon *photonBuf, char *contribBuf, unsigned long bufLen,
          const OOC_KeyData *keyData, unsigned long blkLo, unsigned long blkHi
       )
       /* Recursive part of contribPhotonSort().  Reads block of photons and
        * their binned contributions from input files photonIn resp. 
        * contribOut, within the interval [blkLo, blkHi], and divides them
        * into numBlk blocks until the number of photons/contributions in a
        * block does not exceed bufLen, then quicksorts each block in-core
        * into temporary files.  These files are then mergesorted via a
        * priority queue to the specified output files as the stack unwinds. 
        * NOTE: Critical sections are prepended with '!!!'
        *
        * Parameters are as follows:
        * photonIn    Opened primary input file containing unsorted photons;
        *             their Morton codes determine the ordering
        * contribIn   Opened secondary input file containing unsorted binned
        *             contributions size contribSize
        * photonOut   Opened primary output file for sorted photons
        * contribOut  Opened secondary output file for sorted binned
        *             contributions ordered by corresponding photons
        * contribSize Size of binned contributions per photon, in bytes
        * numBlk      Number of blocks to divide into / merge from 
        * numProc     Number of parallel processes for in-core sort 
        * photonBuf   Preallocated in-core sort buffer for maxRec photons
        * contribBuf  Preallocated in-core sort buffer for maxRec contribs
        * bufLen      In-core sort buffer length, in num photons/contribs
        * keyData     Aggregate data for Morton code generation as sort keys
        * blkLo       Start of current block in input files, in num photons
        * blkHi       End of current block in input files, in num photons
        */
       {
          const unsigned long  blkLen = blkHi - blkLo + 1;
 
          if (!blkLen || blkHi < blkLo)
             return 0;
 
          if (blkLen > bufLen) {
             unsigned long     blkLo2 = blkLo, blkHi2 = blkLo;
             const double      blkLen2 = (double)blkLen / numBlk;
             FILE              *pBlkFile [numBlk], *cBlkFile [numBlk];
             Photon            photon;
             char              contrib [contribSize];
             MortonIdx         pri;
             int               b, pid, stat;
             OOC_SortQueueNode pqueueNodes [numBlk];
             OOC_SortQueue     pqueue;
       #ifdef PMAP_CONTRIB_DBG
             unsigned long     pqCnt = 0;
       #endif
       
             /* ======================================================
              * Block too large for in-core sort -> divide into numBlk
              * subblocks and recurse
              * ====================================================== */
              
       #ifdef PMAP_CONTRIB_DBG
             fprintf(stderr, 
                "*DBG* contribPhotonSort: splitting block [%lu - %lu]\n",
                blkLo, blkHi
             );
       #endif
       
             for (b = 0; b < numBlk; b++) {
                /* Open temporary file as output for subblock */
                if (!(pBlkFile [b] = tmpfile()) || 
                   !(cBlkFile [b] = tmpfile())
                ) {
                   perror("contribPhotonSort: cannot open temp block file");
                   return -1;
                }
 
                /* Set new subblock interval [blkLo2, blkHi2] */
                blkHi2 = blkLo + (b + 1) * blkLen2 - 1;
 
                if (blkHi2 - blkLo2 + 1 <= bufLen) {
                   /* !!! Subblock [blkLo2, blkHi2] will be sorted in-core on
                    * !!! recursion, so fork kid process. But don't fork more
                    * !!! than numProc kids; wait if necessary */
                   while (procCnt >= numProc && wait(&stat) >= 0) {
                      /* Kid proc terminated; check status, update counter */
                      if (!WIFEXITED(stat) || WEXITSTATUS(stat))
                         return -1;
                      procCnt--;
                   }
 
                   /* Now fork kid process */
                   if (!(pid = fork())) {
       #if 0
                      fputs("waiting 10 sec to attach debugga", stderr);
                      sleep(10);
       #endif               
                      /* Recurse on subblock with temp output files */
                      if (contribPhotonSortRecurse(photonIn, contribIn,
                         pBlkFile [b], cBlkFile [b], contribSize, 
                         numBlk, numProc, photonBuf, contribBuf, bufLen,
                         keyData, blkLo2, blkHi2
                      ))
                         exit(-1);
 
                      /* !!! Kid exits here. Apparently the parent's tmpfile 
                       * !!! isn't deleted (which is what we want), though 
                       * !!! some sources suggest using _Exit() instead; 
                       * !!! is this implementation specific? Anyway, we 
                       * !!! leave cleanup to the parent (see below) */
                      exit(0);
                   }
                   else if (pid < 0) {
                      perror("contribPhotonSort: failed to fork subprocess");
                      return -1;
                   }
                   
       #ifdef PMAP_CONTRIB_DBG
                   fprintf(stderr, "*DBG* contribPhotonSort: forking kid %d "
                      "(PID %d) for block %d\n",
                      procCnt, pid, b
                   );
       #endif
 
                   /* Parent continues here */
                   procCnt++;
                }
                else {
                   /* Subblock will NOT be sorted in-core on recursion,
                      so DON'T fork */
                   if (contribPhotonSortRecurse(photonIn, contribIn,
                      pBlkFile [b], cBlkFile [b], contribSize,
                      numBlk, numProc, photonBuf, contribBuf, bufLen,
                      keyData, blkLo2, blkHi2
                   ))
                      return -1;
                }
 
                /* Update offset for next subblock */ 
                blkLo2 = blkHi2 + 1;
             }
 
             /* !!! Wait for any forked processes to terminate */
             while (procCnt && wait(&stat) >= 0) {
                /* Check status, bail out on error or update process count */
                if (!WIFEXITED(stat) || WEXITSTATUS(stat))
                   return -1;
                procCnt--;
             }
 
             /* ==========================================================
              * Subblocks are now sorted; prepare priority queue
              * ========================================================== */
              
       #ifdef PMAP_CONTRIB_DBG
             fprintf(stderr, 
                "*DBG* contribPhotonSort: merging block [%lu - %lu]\n", 
                blkLo, blkHi
             );
       #endif
 
             /* Init priority queue */
             pqueue.head = pqueueNodes;
             pqueue.tail = 0;
             pqueue.len = numBlk;
 
             /* Fill priority queue by peeking and enqueueing first photon
                from each subblock */
             for (b = 0; b < numBlk; b++) {
       #ifdef PMAP_CONTRIB_DBG
                fseek(pBlkFile [b], 0, SEEK_END);
                fseek(cBlkFile [b], 0, SEEK_END);
 
                if (ftell(pBlkFile [b]) % PHOTONSIZE || 
                   ftell(cBlkFile [b]) % contribSize
                ) {
                   fprintf(stderr, "contribPhotonSort: truncated record "
                      "in temp block file %d\n", b
                   );
                   return -1;
                }
                
                fprintf(stderr, "contribPhotonSort: temp block file %d "
                   "contains %lu photons, %lu contribs\n", b, 
                   ftell(pBlkFile [b]) / PHOTONSIZE, 
                   ftell(cBlkFile [b]) / contribSize
                );
       #endif
       
                rewind(pBlkFile [b]);
                rewind(cBlkFile [b]);
 
                if (OOC_SortPeek(pBlkFile [b], PHOTONSIZE, (char*)&photon) ||
                   OOC_SortPeek(cBlkFile [b], contribSize, contrib)
                ) {
                   perror("contribPhotonSort: error reading block file");
                   return -1;
                }
                
                /* Enqueue photon's Morton code as priority */
                pri = PMAPCONTRIB_KEY(&photon, keyData);
 
                if (OOC_SortPush(&pqueue, pri, b) < 0) {
                   perror("contribPhotonSort: failed priority queue insert");
                   return -1;
                }
             }
 
             /* ==========================================================
              * Subblocks now sorted and priority queue filled; 
              * merge subblocks from temporary files 
              * ========================================================== */
              
             do {
                /* Get head of priority queue, read next photon/contrib in 
                 * corresponding subblock, and send to output */
                if ((b = OOC_SortPop(&pqueue)) >= 0) {
                   /* Priority queue not empty */
                   if (OOC_SortRead(pBlkFile [b], PHOTONSIZE, (char*)&photon) ||
                      OOC_SortRead(cBlkFile [b], contribSize, contrib)
                   ) {
                      /* Corresponding photon/contrib should be in subblock */
                      perror("contribPhotonSort: unexpected EOF in block file");
                      return -1;
                   }
 
                   if (OOC_SortWrite(photonOut, PHOTONSIZE, (char*)&photon) ||
                      OOC_SortWrite(contribOut, contribSize, contrib)
                   ) {
                      perror("contribPhotonSort: error writing output file");
                      return -1;
                   }
 
       #ifdef PMAP_CONTRIB_DBG
                   pqCnt++;
       #endif
                   /* Peek next photon/contrib from same subblock and enqueue
                      for next iteration */
                   if (!OOC_SortPeek(pBlkFile [b], PHOTONSIZE, (char*)&photon) &&
                      !OOC_SortPeek(cBlkFile [b], contribSize, contrib)
                   ) {
                      /* Subblock not exhausted */
                      pri = PMAPCONTRIB_KEY(&photon, keyData);
 
                      if (OOC_SortPush(&pqueue, pri, b) < 0) {
                         perror("contribPhotonSort: failed priority enqueue");
                         return -1;
                      }
                   }
                }
             } while (b >= 0);
 
       #ifdef PMAP_CONTRIB_DBG
             fprintf(stderr, "*DBG* contribPhotonSort: dequeued %lu rec\n", 
                pqCnt
             );
             fprintf(stderr, "*DBG* contribPhotonSort: merged file contains "
                "%lu photons, %lu contribs\n",
                ftell(photonOut) / PHOTONSIZE, ftell(contribOut) / contribSize
             );
       #endif
 
             /* Priority queue now empty -> done; close temporary subblock
              * files, which deletes them (see note in kid kode above) */
             for (b = 0; b < numBlk; b++) {
                fclose(pBlkFile [b]);
                fclose(cBlkFile [b]);
             }
 
             /* !!! Commit output file to disk before caller reads it;
              * !!! omitting this apparently leads to corrupted files (race
              * !!! condition?) when the caller tries to read them!  */
             fflush(photonOut);
             fflush(contribOut);
             fsync(fileno(photonOut));
             fsync(fileno(contribOut));
          }
 
          else {
             /* ==========================================================
              * Block is small enough for in-core sort
              * ========================================================== */
 
             const unsigned long  pBlkSize = blkLen * PHOTONSIZE,
                                  cBlkSize = blkLen * contribSize;
             int                  pIfd = fileno(photonIn),
                                  pOfd = fileno(photonOut),
                                  cIfd = fileno(contribIn),
                                  cOfd = fileno(contribOut);
 
       #ifdef PMAP_CONTRIB_DBG
             unsigned long        contribIdx;
             unsigned             i;
             
             fprintf(stderr, "*DBG* contribPhotonSort: Proc %d (%d/%d) "
                "sorting block [%lu - %lu]\n",
                getpid(), procCnt, numProc - 1, blkLo, blkHi
             );
       #endif
 
             /* Atomically seek and read block into in-core sort buffer */
             /* !!! Unbuffered I/O via pread() avoids potential race
              * !!! conditions and buffer corruption which can occur with
              * !!! lseek()/fseek() followed by read()/fread().  */
             if (pread(pIfd, photonBuf, pBlkSize, blkLo*PHOTONSIZE) != pBlkSize ||
                pread(cIfd, contribBuf, cBlkSize, blkLo*contribSize) != cBlkSize
             ) {
                perror("contribPhotonSort: error reading block for sort");
                return -1;
             }
 
       #ifdef PMAP_CONTRIB_DBG
             /* Check compressed contribs for bullshit values BEFORE sort */
             for (contribIdx = 0; contribIdx < blkLen; contribIdx++)
                for (i = 0; i < nCompressedCoeffs; i++)
                   if (((mRGBE*)(contribBuf + contribIdx * contribSize + 
-                           (1 + (nCompressedCoeffs > 1)) * sizeof(COLR)
+                           (WAVELET_PADD4_NUMAPPROX + 1 + 
+                              (nCompressedCoeffs > 1)
+                           ) * sizeof(COLR)
                         )
                      ) [i].dat >= nCoeffs
                   )
                      error(CONSISTENCY, "bullshit contribs BEFORE sort");
       #endif
             /* Quicksort block in-core and write to output file */
             contribPhotonQuickSort(photonBuf, contribBuf, contribSize,
                keyData, 0, blkLen - 1
             );
 
       #ifdef PMAP_CONTRIB_DBG
             /* Check contribs for bullshit values AFTER sort */
             for (contribIdx = 0; contribIdx < blkLen; contribIdx++)
                for (i = 0; i < nCompressedCoeffs; i++)
                   if (((mRGBE*)(contribBuf + contribIdx * contribSize + 
-                           (1 + (nCompressedCoeffs > 1)) * sizeof(COLR)
+                           (WAVELET_PADD4_NUMAPPROX + 1 + 
+                              (nCompressedCoeffs > 1)
+                           ) * sizeof(COLR)
                         )
                      ) [i].dat >= nCoeffs
                   )
                      error(CONSISTENCY, "bullshit contribs AFTER sort");
       #endif
       
             if (write(pOfd, photonBuf, pBlkSize) != pBlkSize ||
                write(cOfd, contribBuf, cBlkSize) != cBlkSize
             ) {
                perror("contribPhotonSort: error writing sorted block");
                return -1;
             }
             
             fsync(pOfd);
             fsync(cOfd);
 
       #ifdef PMAP_CONTRIB_DBG
             fprintf(stderr, "*DBG* contribPhotonSort: proc %d wrote "
                "%lu photons, %lu contribs\n", getpid(), 
                lseek(pOfd, 0, SEEK_END) / PHOTONSIZE,
                lseek(cOfd, 0, SEEK_END) / contribSize
             );
       #endif
          }
 
          return 0;
       }
 
 
 
       int contribPhotonSort (
          const PhotonMap *pmap, FILE *leafFile, FVECT bbOrg, RREAL bbSize, 
          unsigned numBlk, unsigned long blkSize, unsigned numProc
       )
       /* Sort photons and their corresponding binned contributions by
        * subdividing input files into small blocks, sorting these in-core,
        * and merging them out-of-core via a priority queue.  This code is
        * adapted from oocsort.{h,c}
        *
        * The sort is implemented as a recursive (numBlk)-way sort; the input
        * files are successively split into numBlk smaller blocks until these
        * are of size <= blkSize, i.e.  small enough for in-core sorting,
        * then merging the sorted blocks as the stack unwinds.  The in-core
        * sort is parallelised over numProc processes.
        *
        * Parameters are as follows:
        * pmap        Photon Map with opened photon heap and precomputed 
        *             contributions
        * leafFile    Opened output file for sorted photons (=OOC octree leaves)
        * numBlk      Number of blocks to divide into / merge from 
        * blkSize     Max block size and size of in-core sort buffer, in bytes
        * numProc     Number of parallel processes for in-core sort 
        * bbOrg       Origin of bounding box containing photons
        * bbSize      Extent of bounding box containing photons
        */
       {
          Photon               *photonBuf = NULL;
          PreComputedContrib*  preCompContrib = (PreComputedContrib*)(
                                  pmap ? pmap -> preCompContrib : NULL
                               );
          char                 *contribBuf = NULL;
          unsigned long        bufLen;
          int                  stat;
          OOC_KeyData          keyData;
 
          if (numBlk < 1)
             numBlk = 1;
 
          if (!pmap || !preCompContrib || !pmap -> numPhotons || 
             !numBlk || !blkSize
          )
             error(INTERNAL, "contribPhotonSort: nothing to sort");
 
          if (preCompContrib -> nBins <= 1)
             /* Binning disabled, fall back to photon sorting only */
             return OOC_Sort(pmap -> heap, leafFile, 
                PMAP_OOC_NUMBLK, PMAP_OOC_BLKSIZE, numProc, sizeof(Photon), 
                bbOrg, bbSize, OOC_PhotonKey
             );
 
    #ifdef PMAP_CONTRIB_DBG
          nCoeffs           = preCompContrib -> nCoeffs;
          nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
          
          /* Check photon & contrib heap lengths before actual sort */
          if (fseek(pmap -> heap, 0, SEEK_END) < 0 || 
             ftell(pmap -> heap) / PHOTONSIZE != pmap -> numPhotons
          )
             error(INTERNAL, 
                "contribPhotonSort: photon heap length mismatch"
             );
          
          if (fseek(pmap -> contribHeap, 0, SEEK_END) < 0 ||
             ftell(pmap -> contribHeap) / preCompContrib -> contribSize !=
                pmap -> numPhotons
          )
             error(INTERNAL,
                "contribPhotonSort: contribution heap length mismatch"
             );
    #endif
    
          /* Figure out maximum buffer length (number of photons/contribs we 
             can fit into the maximum block size) and allocate in-core sort 
             buffers accordingly. Quit if block size too small */
          bufLen = blkSize / max(PHOTONSIZE, preCompContrib -> contribSize);
          if (!bufLen)
             error(INTERNAL, "contribPhotonSort: block size too small");
          if (!(photonBuf = calloc(bufLen, PHOTONSIZE)) ||
             !(contribBuf = calloc(bufLen, preCompContrib -> contribSize))
          )
             error(SYSTEM, 
                "contribPhotonSort: cannot allocate in-core sort buffer"
             );
 
          /* Set key callback and data; reduces params on stack */
          keyData.key          = OOC_PhotonKey;
          keyData.mortonScale  = MORTON3D_MAX / bbSize;
          VCOPY(keyData.bbOrg, bbOrg);
          
          stat = contribPhotonSortRecurse(
             pmap -> heap, pmap -> contribHeap, 
             leafFile, preCompContrib -> waveletFile, 
             preCompContrib -> contribSize, numBlk, numProc, 
             photonBuf, contribBuf, bufLen, &keyData, 
             0, pmap -> numPhotons - 1
          );
 
          /* Cleanup */
          free(photonBuf);
          free(contribBuf);
 
          return stat;
       }
 
    #endif /* PMAP_CONTRIB */
 #endif /* NIX */
 
diff --git a/wavelet2.h b/wavelet2.h
index 7cb6eb9..5cd5088 100644
--- a/wavelet2.h
+++ b/wavelet2.h
@@ -1,245 +1,256 @@
 /* 
    =========================================================================
    Definitions for 2D wavelet transforms on arrays of 3-tuples.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #ifndef _WAVELET2_H
    #define _WAVELET2_H
 
    /* Wavelet coefficient type; defaults to double if not already defined.
      (may be overridden in compiler command line, for example).
       NOTE: single precision float may improve cache performance during 
       2D transform with large arrays. */
    #ifndef WAVELET_COEFF
       #define WAVELET_COEFF   double
    #endif
 
    /* Perform final Haar wavelet transform on coarsest two detail levels.
     * This has the advantage of resulting in a single coarsest level
     * approximation coefficient, possibly at the expense of poorer
     * compression */
    /* #define WAVELET_FINAL_HAAR */
 
    /* Boundary extension modes for padded Daubechies D4 wavelet transform */
    #define WAVELET_EXTEND_CONST     0     /* Constant (default if undef) */
    #define WAVELET_EXTEND_GRAD1     1     /* 1st order gradient */
    #define WAVELET_EXTEND_GRAD2     2     /* 2nd order gradient */
    #define WAVELET_EXTEND_CYCL      3     /* Periodic (wraparound) */
    #define WAVELET_EXTEND_SYMM      4     /* Symmetric (reflection) */
 
    #ifndef WAVELET_EXTEND
       #define WAVELET_EXTEND_MODE   WAVELET_EXTEND_GRAD1
    #endif
 
    
    /* Convenience macros for handling coefficients */
    #define min(a, b)          ((a) < (b) ? (a) : (b))
    #define max(a, b)          ((a) > (b) ? (a) : (b))
    #define coeffAvg(a)        (((a) [0] + (a) [1] + (a) [2]) / 3)
    
    #define coeffIsEmpty(c) ( \
       isnan((c) [0]) || isnan((c) [1]) || isnan((c) [2]) \
    )
    #define coeffIsZero(c)  ((c) [0] == 0 && (c) [1] == 0 && (c) [2] == 0)
+
+   /* Number of approximation coefficients per dimension in upper left of
+    * the wavelet matrix after a full Daubechies D4 transform with boundary
+    * padding */
+   #define WAVELET_PADD4_APPROXDIM     3
+   #define WAVELET_PADD4_NUMAPPROX     ( \
+      (WAVELET_PADD4_APPROXDIM) * (WAVELET_PADD4_APPROXDIM) \
+   )
+   /* Number of Daubechies D4 detail coefficients resulting from the above
+    * based on the number of total number of coefficients */
+   #define WAVELET_PADD4_NUMDETAIL(n)  ((n) - WAVELET_PADD4_NUMAPPROX)
    
    /* Determine if coeff tuple c is thresholded, also including empty coeffs
     * in debugging mode, since these will be unconditionally dropped. Note
     * that the 3x3 coarsest approximation coefficients in the upper left of
     * matrix c are not thresholded, since they are essential for a
     * reasonably accurate reconstruction! */
    /* #ifdef WAVELET_FINAL_HAAR */
    #if 0
       #define coeffThresh(c, i, j, t)  ( \
          fabs(t) > 0 && ((i) || (j)) && ( \
             coeffIsEmpty((c) [i][j]) || fabs((c) [i][j][0]) < (t) || \
             fabs((c) [i][j][1]) < (t) || fabs((c) [i][j][2]) < (t) \
          ) \
       )
    #else
       #define coeffThresh(c, i, j, t)  ( \
          fabs(t) > 0 && ((i) > 2 || (j) > 2) && ( \
             coeffIsEmpty((c) [i][j]) || fabs((c) [i][j][0]) < (t) || \
             fabs((c) [i][j][1]) < (t) || fabs((c) [i][j][2]) < (t) \
          ) \
       )
    #endif
    
 
 
    /* Haar / Daubechies D4 coefficients */
    #define SQRT2        1.41421356
    #define SQRT3        1.73205081
    #define H4NORM       (0.25 / SQRT2)
 
    /* Haar wavelet coeffs */
    extern const WAVELET_COEFF h2;
 
    /* Daubechies D4 wavelet coeffs */   
    extern const WAVELET_COEFF h4 [4];
    extern const WAVELET_COEFF g4 [4];
 
 
 
    /* Wavelet matrix defs; these are stored as arrays of pointers (a.k.a.
     * Iliffe vectors), as this proved to be more performant than a flattened
     * representation. Encapsulating the coefficient's triplets in a struct
     * prevents the compiler from introducing another layer of indirection.
     * */
    typedef WAVELET_COEFF      WaveletCoeff3 [3];
    
    #define WAVELET_COEFFSIZE  (sizeof(WaveletCoeff3))
    typedef WaveletCoeff3      **WaveletMatrix2;
 
    
    
    /* 
       ----------- 2D COEFFICIENT MATRIX ALLOC/INIT ROUTINES -----------
    */
 
    WaveletMatrix2 allocWaveletMatrix2 (unsigned len);
    /* Allocate and return a 2D coefficient array of size (len x len).
       Returns NULL if allocation failed. */
 
    
    void freeWaveletMatrix2 (WaveletMatrix2 y, unsigned len);
    /* Free previously allocated 2D coeff array y of size (len x len) */
 
 
    void zeroCoeffs2 (WaveletMatrix2 y, unsigned len);
    /* Set 2D array coefficients to zero */
 
 
    /* 
       ---------- 2D WAVELET TRANSFORM OPTIMISED FOR SIZES 2^l ----------
       ------ THESE USE PERIODIC (WRAP-AROUND) BOUNDARY EXTENSION -------
    */
    
    int waveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l);
    /*
       Perform full 2D multiresolution forward wavelet transform on array y
       of size (2^l) x (2^l) containing original signal as 3-tuples, where 
       l >= 1.  Note no intra-tuple transform occurs.
       
       The wavelet coefficients are returned in array y, containing the
       coarsest approximation in y [0][0] followed by horizontal/vertical
       details in order of increasing resolution/frequency.
       
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  It is recommended to preallocate yt to the
       maximum expected size.  The dimensions of yt are not checked; this is
       the caller's responsibility.
          
       Returns 0 on success, else -1.
    */
    
 
    int waveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l);
    /* 
       Perform full 2D multiresolution inverse wavelet transform on array y
       of size (2^l) x (2^l) containing wavelet coefficients as 3-tuples,
       where l >= 1.  Note no intra-tuple transform occurs.
 
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  It is recommended to preallocate yt to the
       maximum expected size.  The dimensions of yt are not checked; this is
       the caller's responsibility.
       
       The reconstructed signal is returned in array y.  Returns 0 on
       success, else -1.
    */
 
 
 
    /* 
       ------------ 2D WAVELET TRANSFORM FOR ARBITRARY SIZES -------------
       ---- THESE USE CONSTANT BOUNDARY EXTENSION WITH PADDING COEFFS-----
    */
    
    unsigned padWaveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
       unsigned yLen0, unsigned *numNonZero
    );
    /* Perform full 2D multiresolution forward wavelet transform with padded
       boundary coefficients on array y containing (yLen0 x yLen0) samples of
       3-tuples.  Note no intra-tuple transform occurs.  An additional array
       yt of identical dimension to y is required as buffer for intermediate
       results.
       
       The wavelet coefficients are returned in array y, containing the
       coarsest approximation coefficients in y [0][0] followed by
       horizontal/vertical detail coefficients in order of increasing
       resolution/frequency.
 
       NOTE: Both y and yt must be large enough to accommodate the additional
       padding coefficients introduced at the array boundaries during the
       transform.  It is the caller's responsibility to preallocate and
       dimension these arrays accordingly with allocWaveletMatrix2().  The
       dimension of the output array (which generally exceeds the input
       dimension yLen0) must be interrogated beforehand by calling the
       function with y or yt set to NULL.
       
       The returned value is the output dimension, or 0 if the transform
       failed.  In addition, the number of nonzero coefficients is returned
       in numNonZero (if not NULL), which is a subset of the number of output
       coefficients, i.e.  (output dimension)^2.
    */
    
    
    unsigned padWaveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt,
       unsigned yLen0, unsigned yLenOrg
    );
    /* Perform full 2D multiresolution inverse wavelet transform on array y
       containing (yLen0 x yLen0) boundary padded wavelet coefficients as
       3-tuples.  The original number of samples (yLenOrg x yLenOrg) is
       required for the reconstruction.  An additional array yt of identical
       dimension to y is required as buffer for intermediate results.
     
       The reconstructed samples are returned in y.  This is smaller than
       (yLen0 x yLen0) since the boundary coefficients from the forward
       transform are collapsed.
     
       The returned value is the reconstructed number of samples per
       dimension (yLenOrg), or 0 if the inverse transform failed.
     */ 
    
   
    
    #ifdef WAVELET_DBG
       /* 
          ----------------- SUPPORT ROUTINES FOR DEBUGGING -----------------
       */
       
       void clearCoeffs2 (WaveletMatrix2 y, unsigned len);
       /* Clear 2D array to facilitate debugging */
       
       
       void dumpCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2,
          unsigned y1Len, unsigned y2Len, float thresh 
       );
       /* 
          Dump 2D arrays y1 and y2 of dims y1Len resp.  y2Len side-by-side to
          stdout (skip y2 if NULL).  Coefficients with absolute magnitude
          less than thresh are marked with brackets ('[]') as thresholded.
       */
       
       
       float rmseCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2, 
          unsigned len
       );
       /* Calculate RMSE between 2D matrices y1 and y2 */
    #endif
 #endif