diff --git a/pmapcontrib.c b/pmapcontrib.c
index 397181b..48eb083 100644
--- a/pmapcontrib.c
+++ b/pmapcontrib.c
@@ -1,1148 +1,1153 @@
 #ifndef lint
 static const char RCSid[] = "$Id: pmapcontrib.c,v 2.20 2021/02/16 20:06:06 greg Exp $";
 #endif
 
 /* 
    =========================================================================
    Photon map routines for precomputed light source contributions. 
    These routines handle contribution binning, compression and encoding,
    and are used by mkpmap.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #147053, "Daylight Redirecting Components",
         SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id: pmapcontrib.c,v 2.20 2021/02/16 20:06:06 greg Exp $
 */
 
 
 
 #include "pmapcontrib.h"
 #include "pmapdiag.h"
 #include "pmaprand.h"
 #include "pmapmat.h"
 #include "pmapsrc.h"
 #include "pmutil.h"
 #include "otspecial.h"
 #include "otypes.h"
 #include "lookup.h"
 
 
 
 #ifdef PMAP_CONTRIB
 /* The following are convenient placeholders interfacing to mkpmap 
    and rcontrib. They can be externally set via initPmapContribTab()
    and referenced within the contrib pmap modules. These variables 
    can then be safely ignored by rtrace/rpict/rvu, without annoying
    linking errors. */
 /* Global pointer to rcontrib's contribution binning LUT */
 LUTAB *pmapContribTab = NULL;
 /* Contribution/coefficient mode flag */
 int *pmapContribMode;
 
 
 
 extern void SDdisk2square(double sq[2], double diskx, double disky);
 
 
 
 #if 0
    int xy2lin (unsigned scDim, int x, int y)
    /* Serialise 2D contribution coords in (scDim x scDim) Shirley-Chiu square
       to 1D index.  Returns -1 if coordinates invalid */
    {
       return x < 0 || y < 0 
          ? -1
          : (x * scDim) + y;
    }
 
 
 
    void lin2xy (unsigned scDim, int bin, int *x, int *y)
    /* Deserialise 1D contribution index to 2D coordinates in (scDim x scDim)
     * Shirley-Chiu square.  Returns -1 if bin invalid */
    {
       *x = bin < 0 ? -1 : bin / scDim;
       *y = bin < 0 ? -1 : bin % scDim;
    }
 #endif
 
 
 
 static int ray2bin (const RAY *ray, unsigned scDim)
 /* Map ray dir (pointing away from origin) to its 1D bin in an
    (scDim x scDim) Shirley-Chiu square.
    Returns -1 if mapped bin is invalid (e.g. behind plane) */
 {
    static int        scRHS, varInit = 0;
    static FVECT      scNorm, scUp;
    unsigned          scBin [2];
    FVECT             diskPlane;
    RREAL             dz, diskx, disky, rad, diskd2, scCoord [2];
 
    if (!varInit) {
       /* Lazy init shirley-Chiu mapping orientation from function variables */
       scRHS       = varvalue(PMAP_CONTRIB_SCRHS);
       scNorm [0]  = varvalue(PMAP_CONTRIB_SCNORMX);
       scNorm [1]  = varvalue(PMAP_CONTRIB_SCNORMY);
       scNorm [2]  = varvalue(PMAP_CONTRIB_SCNORMZ);
       scUp [0]    = varvalue(PMAP_CONTRIB_SCUPX);
       scUp [1]    = varvalue(PMAP_CONTRIB_SCUPY);
       scUp [2]    = varvalue(PMAP_CONTRIB_SCUPZ);
       varInit ^= 1;
    }
 
    /* Map incident direction to disk */
    dz = DOT(ray -> rdir, scNorm);                         /* normal proj */
    if (dz > 0) {
       fcross(diskPlane, scUp, scNorm);    /* y-axis in plane, perp to up */
       diskPlane [0] *= scRHS; 
       diskx = DOT(ray -> rdir, diskPlane);
       disky = DOT(ray -> rdir, scUp) - dz * DOT(ray -> rdir, scUp);
       diskd2 = diskx * diskx + disky * disky;       /* in-plane length^2 */
       rad = dz>FTINY ? sqrt((1 - dz*dz) / diskd2) : 0;  /* radial factor */
       disky *= rad;                 /* diskx, disky now in range [-1, 1] */
 
       /* Apply Shirley-Chiu mapping of (diskx, disky) to square */
       SDdisk2square(scCoord, diskx, disky);
       /* Map Shirley-Chiu square coords to 1D bin */
       scBin [0] = scCoord [0] * scDim;
       scBin [1] = scCoord [1] * scDim;
       return PMAP_CONTRIB_XY2LIN(scDim, scBin [0], scBin [1]);
    }
    else return -1;
 }
 
 
 /* ------------------ CONTRIBSRC  STUFF --------------------- */
 
 
 #ifndef PMAP_CONTRIB_TEST
    MODCONT *addContribModifier (LUTAB *contribTab, unsigned *numContribs, 
       char *mod, char *binParm, int binCnt
    )
    /* Add light source modifier mod to contribution lookup table contribTab,
       and update numContribs.  Return initialised contribution data for this
       modifier. This code adapted from rcontrib.c:addmodifier(). */
    {
       LUENT    *lutEntry = lu_find(contribTab, mod);
       MODCONT  *contrib;
       EPNODE   *eBinVal;
 
+      /* Check for bin overflow in mRGBE encoding BEFORE photon distrib! */
+      if (binCnt > PMAP_CONTRIB_MAXBINS) {
+         sprintf(errmsg, "too many bins for modifier %s", mod);
+         error(USER, errmsg);
+      } 
       if (lutEntry -> data) {
          /* Reject duplicate modifiers */
          sprintf(errmsg, "duplicate light source modifier %s", mod);
          error(USER, errmsg);
       }
       if (*numContribs >= MAXMODLIST) {
          sprintf(errmsg, "too many modifiers (max. %d)", MAXMODLIST);
          error(INTERNAL, errmsg);
       }
       lutEntry -> key = mod;
       
       if (binCnt <= 0) {
          sprintf(errmsg, "undefined/invalid bin count for modifier %s", mod);
          error(USER, errmsg);
       }
       
       /* Allocate and init contributions */
       contrib = (MODCONT *)malloc(
          sizeof(MODCONT) + sizeof(DCOLOR) * (binCnt - 1)
       );
       if (!contrib)
          error(SYSTEM, "out of memory in addContribModifier()");
       contrib -> outspec   = NULL;
       contrib -> modname   = mod;
       contrib -> params    = binParm;
       contrib -> nbins     = binCnt;
       contrib -> binv      = eBinVal;
       contrib -> bin0      = 0;
       memset(contrib -> cbin, 0, sizeof(DCOLOR) * binCnt);
       lutEntry -> data = lutEntry -> data = (char *)contrib;
       ++(*numContribs);
 
       return(contrib);
    }
 
 
 
    void addContribModfile (LUTAB *contribTab, unsigned *numContribs, 
       char *modFile, char *binParm, int binCnt
    )
    /* Add light source modifiers from file modFile to contribution lookup table
     * contribTab, and update numContribs. 
     * NOTE: This code is adapted from rcontrib.c */
    {
       char  *mods [MAXMODLIST];
       int   i;
       
       /* Find file and store strings */
       i = wordfile(mods, MAXMODLIST, getpath(modFile, getrlibpath(), R_OK));
       if (i < 0) {
          sprintf(errmsg, "can't open modifier file %s", modFile);
          error(SYSTEM, errmsg);
       }
       if (*numContribs + i >= MAXMODLIST - 1) {
          sprintf(errmsg, "too many modifiers (max. %d) in file %s", 
             MAXMODLIST - 1, modFile
          );
          error(INTERNAL, errmsg);
       }
 
       for (i = 0; mods [i]; i++)
          /* Add each modifier */
          addContribModifier(contribTab, numContribs, 
             mods [i], binParm, binCnt
          );
    }
 
 
 
    static int contribSourceBin (LUTAB *contribs, const RAY *ray)
    /* Map contribution source ray to its bin for light source ray -> rsrc,
       using Shirley-Chiu disk-to-square mapping.
       Return invalid bin -1 if mapping failed. */
    {
       const SRCREC   *src;
       const OBJREC   *srcMod;
       const MODCONT  *srcCont;
       RAY            srcRay;
       int            bin, i;
 
       /* Check we have a valid ray and contribution LUT */
       if (!ray || !contribs)
          return -1;
 
       src = &source [ray -> rsrc];
       srcMod = findmaterial(src -> so);
       srcCont = (MODCONT*)lu_find(contribs, srcMod -> oname) -> data;
 
       if (!srcCont)
          /* Not interested in this source (modifier not in contrib LUT) */
          return -1;
 
       /* Set up shadow ray pointing to source for disk2square mapping */
       rayorigin(&srcRay, SHADOW, NULL, NULL);
       srcRay.rsrc = ray -> rsrc;
       VCOPY(srcRay.rorg, ray -> rop);
       for (i = 0; i < 3; i++)
          srcRay.rdir [i] = -ray -> rdir [i];
               
       if (!(src -> sflags & SDISTANT 
          ? sourcehit(&srcRay) 
          : (*ofun[srcMod -> otype].funp)(srcMod, &srcRay)
       ))
          /* (Redundant?) sanity check for valid source ray? */
          return -1;
    #if 0
       worldfunc(RCCONTEXT, &srcRay);
    #endif      
       set_eparams((char*)srcCont -> params);
 
       if ((bin = ray2bin(&srcRay, sqrt(srcCont -> nbins))) < 0)
          error(WARNING, "Ignoring invalid bin in contribSourceBin()");
 
       return bin;
    }
 
 
 
    PhotonContribSourceIdx newPhotonContribSource (PhotonMap *pmap, 
       const RAY *contribSrcRay, FILE *contribSrcHeap
    )
    /* Add contribution source ray for emitted contribution photon and save 
     * light source index and binned direction. The current contribution 
     * source is stored in pmap -> lastContribSrc. If the previous contrib
     * source spawned photons (i.e. has srcIdx >= 0), it's appended to 
     * contribSrcHeap. If contribSrcRay == NULL, the current contribution 
     * source is still flushed, but no new source is set. Returns updated 
     * contribution source counter pmap -> numContribSrc. */
    {
       if (!pmap || !contribSrcHeap)
          return 0;
          
       /* Check if last contribution source has spawned photons (srcIdx >= 0, 
        * see newPhoton()), in which case we save it to the heap file before 
        * clobbering it. (Note this is short-term storage, so we doan' need 
        * da portable I/O stuff here). */
       if (pmap -> lastContribSrc.srcIdx >= 0) {
          if (!fwrite(&pmap -> lastContribSrc, sizeof(PhotonContribSource), 
             1, contribSrcHeap
          ))
             error(SYSTEM, "failed writing photon contrib source in "
                "newPhotonContribSource()"
             );
             
          pmap -> numContribSrc++;
          if (!pmap -> numContribSrc || 
             pmap -> numContribSrc > PMAP_MAXCONTRIBSRC
          );
       }
 
       if (contribSrcRay) {
          /* Mark this contribution source unused with a negative source
             index until its path spawns a photon (see newPhoton()) */
          pmap -> lastContribSrc.srcIdx = -1;
 
          /* Map ray to bin in anticipation that this contrib source will be
             used, since the ray will be lost once a photon is spawned */
          pmap -> lastContribSrc.srcBin = contribSourceBin(
             pmapContribTab, contribSrcRay
          );
 
          if (pmap -> lastContribSrc.srcBin < 0) {
             /* Warn if invalid bin, but trace photon nonetheless. It will
                count as emitted to prevent bias, but will not be stored in
                newPhoton(), as it contributes zero flux */
             sprintf(errmsg, "invalid bin for light source %s, "
                "contribution photons will be discarded", 
                source [contribSrcRay -> rsrc].so -> oname
             );
             error(WARNING, errmsg);
          }
       }
 
       return pmap -> numContribSrc;
    }
 
 
 
    PhotonContribSourceIdx buildContribSources (PhotonMap *pmap, 
       FILE **contribSrcHeap, char **contribSrcHeapFname, 
       PhotonContribSourceIdx *contribSrcOfs, unsigned numHeaps
    )
    /* Consolidate per-subprocess contribution source heaps into array
     * pmap -> contribSrc. Returns offset for contribution source index 
     * linearisation in pmap -> numContribSrc. The heap files in 
     * contribSrcHeap are closed on return. */
    {
       PhotonContribSourceIdx  heapLen;
       unsigned                heap;
 
       if (!pmap || !contribSrcHeap || !contribSrcOfs || !numHeaps)
          return 0;
 
       pmap -> numContribSrc = 0;
 
       for (heap = 0; heap < numHeaps; heap++) {
          contribSrcOfs [heap] = pmap -> numContribSrc;
          
          if (fseek(contribSrcHeap [heap], 0, SEEK_END) < 0)
             error(SYSTEM, "failed photon contrib source seek "
                "in buildContribSources()"
             );
          pmap -> numContribSrc += heapLen = 
             ftell(contribSrcHeap [heap]) / sizeof(PhotonContribSource);
 
          if (!(pmap -> contribSrc = realloc(pmap -> contribSrc, 
             pmap -> numContribSrc * sizeof(PhotonContribSource)
          ))) 
             error(SYSTEM, "failed photon contrib source alloc "
                "in buildContribSources()"
             );
 
          rewind(contribSrcHeap [heap]);
          if (fread(pmap -> contribSrc + contribSrcOfs [heap], 
             sizeof(PhotonContribSource), heapLen, contribSrcHeap [heap]
          ) != heapLen)
             error(SYSTEM, "failed reading photon contrib source "
                "in buildContribSources()"
             );
          
          fclose(contribSrcHeap [heap]);
          unlink(contribSrcHeapFname [heap]);
       }
       
       return pmap -> numContribSrc;
    }
 
 
 
    /* ----------------- CONTRIB PMAP ENCODING STUFF -------------------- */
 
 
 
    static int coeffCompare (const void *c1, const void *c2)
    /* Comparison function to REVERSE sort thresholded coefficients */
    {
       const PreComputedContribCoeff *tcoeff1 = c1, *tcoeff2 = c2;
       /* Use dot product as magnitude to compare _absolute_ values */
       const WAVELET_COEFF  v1 = DOT(tcoeff1 -> coeff, tcoeff1 -> coeff),
                            v2 = DOT(tcoeff2 -> coeff, tcoeff2 -> coeff);
       
       if (v1 < v2)
          return 1;
       else if (v1 > v2)
          return -1;
       else
          return 0;
    }
 
 
 
    static int thresholdContribs (PreComputedContrib *preCompContrib)
    /* Threshold wavelet detail coefficients in preCompContrib ->
       waveletMatrix [1..coeffDim-1] [1..coeffDim-1] by keeping the
       (preCompContrib -> nCompressedCoeffs) largest of these and returning them
       in preCompContrib -> compressedCoeffs along with their original order. 
       NOTE: preCompContrib -> waveletMatrix [0][0] is the average wavelet
       coefficient and excluded from thresholding.  
       Returns 0 on success. */
    {
       unsigned                i, j, coeffDim, coeffIdx;
       WaveletMatrix2          waveletMatrix;
       PreComputedContribCoeff *threshCoeffs;
       
       if (!preCompContrib || !(coeffDim = preCompContrib -> coeffDim) ||
          !(threshCoeffs = preCompContrib -> compressedCoeffs) ||
          !(waveletMatrix = preCompContrib -> waveletMatrix)
       )
          /* Should be initialised by caller! */
          return -1;
 
       /* Set up coefficient buffer for compression (thresholding), skipping
        * coefficient at waveletMatrix [0][0] since it's the average and
        * therefore not thresholded.  The 2D coefficient indices (matrix
        * coords) are linearised to 1D.  */
       for (i = 0; i < coeffDim; i++)
          for (j = 0; j < coeffDim; j++) {
             coeffIdx = PMAP_CONTRIB_XY2LIN(coeffDim, i, j);
             
             if (coeffIdx) {
                /* Set up pointers to coeffs (sorted faster than 3 doubles)
                   and remember original (linearised) index prior to sort */
                threshCoeffs [coeffIdx - 1].idx = coeffIdx;
                threshCoeffs [coeffIdx - 1].coeff = (WAVELET_COEFF*)&(
                   waveletMatrix [i] [j]
                );
             }
          }
       
       /* REVERSE sort coeffs by magnitude; the non-thresholded coeffs
          will then start at threshCoeffs [0] (note nCoeffs == coeffDim^2) */
       /* XXX: PARTITIONING WOULD ACTUALLY BE SUFFICIENT AND FASTER! */
       qsort(threshCoeffs, preCompContrib -> nCoeffs - 1,
          sizeof(PreComputedContribCoeff), coeffCompare
       );
       
       return 0;
    }
 
 
 
    static int encodeContribs (PreComputedContrib *preCompContrib, 
       float compressRatio
    )
    /* Apply wavelet transform to input matrix preCompContrib ->
       waveletMatrix and compress according to compressRatio, storing
       thresholded and mRGBE-encoded coefficients in preCompContrib ->
       mrgbeCoeffs.  Note that the average coefficient is not encoded, and
       returned in preCompContrib -> waveletMatrix [0][0].  
       Returns 0 on success.  */
    {
       unsigned                   i, j, k, scDim;
       WaveletMatrix2             waveletMatrix, tWaveletMatrix;
       PreComputedContribCoeff    *threshCoeffs;
       mRGBERange                 *mrgbeRange;
       mRGBE                      *mrgbeCoeffs;
       WAVELET_COEFF              absCoeff;
    #ifdef PMAP_CONTRIB_DBG
       WaveletCoeff3              decCoeff;
       unsigned                   decIdx;
    #endif 
       
       if (!preCompContrib || !preCompContrib -> mrgbeCoeffs ||
          !preCompContrib -> compressedCoeffs || 
          !(waveletMatrix = preCompContrib -> waveletMatrix) || 
          !(tWaveletMatrix = preCompContrib -> tWaveletMatrix) ||
          !(scDim = preCompContrib -> scDim)
       )
          /* Should be initialised by the caller! */
          return -1;
       
    #ifdef PMAP_CONTRIB_DBG
       for (i = 0; i < scDim; i++) {
          for (j = 0; j < scDim; j++) {
             for (k = 0; k < 3; k++) {
       #if 0
                /* Set contributions to bins for debugging */
                waveletMatrix [i] [j] [k] = PMAP_CONTRIB_XY2LIN(scDim, i, j);
       #else
                /* Replace contribs with "bump" function */
                waveletMatrix [i] [j] [k] =
                   (1. - fabs(i - scDim/2. + 0.5) / (scDim/2. - 0.5)) *
                   (1. - fabs(j - scDim/2. + 0.5) / (scDim/2. - 0.5));
       #endif
             }
                
       #if 0
             /* Dump contribs prior to encoding for debugging */         
             printf("% 7.3g\t", colorAvg(waveletMatrix [i] [j]));
          }
          putchar('\n');
       }
       putchar('\n');
       #else
          }
       }
       #endif
    #endif
 
       /* Do 2D wavelet transform on preCompContrib -> waveletMatrix */
       if (padWaveletXform2(waveletMatrix, tWaveletMatrix, scDim, NULL) !=
          preCompContrib -> coeffDim
       )
          error(INTERNAL, "failed wavelet transform in encodeContribs()");
 
       /* Compress wavelet detail coeffs by thresholding */
       if (thresholdContribs(preCompContrib) < 0)
          error(INTERNAL, "failed wavelet compression in encodeContribs()");
       threshCoeffs = preCompContrib -> compressedCoeffs;
 
       /* Init per-channel coefficient range for mRGBE encoding */
       mrgbeRange = &preCompContrib -> mrgbeRange;
       setcolor(mrgbeRange -> min, FHUGE, FHUGE, FHUGE);
       setcolor(mrgbeRange -> max, 0, 0, 0);
 
       /* Update per-channel coefficient range */
       for (i = 0; i < preCompContrib -> nCompressedCoeffs; i++) {
          for (k = 0; k < 3; k++) {
    #ifdef PMAP_CONTRIB_DBG
       #if 0
             /* Replace wavelet coefficient with bin for debugging, ordering
                bins linearly */ 
             threshCoeffs [i].coeff [k] = threshCoeffs [i].idx = i + 1;
       #endif
    #endif
             absCoeff = fabs(threshCoeffs [i].coeff [k]);
             
             if (absCoeff < mrgbeRange -> min [k])
                mrgbeRange -> min [k] = absCoeff;
             if (absCoeff > mrgbeRange -> max [k])
                mrgbeRange -> max [k] = absCoeff;
          }
       }
       
       if (preCompContrib -> nCompressedCoeffs == 1)
          /* Maximum compression with just 1 (!) compressed detail coeff (is
           * this even useful?), so mRGBE range is undefined since min & max
           * coincide; set minimum to 0, maximum to the single remaining
           * coefficient */
          setcolor(mrgbeRange -> min, 0, 0, 0);
       
       /* Init mRGBE coefficient normalisation from range */
       if (!mRGBEinit(mrgbeRange, mrgbeRange -> min, mrgbeRange -> max))
          return -1;
          
       mrgbeCoeffs = preCompContrib -> mrgbeCoeffs;
 
       /* Encode wavelet detail coefficients to mRGBE */
       for (i = 0; i < preCompContrib -> nCompressedCoeffs; i++) {
          /* XXX: threshCoeffs [i].idx could overflow the mRGBE data field,
           * since there are more padded coefficients than bins!!!
           * SORT BY IDX AND USE AN INCREMENTAL INDEXING HERE INSTEAD??? */
          mrgbeCoeffs [i] = mRGBEencode(threshCoeffs [i].coeff, 
             mrgbeRange, threshCoeffs [i].idx
          );
 
    #ifdef PMAP_CONTRIB_DBG
          /* Encoding sanity check */
          decIdx = mRGBEdecode(mrgbeCoeffs [i], mrgbeRange, decCoeff);
       #if 0
          if (decCoeff != threshCoeffs [i].idx ||
             sqrt(dist2(decCoeff, threshCoeffs [i].coeff)) > 1.7
          )
             error(CONSISTENCY, "failed sanity check in encodeContribs()");
       #endif
          for (k = 0; k < 3; k++)
             if (decCoeff [k] / threshCoeffs [i].coeff [k] < 0)
                error(CONSISTENCY, 
                   "coefficient sign reversal in encodeContribs()"
                );
    #endif
       }
          
       return 0;
    }
 
 
 
    static void initContribHeap (PhotonMap *pmap)
    /* Initialise precomputed contribution heap */
    {
       int fdFlags;
       
       if (!pmap -> contribHeap) {
          /* Open heap file for precomputed contributions */
          mktemp(strcpy(pmap -> contribHeapFname, PMAP_TMPFNAME));
          if (!(pmap -> contribHeap = fopen(pmap -> contribHeapFname, "w+b")
             )
          )
             error(SYSTEM, "failed opening precomputed contribution "
                "heap file in initContribHeap()"
             );
    #ifdef F_SETFL /* XXX is there an alternative needed for Wind0z? */
          fdFlags = fcntl(fileno(pmap -> contribHeap), F_GETFL);
          fcntl(fileno(pmap -> contribHeap), F_SETFL, fdFlags | O_APPEND);
    #endif   /* ftruncate(fileno(pmap -> heap), 0); */
       }
    }
 
 
 
    static MODCONT *getPhotonContrib (PhotonMap *pmap, RAY *ray, COLOR irrad)
    /* Locate photons near ray -> rop which originated from the light source
       modifier indexed by ray -> rsrc, and accumulate their contributions 
       in pmap -> srcContrib. Return total contributions in irrad and
       pointer to binned contributions (or NULL if none were found). */
    {
       const OBJREC            *srcMod = findmaterial(source [ray->rsrc].so);
       MODCONT                 *contrib = (MODCONT*)lu_find(
                                  pmapContribTab, srcMod -> oname
                               ) -> data;
       Photon                  *photon;
       COLOR                   flux;
       PhotonSearchQueueNode   *sqn;
       float                   r2, norm;
       unsigned                i;
 
       /* Zero bins for this source modifier */     
       memset(contrib -> cbin, 0, sizeof(DCOLOR) * contrib -> nbins);
       setcolor(irrad, 0, 0, 0);
     
       if (!contrib || !pmap -> maxGather) 
          /* Modifier not in LUT or zero bandwidth */
          return NULL;
    
       /* Lookup photons */
       pmap -> squeue.tail = 0;
       /* Pass light source index to filter in findPhotons() via 
          pmap -> lastContribSrc */ 
       pmap -> lastContribSrc.srcIdx = ray -> rsrc;
       findPhotons(pmap, ray);
       
       /* Need at least 2 photons */
       if (pmap -> squeue.tail < 2) {
    #ifdef PMAP_NONEFOUND
          sprintf(errmsg, "no photons found on %s at (%.3f, %.3f, %.3f)", 
             ray -> ro ? ray -> ro -> oname : "<null>",
             ray -> rop [0], ray -> rop [1], ray -> rop [2]
          );
          error(WARNING, errmsg);
    #endif
          return NULL;
       }
 
       /* Avg radius^2 between furthest two photons to improve accuracy */ 
       sqn = pmap -> squeue.node + 1;
       r2 = max(sqn -> dist2, (sqn + 1) -> dist2);
       r2 = 0.25 * (pmap -> maxDist2 + r2 + 2 * sqrt(pmap -> maxDist2 * r2));
 
       /* XXX: OMIT extra normalisation factor 1/PI for ambient calc? */
    #ifdef PMAP_EPANECHNIKOV
       /* Normalise accumulated flux by Epanechnikov kernel integral in 2D
          (see Eq. 4.4, p.76 in Silverman, "Density Estimation for 
          Statistics and Data Analysis", 1st Ed., 1986, and
          Wann Jensen, "Realistic Image Synthesis using Photon Mapping"),
          include RADIANCE-specific lambertian factor PI */
       norm = 2 / (PI * PI * r2);
    #else
       /* Normalise accumulated flux by search area PI * r^2, including
          RADIANCE-specific lambertian factor PI */
       norm = 1 / (PI * PI * r2);
    #endif 
 
       /* Skip the extra photon */
       for (i = 1 ; i < pmap -> squeue.tail; i++, sqn++) {
          /* Get photon's contribution to density estimate */
          photon = getNearestPhoton(&pmap -> squeue, sqn -> idx);
          getPhotonFlux(photon, flux);
          scalecolor(flux, norm);
    #ifdef PMAP_EPANECHNIKOV
          /* Apply Epanechnikov kernel to photon flux based on distance */
          scalecolor(flux, 1 - sqn -> dist2 / r2);
    #endif
          addcolor(irrad, flux);
          addcolor(contrib -> cbin [photonSrcBin(pmap, photon)], flux);
       }
 
       return contrib;
    }
    
    
    
    void freePreCompContribNode (void *p)
    /* Clean up precomputed contribution LUT entry */   
    {
       PhotonMap            *preCompContribPmap = (PhotonMap*)p;
       PreComputedContrib   *preCompContrib;
       
       if (preCompContribPmap) {
          preCompContrib = (PreComputedContrib*)(
             preCompContribPmap -> preCompContrib
          );
          
          if (preCompContrib) {
             /* Free primary and transposed wavelet matrices */
             freeWaveletMatrix2(preCompContrib -> waveletMatrix,
                preCompContrib -> coeffDim
             );
             freeWaveletMatrix2(preCompContrib -> tWaveletMatrix, 
                preCompContrib -> coeffDim
             );
          
             /* Free thresholded coefficients */
             free(preCompContrib -> compressedCoeffs);
          
             /* Free mRGBE encoded coefficients */
             free(preCompContrib -> mrgbeCoeffs);
             free(preCompContrib -> waveletFname);
             
             if (preCompContrib -> cache) {
                /* Free contribution cache */
                OOC_DeleteCache(preCompContrib -> cache);
                free(preCompContrib -> cache);
             }
          }
          
          /* Clean up precomputed contrib photon map */
          deletePhotons(preCompContribPmap);
          free(preCompContribPmap);
       }
    }
 
 
 
    void initPreComputedContrib (PreComputedContrib *preCompContrib)
    /* Initialise precomputed contribution container in photon map */
    {
       preCompContrib -> waveletFname = NULL;
       preCompContrib -> waveletFile = NULL;
       preCompContrib -> waveletMatrix = 
          preCompContrib -> tWaveletMatrix = NULL;
       preCompContrib -> compressedCoeffs = NULL;
       setcolor(preCompContrib -> mrgbeRange.min, 0, 0, 0);
       setcolor(preCompContrib -> mrgbeRange.max, 0, 0, 0);
       setcolor(preCompContrib -> mrgbeRange.norm, 0, 0, 0);
       preCompContrib -> mrgbeCoeffs = NULL;
       preCompContrib -> scDim = 
          preCompContrib -> nBins = 
          preCompContrib -> coeffDim = 
          preCompContrib -> nCoeffs = 
          preCompContrib -> nNonZeroCoeffs = 
          preCompContrib -> nCompressedCoeffs = 
          preCompContrib -> contribSize = 0;
       preCompContrib -> cache = NULL;
    }
 
 
 
    void preComputeContrib (PhotonMap *pmap)
    /* Precompute contributions for a random subset of (finalGather * 
       pmap -> numPhotons) photons, and return the per-modifier precomputed
       contribution photon maps in LUT preCompContribTab, discarding the
       original photons.  */
    {
       unsigned long        p, numPreComp;
       unsigned             i, scDim, nBins, 
                            coeffDim, nCoeffs, nCompressedCoeffs;
       PhotonIdx            pIdx;
       Photon               photon;
       RAY                  ray;
       MODCONT              *binnedContribs;
       COLR                 mrgbeRange32 [2]; 
       LUENT                *preCompContribNode;
       PhotonMap            *preCompContribPmap, nuPmap;
       PreComputedContrib   *preCompContrib;
       LUTAB                lutInit = LU_SINIT(NULL, freePreCompContribNode);
       WaveletMatrix2       waveletMatrix;
    #ifdef PMAP_CONTRIB_DBG
       RAY                  dbgRay;
    #endif
       
       if (verbose) {
          sprintf(errmsg, 
             "\nPrecomputing contributions for %ld photons\n", numPreComp
          );
          eputs(errmsg);
    #if NIX
          fflush(stderr);
    #endif
       }
 
       /* Init new parent photon map and set output filename */
       initPhotonMap(&nuPmap, pmap -> type);
       nuPmap.fileName = pmap -> fileName;
       
       /* Set contrib/coeff mode in new parent photon map */
       nuPmap.contribMode = pmap -> contribMode;
       
       /* Allocate and init LUT containing per-modifier child photon maps */
       if (!(nuPmap.preCompContribTab = malloc(sizeof(LUTAB))))
          error(SYSTEM, 
             "out of memory allocating LUT in preComputeContrib()"
          );
       memcpy(nuPmap.preCompContribTab, &lutInit, sizeof(LUTAB));
       
       /* Record start time, baby */
       repStartTime = time(NULL);
    #ifdef SIGCONT
       signal(SIGCONT, pmapPreCompReport);
    #endif
       repComplete = numPreComp = finalGather * pmap -> numPhotons;
       repProgress = 0;
    
       photonRay(NULL, &ray, PRIMARY, NULL);
       ray.ro = NULL;
       
       for (p = 0; p < numPreComp; p++) {
          /* Get random photon from stratified distribution in source heap 
           * to avoid duplicates and clustering */
          pIdx = firstPhoton(pmap) + (unsigned long)(
             (p + pmapRandom(pmap -> randState)) / finalGather
          );
          getPhoton(pmap, pIdx, &photon);
 
          /* Init dummy photon ray with intersection and normal at photon
             position. Set emitting light source index from origin. */
          VCOPY(ray.rop, photon.pos);
          ray.rsrc = photonSrcIdx(pmap, &photon);
          for (i = 0; i < 3; i++)
             ray.ron [i] = photon.norm [i] / 127.0;
 
    #ifndef PMAP_CONTRIB_DBG
          /* Get contributions at photon position; retry with another 
             photon if no contribs found */
          binnedContribs = getPhotonContrib(pmap, &ray, ray.rcol);
    #else
          /* Get all contribs from same photon for debugging */
          /* Position must differ otherwise too many identical photon keys
           * will cause ooC octree to overflow! */ 
          VCOPY(dbgRay.rop, photon.pos);
          getPhoton(pmap, 0, &photon);
          memcpy(&dbgRay, &ray, sizeof(RAY));
          dbgRay.rsrc = photonSrcIdx(pmap, &photon);
          for (i = 0; i < 3; i++)
             dbgRay.ron [i] = photon.norm [i] / 127.0;
          binnedContribs = getPhotonContrib(pmap, &dbgRay, dbgRay.rcol);
    #endif
 
          if (binnedContribs) {
    #if 0
             if (!(p & PMAP_CNTUPDATE))
                printf("Precomputing contribution photon %lu / %lu\n", 
                   p, numPreComp
                );
    #endif
             /* Found contributions at photon position, so generate
              * precomputed photon */
             if (!(preCompContribNode = lu_find(nuPmap.preCompContribTab, 
                   binnedContribs -> modname)
                )
             )
                error(SYSTEM, "out of memory allocating LUT entry in "
                   "preComputeContrib()"
                );
 
             if (!preCompContribNode -> key) {
                /* New LUT node for precomputed contribs for this modifier */
                preCompContribNode -> key = (char*)binnedContribs -> modname;
                preCompContribNode -> data = (char*)(
                   preCompContribPmap = malloc(sizeof(PhotonMap))
                );
 
                if (preCompContribPmap) {
                   /* Init new child photon map and its contributions */
                   initPhotonMap(preCompContribPmap, 
                      PMAP_TYPE_CONTRIB_CHILD
                   );
                   initPhotonHeap(preCompContribPmap);
                   initContribHeap(preCompContribPmap);
                   preCompContrib = (PreComputedContrib*)(
                      preCompContribPmap -> preCompContrib = 
                         malloc(sizeof(PreComputedContrib))
                   );
                   initPreComputedContrib(preCompContrib);
                }
                if (!preCompContribPmap || !preCompContrib)
                   error(SYSTEM, "out of memory allocating new photon map "
                      "in preComputeContrib()"
                   );
 
                /* Set output filename from parent photon map */
                preCompContribPmap -> fileName = nuPmap.fileName;
                
                /* Set contrib/coeff mode from parent photon map */
                preCompContribPmap -> contribMode = nuPmap.contribMode;
 
                /* Set Shirley-Chiu square & wavelet matrix dimensions
                   (number of bins resp. coefficients). */
                preCompContrib -> scDim = scDim = sqrt(
                   preCompContrib -> nBins = nBins = binnedContribs -> nbins
                );
                
                if (scDim * scDim != nBins)
                   /* Mkpmap shoulda took care of dis */
                   error(INTERNAL, "contribution bin count not "
                      "integer square in preComputeContrib()"
                   );
                   
                if (nBins > 1) {
                   /* BINNING ENABLED; set up wavelet xform & compression.
                      Get dimensions of wavelet coefficient matrix after
                      boundary padding, and number of nonzero coefficients. 
                      The number of compressed coefficients is fixed and
                      based on the number of nonzero coefficients (minus one
                      as the average coefficient is not thresholded), since
                      zero coeffs are already thresholded by default.  */
                   preCompContrib -> coeffDim = coeffDim = padWaveletXform2(
                      NULL, NULL, scDim, &preCompContrib -> nNonZeroCoeffs 
                   );
                   preCompContrib -> nCoeffs = nCoeffs = coeffDim * coeffDim;
                   nCompressedCoeffs = (1 - compressRatio) * 
                      (preCompContrib -> nNonZeroCoeffs - 1);
                   if (nCompressedCoeffs < 1) {
                      error(WARNING, 
                         "maximum contribution compression, clamping number "
                         "of wavelet coefficients in preComputeContrib()"
                      );
                      nCompressedCoeffs = 1;
                   }
                   if (nCompressedCoeffs >= preCompContrib -> nCoeffs) {
                      error(WARNING, 
                         "minimum contribution compression, clamping number "
                         "of wavelet coefficients in preComputeContrib()"
                      );
                      nCompressedCoeffs = nCoeffs - 1;
                   } 
                   preCompContrib -> nCompressedCoeffs = nCompressedCoeffs;
 
                   /* Lazily allocate primary and transposed wavelet
                    * coefficient matrix */
                   preCompContrib -> waveletMatrix = waveletMatrix =
                      allocWaveletMatrix2(coeffDim);
                   preCompContrib -> tWaveletMatrix = allocWaveletMatrix2(
                      coeffDim
                   );
                   if (!waveletMatrix || !preCompContrib -> tWaveletMatrix)
                      error(SYSTEM, "out of memory allocating wavelet "
                         "coefficient matrix in preComputeContrib()"
                      );
                   
                   /* Lazily allocate thresholded detail coefficients (minus
                      the average coeff) */
                   preCompContrib -> compressedCoeffs = calloc(
                      nCoeffs - 1, sizeof(PreComputedContribCoeff)
                   );
                   
                   /* Lazily alloc mRGBE-encoded compressed wavelet coeffs */
                   preCompContrib -> mrgbeCoeffs = calloc(
                      nCompressedCoeffs, sizeof(mRGBE)
                   );
                   if (!preCompContrib -> compressedCoeffs || 
                      !preCompContrib -> mrgbeCoeffs
                   )
                      error(SYSTEM, "out of memory allocating compressed "
                         "contribution buffer in preComputeContrib()"
                      );
                   
                   /* Set size of compressed contributions, in bytes */
                   preCompContrib -> contribSize = PMAP_CONTRIB_ENCSIZE(
                      nCompressedCoeffs
                   );
                }
             }
             else {
                /* LUT node already exists for this modifier; use its data */
                preCompContribPmap = (PhotonMap*)preCompContribNode -> data;
                preCompContrib = (PreComputedContrib*)(
                   preCompContribPmap -> preCompContrib
                );
                scDim             = preCompContrib -> scDim;
                nBins             = preCompContrib -> nBins;
                nCoeffs           = preCompContrib -> nCoeffs;
                nCompressedCoeffs = preCompContrib -> nCompressedCoeffs;
                waveletMatrix     = preCompContrib -> waveletMatrix;
             }
             
             if (nBins > 1) {
                /* Binning enabled; copy binned contribs to wavelet xform
                 * input matrix, then compress & encode */
                for (i = 0; i < scDim; i++)
                   memcpy(waveletMatrix [i], 
                      binnedContribs->cbin + PMAP_CONTRIB_XY2LIN(scDim, i, 0),
                      scDim * WAVELET_COEFFSIZE
                   );
 
                if (encodeContribs(preCompContrib, compressRatio))
                   error(INTERNAL, "failed contribution "
                      "compression/encoding in preComputeContrib()"
                   );
 
                /* Dump encoded wavelet coeffs to contrib heap file,
                   prepended by mRGBE range in 32-bit RGBE.  NOTE: minimum
                   and maximum are reversed in the file to facilitate
                   handling the special (but pointless) case of a single
                   compressed coeff (see below) */
                setcolr(mrgbeRange32 [0], 
                   preCompContrib -> mrgbeRange.max [0],
                   preCompContrib -> mrgbeRange.max [1],
                   preCompContrib -> mrgbeRange.max [2]
                );
                setcolr(mrgbeRange32 [1], 
                   preCompContrib -> mrgbeRange.min [0],
                   preCompContrib -> mrgbeRange.min [1], 
                   preCompContrib -> mrgbeRange.min [2]
                );
                
                /* Dump only mRGBE maximum if 1 compressed coeff (maximum
                 * compression, pointless as it may be!), since minimum is
                 * implicitly zero and can be omitted to save space */
                putbinary(mrgbeRange32, sizeof(COLR), 
                   1 + (nCompressedCoeffs > 1), 
                   preCompContribPmap -> contribHeap
                );
                
                if (putbinary(preCompContrib -> mrgbeCoeffs, sizeof(mRGBE),
                      nCompressedCoeffs, preCompContribPmap -> contribHeap
                   ) != nCompressedCoeffs
                )
                   error(SYSTEM, "failed writing to contribution heap "
                      "in preComputeContrib()"
                   );
             }
             
             /* Set photon flux to coarsest average wavelet coefficient
                waveletMatrix [0] [0].  IF BINNING IS DISABLED, this is the
                total contribution from all directions */
             copycolor(ray.rcol, waveletMatrix [0] [0]);
 
             /* HACK: signal newPhoton() to set precomputed photon's 
                contribution source from ray -> rsrc */
             /* XXX: DO WE STILL NEED THIS CRAP AFTER PRECOMP, 
                SINCE CHILD PMAPS ARE PER-MODIFIER ANYWAY? */
             preCompContribPmap -> lastContribSrc.srcIdx = -2;
             
             /* Append photon to new heap from ray and increment total 
                count for all modifiers in parent photon map */
             newPhoton(preCompContribPmap, &ray);
             nuPmap.numPhotons++;
          }
 
          /* Update progress */
          repProgress++;
 
          if (photonRepTime > 0 && time(NULL) >= repLastTime + photonRepTime)
             pmapPreCompReport();
    #ifdef SIGCONT
          else signal(SIGCONT, pmapPreCompReport);
    #endif
       }
 
       /* Trash original pmap and its leaf file, and replace with new parent,
          which now acts as container for the per-modifier child pmaps */
       unlink(pmap -> store.leafFname);
       deletePhotons(pmap);
       memcpy(pmap, &nuPmap, sizeof(PhotonMap));
 
    #ifdef SIGCONT
       signal(SIGCONT, SIG_DFL);
    #endif
 
       /* Bail out if no photons could be precomputed */
       if (!pmap -> numPhotons)
          error(USER, 
             "no contribution photons precomputed; try increasing -am"
          );
          
       /* Build per-modifier precomputed photon maps from their contribution
          heaps */
       lu_doall(pmap -> preCompContribTab, buildPreCompContribPmap, NULL);
    }
 
 
 
 #else
    /* -------------------------------------------------------------------
       U N I T   T E S T S
       ------------------------------------------------------------------- */
 
    #include <stdio.h>
    #include <random.h>
    #include "func.h"
    
    int main (int argc, char *argv [])
    {
       unsigned       i, scdim, nsamp, numTheta = 0, numPhi = 0;
       double         t, p;
       RAY            ray;
       int            bin;
 
       if (argc < 3) {
          fprintf(stderr, "%s <scdim> <nsamp> [<var>=<value>; ..]\n", 
             argv [0]
          );
          fprintf(stderr, "Default variable defs: %s\n", 
             PMAP_CONTRIB_SCDEFAULTS
          );
          fputs("\nMissing Shirley-Chiu dimension scDim>1, "
             "number of samples nsamp\n",
             stderr
          );
          return -1;
       }
       
       if (!(scdim = atoi(argv [1]))) {
          fputs("Invalid Shirley-Chiu dimension\n", stderr);
          return -1;
       }
       
       if (!(nsamp = atoi(argv [2]))) {
          fputs("Invalid num samples\n", stderr);
          return -1;
       }
       else {
          numTheta = (int)(sqrt(nsamp) / 2);
          numPhi = 4* numTheta;
       }
       
       /* (Doan' need to) Init cal func routines for binning */
    #if 0
       initfunc();
       setcontext(RCCONTEXT);
    #endif
    
       /* Compile default orientation variables for contribution binning */
       scompile(PMAP_CONTRIB_SCDEFAULTS, NULL, 0);
       /* Compile custom orientation variabls from command line */
       for (i = 3; i < argc; i++)
          scompile(argv [i], NULL, 0);
       
       for (i = 0; i < nsamp; i++) {
          #if 0
             /* Random */
             t = 0.5 * PI * drand48();
             p = 2 * PI * drand48();
          #else
             /* Stratified */
             t = 0.5 * PI * ((i % numTheta) + drand48()) / numTheta;
             p = 2.0 * PI * ((i / numTheta) + drand48()) / numPhi;
          #endif
          ray.rdir [0] = sin(t) * cos(p);
          ray.rdir [1] = sin(t) * sin(p);
          ray.rdir [2] = cos(t);
          bin = ray2bin(&ray, scdim);
          
          printf("%.3f\t%.3f\t%.3f\t-->\t%d\n", 
             ray.rdir [0], ray.rdir [1], ray.rdir [2], bin
          );
       }
       
       return 0;
    }
 #endif
 
 #endif /* PMAP_CONTRIB */
diff --git a/wavelet2.h b/wavelet2.h
index deabc5e..d10deb7 100644
--- a/wavelet2.h
+++ b/wavelet2.h
@@ -1,243 +1,245 @@
 /* 
    =========================================================================
    Definitions for 2D wavelet transforms on arrays of 3-tuples.
 
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 
 #ifndef _WAVELET2_H
    #define _WAVELET2_H
 
    /* Wavelet coefficient type; defaults to double if not already defined.
      (may be overridden in compiler command line, for example).
       NOTE: single precision float may improve cache performance during 
       2D transform with large arrays. */
    #ifndef WAVELET_COEFF
       #define WAVELET_COEFF   double
    #endif
 
    /* Perform final Haar wavelet transform on coarsest two detail levels.
     * This has the advantage of resulting in a single coarsest level
     * approximation coefficient, possibly at the expense of poorer
     * compression */
-   #define WAVELET_FINAL_HAAR
+   /* #define WAVELET_FINAL_HAAR */
 
    /* Boundary extension modes for padded Daubechies D4 wavelet transform */
    #define WAVELET_EXTEND_CONST     0     /* Constant (default if undef) */
    #define WAVELET_EXTEND_GRAD1     1     /* 1st order gradient */
    #define WAVELET_EXTEND_GRAD2     2     /* 2nd order gradient */
    #define WAVELET_EXTEND_CYCL      3     /* Periodic (wraparound) */
    #define WAVELET_EXTEND_SYMM      4     /* Symmetric (reflection) */
 
    #ifndef WAVELET_EXTEND
-      #define WAVELET_EXTEND_MODE   WAVELET_EXTEND_GRAD1
+      #define WAVELET_EXTEND_MODE   WAVELET_EXTEND_SYMM
    #endif
 
    
    /* Convenience macros for handling coefficients */
    #define min(a, b)          ((a) < (b) ? (a) : (b))
    #define max(a, b)          ((a) > (b) ? (a) : (b))
    #define coeffAvg(a)        (((a) [0] + (a) [1] + (a) [2]) / 3)
    
    #define coeffIsEmpty(c) ( \
       isnan((c) [0]) || isnan((c) [1]) || isnan((c) [2]) \
    )
    #define coeffIsZero(c)  ((c) [0] == 0 && (c) [1] == 0 && (c) [2] == 0)
    
    /* Determine if coeff tuple c is thresholded, also including empty coeffs
     * in debugging mode, since these will be unconditionally dropped. Note
-    * that the coarsest approximation coefficients are not thresholded,
-    * since they are essential for reconstruction! */
-   #ifdef WAVELET_FINAL_HAAR
+    * that the 3x3 coarsest approximation coefficients in the upper left of
+    * matrix c are not thresholded, since they are essential for a
+    * reasonably accurate reconstruction! */
+   /* #ifdef WAVELET_FINAL_HAAR */
+   #if 0
       #define coeffThresh(c, i, j, t)  ( \
          fabs(t) > 0 && ((i) || (j)) && ( \
             coeffIsEmpty((c) [i][j]) || fabs((c) [i][j][0]) < (t) || \
             fabs((c) [i][j][1]) < (t) || fabs((c) [i][j][2]) < (t) \
          ) \
       )
    #else
       #define coeffThresh(c, i, j, t)  ( \
          fabs(t) > 0 && ((i) > 2 || (j) > 2) && ( \
             coeffIsEmpty((c) [i][j]) || fabs((c) [i][j][0]) < (t) || \
             fabs((c) [i][j][1]) < (t) || fabs((c) [i][j][2]) < (t) \
          ) \
       )
    #endif
    
 
 
    /* Haar / Daubechies D4 coefficients */
    #define SQRT2        1.41421356
    #define SQRT3        1.73205081
    #define H4NORM       (0.25 / SQRT2)
 
    /* Haar wavelet coeffs */
    extern const WAVELET_COEFF h2;
 
    /* Daubechies D4 wavelet coeffs */   
    extern const WAVELET_COEFF h4 [4];
    extern const WAVELET_COEFF g4 [4];
 
 
 
    /* Wavelet matrix defs; these are stored as arrays of pointers (a.k.a.
     * Iliffe vectors), as this proved to be more performant than a flattened
     * representation. Encapsulating the coefficient's triplets in a struct
     * prevents the compiler from introducing another layer of indirection.
     * */
    typedef WAVELET_COEFF      WaveletCoeff3 [3];
    
    #define WAVELET_COEFFSIZE  (sizeof(WaveletCoeff3))
    typedef WaveletCoeff3      **WaveletMatrix2;
 
    
    
    /* 
       ----------- 2D COEFFICIENT MATRIX ALLOC/INIT ROUTINES -----------
    */
 
    WaveletMatrix2 allocWaveletMatrix2 (unsigned len);
    /* Allocate and return a 2D coefficient array of size (len x len).
       Returns NULL if allocation failed. */
 
    
    void freeWaveletMatrix2 (WaveletMatrix2 y, unsigned len);
    /* Free previously allocated 2D coeff array y of size (len x len) */
 
 
    void zeroCoeffs2 (WaveletMatrix2 y, unsigned len);
    /* Set 2D array coefficients to zero */
 
 
    /* 
       ---------- 2D WAVELET TRANSFORM OPTIMISED FOR SIZES 2^l ----------
       ------ THESE USE PERIODIC (WRAP-AROUND) BOUNDARY EXTENSION -------
    */
    
    int waveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l);
    /*
       Perform full 2D multiresolution forward wavelet transform on array y
       of size (2^l) x (2^l) containing original signal as 3-tuples, where 
       l >= 1.  Note no intra-tuple transform occurs.
       
       The wavelet coefficients are returned in array y, containing the
       coarsest approximation in y [0][0] followed by horizontal/vertical
       details in order of increasing resolution/frequency.
       
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  It is recommended to preallocate yt to the
       maximum expected size.  The dimensions of yt are not checked; this is
       the caller's responsibility.
          
       Returns 0 on success, else -1.
    */
    
 
    int waveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, unsigned l);
    /* 
       Perform full 2D multiresolution inverse wavelet transform on array y
       of size (2^l) x (2^l) containing wavelet coefficients as 3-tuples,
       where l >= 1.  Note no intra-tuple transform occurs.
 
       A preallocated array yt of identical dimensions to y can be supplied
       as buffer for intermediate results.  If yt == NULL, a buffer is
       automatically allocated and freed on demand, but this is inefficient
       for frequent calls.  It is recommended to preallocate yt to the
       maximum expected size.  The dimensions of yt are not checked; this is
       the caller's responsibility.
       
       The reconstructed signal is returned in array y.  Returns 0 on
       success, else -1.
    */
 
 
 
    /* 
       ------------ 2D WAVELET TRANSFORM FOR ARBITRARY SIZES -------------
       ---- THESE USE CONSTANT BOUNDARY EXTENSION WITH PADDING COEFFS-----
    */
    
    unsigned padWaveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
       unsigned yLen0, unsigned *numNonZero
    );
    /* Perform full 2D multiresolution forward wavelet transform with padded
       boundary coefficients on array y containing (yLen0 x yLen0) samples of
       3-tuples.  Note no intra-tuple transform occurs.  An additional array
       yt of identical dimension to y is required as buffer for intermediate
       results.
       
       The wavelet coefficients are returned in array y, containing the
       coarsest approximation coefficients in y [0][0] followed by
       horizontal/vertical detail coefficients in order of increasing
       resolution/frequency.
 
       NOTE: Both y and yt must be large enough to accommodate the additional
       padding coefficients introduced at the array boundaries during the
       transform.  It is the caller's responsibility to preallocate and
       dimension these arrays accordingly with allocWaveletMatrix2().  The
       dimension of the output array (which generally exceeds the input
       dimension yLen0) must be interrogated beforehand by calling the
       function with y or yt set to NULL.
       
       The returned value is the output dimension, or 0 if the transform
       failed.  In addition, the number of nonzero coefficients is returned
       in numNonZero (if not NULL), which is a subset of the number of output
       coefficients, i.e.  (output dimension)^2.
    */
    
    
    unsigned padWaveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt,
       unsigned yLen0, unsigned yLenOrg
    );
    /* Perform full 2D multiresolution inverse wavelet transform on array y
       containing (yLen0 x yLen0) boundary padded wavelet coefficients as
       3-tuples.  The original number of samples (yLenOrg x yLenOrg) is
       required for the reconstruction.  An additional array yt of identical
       dimension to y is required as buffer for intermediate results.
     
       The reconstructed samples are returned in y.  This is smaller than
       (yLen0 x yLen0) since the boundary coefficients from the forward
       transform are collapsed.
     
       The returned value is the reconstructed number of samples per
       dimension (yLenOrg), or 0 if the inverse transform failed.
     */ 
    
   
    
    #ifdef WAVELET_DBG
       /* 
          ----------------- SUPPORT ROUTINES FOR DEBUGGING -----------------
       */
       
       void clearCoeffs2 (WaveletMatrix2 y, unsigned len);
       /* Clear 2D array to facilitate debugging */
       
       
       void dumpCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2,
          unsigned y1Len, unsigned y2Len, float thresh 
       );
       /* 
          Dump 2D arrays y1 and y2 of dims y1Len resp.  y2Len side-by-side to
          stdout (skip y2 if NULL).  Coefficients with absolute magnitude
          less than thresh are marked with brackets ('[]') as thresholded.
       */
       
       
       float rmseCoeffs2 (const WaveletMatrix2 y1, const WaveletMatrix2 y2, 
          unsigned len
       );
       /* Calculate RMSE between 2D matrices y1 and y2 */
    #endif
 #endif
 
diff --git a/wavelet3.c b/wavelet3.c
index 0cfec25..363750e 100644
--- a/wavelet3.c
+++ b/wavelet3.c
@@ -1,1078 +1,1083 @@
 /* 
    =========================================================================
    2-dimensional Daubechies D4 and Haar wavelet transforms on arbitrary
    sized arrays of 3-tuples.  Array boundaries are extended as defined by
    WAVELET_EXTEND_MODE, which introduces padding coefficients.
 
    Compile with -DWAVELET_TEST_2DPAD to build standalone unit tests.
    
    The implementation here is mainly distilled from the following sources:
    -- Filip Wasilewski's pywavelets:
       https://github.com/PyWavelets/pywt/tree/master/pywt/_extensions/c
       [D4 with boundary condition handling via padding, arbitrary sizes]
    -- Ingrid Daubechies and Wim Sweldens, "Factoring Wavelet Tranforms into 
       Lifting Steps", section 7.5:
       https://services.math.duke.edu/~ingrid/publications/J_Four_Anal_Appl_4_p247.pdf
       [Lifting scheme on D4]
    -- Ian Kaplan's webpages on wavelets and signal processing::
       http://bearcave.com/misl/misl_tech/wavelets/daubechies/index.html
       http://bearcave.com/misl/misl_tech/wavelets/lifting/basiclift.html
       [Lifting scheme on D4]
    -- Stefan Moebius' TurboWavelets.Net:
       https://github.com/codeprof/TurboWavelets.Net/tree/master/TurboWavelets
       [Boundary condition handling with arbitrary sizes, backreferencing]
    -- "Numerical Recipes in C", chapter 13.10:
       http://cacs.usc.edu/education/cs653/NRC-c13.10-Wavelets.pdf
       [Linear algebra (matrix mult.) implementation of D4]
    
    Roland Schregle (roland.schregle@{hslu.ch, gmail.com})
    (c) Lucerne University of Applied Sciences and Arts,
        supported by the Swiss National Science Foundation 
        (SNSF #179067, "Light Fields for Spatio-Temporal Glare Assessment")
    =========================================================================
    
    $Id$
 */
 
 
 #include "wavelet2.h"
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 
 
 
 static unsigned padHaarStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned len
 )
 /* 
    Single iteration of forward 2D Haar wavelet transform with padding on
    array y of arbitrary size (len x len) containing 3-tuples.  This variant
    uses constant boundary extension.  The transform is performed over y's
    2nd axis (i.e.  horizontally, assuming row-major addressing as per C
    convention).  Note the triplets per array element are _not_ decorrelated,
    but transformed independently.
    
    The wavelet coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the generated
    coefficients vertically, and prepares the array for another transform
    over its 2nd axis in a subsequent call (this time as input y).  The
    result of a subsequent transform restores yt to y's original orientation,
    in which case both horizontal and vertical axes have been decorrelated.
    
    Returns array dimension len on success, else 0.
 */
 {
    static unsigned   axis = 0;
    const unsigned    hLen = -(-(signed)len >> 1);
    int               h, i, j, k;
 
    if (len < 2 || !y || !yt)
       /* Input shorter than wavelet support, or no input/output */
       return 0;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif      
 
    /* NOTE: yt is transposed on the fly such that the next function call
     * transforms over the alternate axis. This is done by simply swapping
     * the indices during assignment */    
    for (i = 0; i < len; i++) {
       for (j = 0; j <= len - 2; j += 2) {
          h = j >> 1;
          
          for (k = 0; k < 3; k++) {
             /* Smooth/approx/avg/lowpass */
             yt [h       ] [i] [k] = h2 * (
                y [i] [j    ] [k] + 
                y [i] [j + 1] [k]
             );
             
             /* Detail/diff/highpass */
             yt [h + hLen] [i] [k] = h2 * (
                y [i] [j    ] [k] - 
                y [i] [j + 1] [k]
             );
          }
       }
       
       if (len & 1) {
          /* Odd length; pad with last value as additional approx term */
          for (k = 0; k < 3; k++)
             /* Smooth/approx/avg/lowpass */
             yt [hLen - 1] [i] [k] = y [i] [len - 1] [k];
       }
    }
 
 #ifdef WAVELET_DBG
    printf("%s FWD HAAR (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
    dumpCoeffs2(y, yt, len, len, 0);
    putchar('\n');
 #endif
 
    axis ^= 1;
    return len;
 }
 
 
 
 static unsigned padHaarInvStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned len
 )
 /* 
    Single iteration of inverse 2D Haar wavelet transform with padding on
    array y of arbitrary size (len x len) containing 3-tuples.  This reverses
    the forward transform above.  The transform is inverted over y's 2nd axis
    (i.e.  horizontally, assuming row-major addressing as per C convention).
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt, of identical dimensions to y.  The transpose arranges the inverted
    coefficients vertically, and prepares the array for another inverse
    transform over its 2nd axis in a subsequent call (this time as input y). 
    The result of a subsequent inverse transform restores yt to y's original
    orientation, in which case both horizontal and vertical axes have been
    inversely transformed.
    
    Returns array dimension len on success, else 0.
 */
 {
    static unsigned   axis = 1;
    const unsigned    hLen = -(-(signed)len >> 1);
    int               h, i, j, k;
 
    if (len < 2 || !y || !yt)
       /* Too few coeffs for reconstruction, or no input/output */
       return 0;
 
 #ifdef WAVELET_DBG
    clearCoeffs2(yt, len);
 #endif   
 
    /* NOTE: i, j are swapped relative to the forward transform, as axis
     * order is now reversed. */
    /* NOTE: yt is transposed on the fly such that the next function call
     * inverts over the alternate axis. This is done by simply swapping
     * the indices during assignment */    
    for (i = 0; i < len; i++) {
       for (j = 0; j <= len - 2; j += 2) {      
          h = j >> 1;
       
          for (k = 0; k < 3; k++) {
             yt [i] [j    ] [k] = h2 * (
                y [h       ] [i] [k] +  /* Approx */
                y [h + hLen] [i] [k]    /* Detail */
             );
             
             yt [i] [j + 1] [k] = h2 * (
                y [h       ] [i] [k] -  /* Approx */
                y [h + hLen] [i] [k]    /* Detail */
             );
          }
       }
 
       if (len & 1) {
          /* Odd length; pad additional value from last approximation term */
          for (k = 0; k < 3; k++)
             yt [i] [len - 1] [k] = y [hLen - 1] [i] [k];
       }   
    }
    
 #ifdef WAVELET_DBG
    printf("%s INV HAAR (%d x %d)\n", axis ? "VERT" : "HORIZ", len, len);
    dumpCoeffs2(y, yt, len, len, 0);
    putchar('\n');
 #endif
 
    axis ^= 1;
    return len;
 }
 
 
 
 static unsigned padD4Step2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned yLen, unsigned ytOffset
 )
 /* 
    Single iteration of forward 2D Daubechies D4 wavelet transform with
    boundary padding on array y containing (yLen x yLen) 3-tuples.  This
    variant extends the input array beyond its boundary as defined by
    WAVELET_EXTEND_MODE, which results in padding coefficients.  The
    transform is performed over y's 2nd axis (i.e.  horizontally, assuming
    row-major addressing as per C convention).  Note the triplets per array
    element are _not_ decorrelated, but transformed independently.
    
    The output wavelet coefficients are returned in the *TRANSPOSED* array
    yt.  The transpose arranges the generated coefficients vertically,
    and prepares the array for another transform over its 2nd axis in a
    subsequent call (this time as input y).  The result of a subsequent
    transform restores yt to y's original orientation, in which case both
    horizontal and vertical axes have been decorrelated.
    
    Both the input array y and output array yt must be large enough to also
    accommodate the additional padding coefficients introduced at the
    boundaries.  The number of output coefficient pairs ytLen returned in yt
    is obtained as return value, and can be interrogated beforehand without
    performing a transform by passing either y or yt as NULL.
    
    The approximation coefficients are returned in yt [0..ytLen-1] [i], while
    the details are returned in yt [ytOffset..ytOffset+ytLen-1] [i].  The
    offset ytOffset must be supplied by the caller to accommodate the detail
    coefficients (plus padding!) without clobbering the approximation
    coefficients (again, plus padding!).  Consequently, there is a gap
    between the approximation coefficients in the upper left of the matrix,
    and the detail coefficients in the lower right.  This gap shrinks and
    shifts toward the upper left with each iteration.
    
    The following schema indicates the matrix contents during the transform
    (see the unit test output when compiled with WAVELET_TEST_2DPAD):
 
       ----> j
      +----------+--------------+                 +-----------+-------------+
    | |   y_ij   |              |                 |  S(y_ji)  |             |
    | +----------+              |  (H_k xform)^T  +-----------+             |
    v |                         |   ---------->   |                         |
    i |                         |                 +-----------+             |
      |                         |                 |  D(y_ji)  |             |
      +-------------------------+                 +-------------------------+
 
 
       ----> i
      +----------+--------------+                 +----------+---+----------+
    | | S(y_ji)  |              |                 |S(S(y_ij))|   |S(D(y_ij))|
    | +----------+              |  (V_k xform)^T  +----------+   +----------+
    v |                         |   ---------->   |                         |
    j +----------+              |                 +----------+   +----------+
      | D(y_ji)  |              |                 |D(S(y_ij))|   |D(D(y_ij))|
      +-------------------------+                 +----------+---+----------+
    
-               ...                     ...                  ...
-               ...                     ...                  ...
+                 ...                    ...                  ...
+                 ...                    ...                  ...
                
                      +------+------+--------+--------+----------------+
                      |SS_n-1|SD_n-1|        |        |                |
                      |------+------+ SD_n-2 |        |                |
                      +DS_n-1|DD_n-1+--------+ . . .  |      SD_0      |
                      +------+----+-+--------+        |                |
                      |  DS_n-2   | | DD_n-2 |        +----------------+
     (V_n-1 xform)^T  +-----------+ +--------+ .           .     .     |
       ---------->    |    .      .               .        .     .     |
                      |    .      .                  .     .     .     |
                      +----------------+              +----------------+    
                      |                |              |                |
                      |     DS_0       | . . . . . .  |      DD_0      |
                      |                |              |                |
                      |                |              |                |
                      +----------------+--------------+----------------+
                                                  
     where i, j are the array indices of the outer resp.  inner loop, S(y_ij)
     D(y_ij) are approximation resp.  details coefficients of input y_ij
     (itself either the original signal or approximation/detail coefficients
     from previous iterations), and H_k, V_k are the horizontal resp.
     vertical transforms of the k-th (out of the maximum n) iteration.
 */
 { 
    static unsigned   axis = 0;
    unsigned          h, i, j, k, l, i0 = 0, iN, wIdx, dExt;
    /* Number of output coeff pairs ytLen, incl. boundary padding, and
       max dimension of entire array */
    const unsigned    ytLen = (yLen + 3) >> 1, ytTotalLen = ytOffset + ytLen;
    WAVELET_COEFF     yExt;
 #if (WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD1 || \
     WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD2 \
 )   
    /* Gradient boundary extension stuff */
    WAVELET_COEFF     dy1_0, dy1_1, dy2_0;
 #endif
 
    if (y && yt && yLen >= 2) {
 #ifdef WAVELET_DBG
       /* Clear output array yt to facilitate debugging */
       clearCoeffs2(yt, ytTotalLen);
 #else
       /* Zero output (sub)array yt; note this will not affect accumulated
        * coefficients from previous iterations, as they lie outside the
        * array bounds */
       zeroCoeffs2(yt, ytTotalLen);
 #endif
       /* FWD TRANSFORM GOTCHAS:
        *
        * Each full iteration of the forward transform consists of a
        * horizontal step followed by a vertical one.  yt is transposed on
        * the fly such that the next function call transforms over the
        * alternate axis. This is done by simply swapping the indices during
        * assignment.
        *
        * In the initial horizontal transform, the input array only contains
        * a set of approximation coefficients (or the original signal) at 
        * y [0..yLen-1] [0..yLen-1].       
        *
        * In the subsequent vertical transform, the input array then contains
        * two sets of coefficients from the preceding horizontal transform:
        * approximation coeffs y [0..ytLen-1] [0..ytLen-1] and detail coeffs
        * y [ytOffset..ytTotalLen-1] [ytOffset..ytTotalLen-1].  In this case,
        * the loop below is iterated twice with an offset yOffset to select
        * both sets of coefficients to transform.  Note the detail coeffs are
        * selected first to facilitate the looping logic; if i0==0 at the end
        * of the loop (i.e.  approx coeffs were selected), it terminates. 
        */
       do {
          i0 ^= ytOffset * axis;
          iN = i0 ? ytTotalLen : yLen;
          
          for (i = i0; i < iN; i++) {
             /* First iteration (j = -2); pad coefficients at left edge and
              * extend input beyond boundary according to extension mode */
             for (k = 0; k < 3; k++) {
 #if (WAVELET_EXTEND_MODE == WAVELET_EXTEND_CONST || \
    !defined(WAVELET_EXTEND_MODE) \
 )
                /* Constant boundary extension (=default) */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   (h4 [0] + h4 [1] + h4 [2]) * y [i] [0] [k] +
                   (h4 [3]                  ) * y [i] [1] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   (g4 [0] + g4 [1] + g4 [2]) * y [i] [0] [k] +
                   (g4 [3]                  ) * y [i] [1] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD1
                /* 1st order gradient boundary extension */
                dy1_0 = y [i] [0] [k] - y [i] [1] [k];
 
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   h4 [0]   * (y [i] [0] [k] + 2 * dy1_0) +
                   h4 [1]   * (y [i] [0] [k] +     dy1_0) +
                   h4 [2]   * (y [i] [0] [k]            ) +
                   h4 [3]   * (y [i] [1] [k]            );
 
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   g4 [0]   * (y [i] [0] [k] + 2 * dy1_0) +
                   g4 [1]   * (y [i] [0] [k] +     dy1_0) +
                   g4 [2]   * (y [i] [0] [k]            ) +
                   g4 [3]   * (y [i] [1] [k]            );
                   
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD2
                /* 2nd order gradient boundary extension */
                dy1_0 = y [i] [0] [k] - y [i] [1] [k];
                dy1_1 = y [i] [1] [k] - y [i] [2] [k];
                dy2_0 = dy1_0 - dy1_1;
                
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
 /*                h4 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + 1 * dy2_0)) +
                   h4 [1]   * (y [i] [0] [k] + 1 * (dy1_0 + 0 * dy2_0)) + 
                   h4 [2]   * (y [i] [0] [k] + 0 * (dy1_0            )) +
                   h4 [3]   * (y [i] [1] [k]                          ); */
                   h4 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + dy2_0)) +
                   h4 [1]   * (y [i] [0] [k] +      dy1_0         ) +
                   h4 [2]   * (y [i] [0] [k]                      ) +
                   h4 [3]   * (y [i] [1] [k]                      );
 
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
 /*                g4 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + 1 * dy2_0)) +
                   g4 [1]   * (y [i] [0] [k] + 1 * (dy1_0 + 0 * dy2_0)) + 
                   g4 [2]   * (y [i] [0] [k] + 0 * (dy1_0            )) + 
                   g4 [3]   * (y [i] [1] [k]                          ); */
                   g4 [0]   * (y [i] [0] [k] + 2 * (dy1_0 + dy2_0)) +
                   g4 [1]   * (y [i] [0] [k] +      dy1_0         ) +
                   g4 [2]   * (y [i] [0] [k]                      ) +
                   g4 [3]   * (y [i] [1] [k]                      );
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_CYCL
                /* Cyclic (perdiodic / wraparound) boundary extension */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
                   h4 [0] * y [i] [yLen - 2] [k] + 
                   h4 [1] * y [i] [yLen - 1] [k] + 
                   h4 [2] * y [i] [0       ] [k] +
                   h4 [3] * y [i] [1       ] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
                   g4 [0] * y [i] [yLen - 2] [k] + 
                   g4 [1] * y [i] [yLen - 1] [k] + 
                   g4 [0] * y [i] [0       ] [k] +
                   g4 [3] * y [i] [1       ] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_SYMM
                /* Symmetric (reflected) boundary extension */
                /* Smooth/approx/avg/lowpass */
                yt [0       ] [i] [k] = 
-                  h4 [0] * y [i] [2] [k] + 
-                  h4 [1] * y [i] [1] [k] + 
+                  h4 [0] * y [i] [1] [k] + 
+                  h4 [1] * y [i] [0] [k] + 
                   h4 [2] * y [i] [0] [k] +
                   h4 [3] * y [i] [1] [k];
                   
                /* Detail/diff/highpass */
                yt [ytOffset] [i] [k] = 
-                  g4 [0] * y [i] [2] [k] + 
-                  g4 [1] * y [i] [1] [k] + 
+                  g4 [0] * y [i] [1] [k] + 
+                  g4 [1] * y [i] [0] [k] + 
                   g4 [2] * y [i] [0] [k] +
                   g4 [3] * y [i] [1] [k];
 #endif
             } 
          
             /* Intermediate and last iterations (j >= 0) */
             for (j = 0; j < yLen; j += 2) {
                h = (j >> 1) + 1;
             
                for (k = 0; k < 3; k++) {
 #ifdef WAVELET_DBG
                   /* Only necessary in debugging mode, since otherwise
                      already cleared by zeroCoeffs2() */
                   yt [h] [i] [k] = yt [h + ytOffset] [i] [k] = 0;
 #endif
                   /* Convolve up to boundary of wavelet or input signal,
                    * whichever comes first */
                   for (l = j; l < min(j + 4, yLen); l++) {
                      /* Smooth/approx/avg/lowpass */
                      yt [h           ] [i] [k] += h4 [l - j] * y [i] [l] [k];
                      
                      /* Detail/diff/highpass */
                      yt [h + ytOffset] [i] [k] += g4 [l - j] * y [i] [l] [k];
                   }
 
                   /* If necessary, convolve beyond input signal boundary
                    * with extension. This is skipped if the wavelet boundary
                    * was already reached */
                   for (; l < j + 4; l++) {
                      wIdx = l - j;
                      
 #if (WAVELET_EXTEND_MODE == WAVELET_EXTEND_CONST || \
    !defined(WAVELET_EXTEND_MODE) \
 )
                      /* Constant boundary extension (=default) */
                      yExt = y [i] [yLen - 1] [k];
                         
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD1
                      /* 1st order gradient boundary extension */
                      dy1_0 = y [i] [yLen - 1] [k] - y [i] [yLen - 2] [k];
                      dExt  = l - yLen + 1;
                      yExt  = y [i] [yLen - 1] [k] + dExt * dy1_0;
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_GRAD2
                      /* 2nd order gradient boundary extension */
                      dy1_0 = y [i] [yLen - 1] [k] - y [i] [yLen - 2] [k];
                      dy1_1 = y [i] [yLen - 2] [k] - y [i] [yLen - 3] [k];
                      dy2_0 = dy1_0 - dy1_1;
                      dExt  = l - yLen + 1;
                      yExt  = y [i] [yLen - 1] [k] + 
                         dExt * (dy1_0 + (dExt ? dExt - 1 : 0) * dy2_0);
                      
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_CYCL
                      /* Cyclic (perdiodic/wraparound) boundary extension */
                      yExt = y [i] [l - yLen] [k];
 
 #elif WAVELET_EXTEND_MODE == WAVELET_EXTEND_SYMM
                      /* Symmetric (reflected) boundary extension */
-                     yExt = y [i] [((yLen - 1) << 1) - l] [k];
+                     yExt = y [i] [((yLen - 1) << 1) - l + 1] [k];
                      /* = (yLen - 1) - (l - (yLen - 1)) */
 #endif
 
                      /* Smooth/approx/avg/lowpass */
                      yt [h           ] [i] [k] += h4 [wIdx] * yExt;
                         
                      /* Detail/diff/highpass */
                      yt [h + ytOffset] [i] [k] += g4 [wIdx] * yExt;
                   }
                }
             }   
          }
       } while (i0);
       
 #ifdef WAVELET_DBG
       if (axis)
          printf("VERT FWD D4: (%dx%d) H-approx, (%dx%d) H-detail\n"
             "-> (%dx%d) V-approx of H-approx, (%dx%d) V-approx of H-detail,\n"
             "   (%dx%d) V-detail of H-approx, (%dx%d) V-detail of V-detail\n",
             ytLen, yLen, ytLen, yLen,
             ytLen, ytLen, ytLen, ytLen, ytLen, ytLen, ytLen, ytLen
          );
       else 
          printf(
             "HORIZ FWD D4: (%dx%d) -> (%dx%d) H-approx, (%dx%d) H-detail\n",
             yLen, yLen,
             ytLen, yLen, ytLen, yLen
          );
          
       dumpCoeffs2(y, yt, ytTotalLen, ytTotalLen, 0);
       putchar('\n');
 #endif
 
       axis ^= 1;
    }
    
    return ytLen;
 }
 
 
 
 static unsigned padD4InvStep2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned yLen, unsigned yOffset
 )
 /*
    Single iteration of inverse 2D Daubechies D4 wavelet transform with
    boundary padding on array y containing pairs of (yLen x yLen)
    approximation and detail coefficients in 3-tuples.  This reverses the
    forward transform above.  The transform is inverted over y's 2nd axis
    (i.e.  horizontally, assuming row-major addressing as per C convention).
    
    The inverted coefficients are returned in the *TRANSPOSED* output array
    yt. The transpose arranges the inverted coefficients vertically, and
    prepares the array for another inverse transform over its 2nd axis in a
    subsequent call (this time as input y).  The result of a subsequent
    inverse transform restores yt to y's original orientation, in which case
    both horizontal and vertical axes have been inversely transformed.
    
    The number of output coefficients is smaller than the input as padding
    coefficients are collapsed at the boundaries during the inverse
    transform.  The number of output coefficients per dimension ytLen is
    obtained as returned value, and can be interrogated beforehand without
    performing a transform by passing either y or yt as NULL.
    
    The inversion expects approximation coeffs y [0..yLen-1][0..yLen-1]
    and y [0..yLen-1] [yOffset..yOffset+yLen-1], and detail coefficients
    y [yOffset..yOffset+yLen-1] [0..yLen-1] and 
    y [yOffset..yOffset+yLen-1] [yOffset..yOffset+yLen-1].  
    
    The offset yOffset must be supplied by the caller to account for the fact
    that the number of output coefficients ytLen is smaller than yLen due to
    the collapsed padding coefficients.  Consequently there is a gap between
    the inverted coefficients located in the corners of the matrix.  This gap
    increases with each iteration, until all coefficients have been consumed
    by the inversion.
 */
 {
    static unsigned   axis = 1;
    unsigned          h, i, j, k, l, i0 = 0, iN;
    /* Number of output coeff (pairs) ytLen, max dimension of entire array */
    unsigned          ytLen = (yLen - 1) << 1;
    const unsigned    yTotalLen = yOffset + yLen;
 
    if (y && yt && ytLen >= 2) {
 #ifdef WAVELET_DBG
       /* Clear output array yt to facilitate debugging */
       clearCoeffs2(yt, yTotalLen);
 #else
       /* Zero output (sub)array yt; note this will not affect accumulated
        * coefficients from previous iterations, as they lie outside the
        * array bounds */
       zeroCoeffs2(yt, yTotalLen);
 #endif
       /* INV TRANSFORM GOTCHAS:
        *
        * i, j are swapped relative to the forward transform, as axis order
        * is now reversed (vertical before horizontal xform).  yt is
        * transposed on the fly such that the next function call transforms
        * over the alternate axis.  This is done by simply swapping the
        * indices during assignment.
        *
        * In the inverse vertical transform, the input array contains two
        * sets of coefficients:
        * (a) approx of approx coeffs y [0..yLen-1] [0..yLen-1] in the
        *     upper left, and details of approximation coefficients 
        *     y [yOffset..yTotalLen-1] [0..yLen-1] in the lower left.
        * (b) approx of detail coeffs y [0..yLen-1] [yOffset..yTotalLen-1]
        *     in the upper right, and details of detail coefficients 
        *     y [yOffset..yTotalLen-1] [yOffset..yTotalLen-1] in lower right.
        *
        * The loop below is iterated twice with an offset to select both sets
        * of coefficients to inverse the transform.  Note set (b) is selected
        * first to facilitate the looping logic; if i0==0 at the end of the
        * loop (i.e.  set (a) was selected), it terminates.
        */
       do {
          i0 ^= yOffset * (axis);
          iN = i0 ? yTotalLen : ytLen;
          
          for (i = i0; i < iN; i++) {
             for (j = 0; j < ytLen; j += 2) {
                h = j >> 1;
                
                for (k = 0; k < 3; k++) {
                   yt [i] [j    ] [k] = 
                      h4 [2] * y [h              ] [i] [k] +   /* Approx */
                      g4 [2] * y [h + yOffset    ] [i] [k] +   /* Detail */
                      h4 [0] * y [h + 1          ] [i] [k] +   /* Approx */
                      g4 [0] * y [h + 1 + yOffset] [i] [k];    /* Detail */
                      
                   yt [i] [j + 1] [k] =
                      h4 [3] * y [h              ] [i] [k] +   /* Approx */
                      g4 [3] * y [h + yOffset    ] [i] [k] +   /* Detail */
                      h4 [1] * y [h + 1          ] [i] [k] +   /* Approx */
                      g4 [1] * y [h + 1 + yOffset] [i] [k];    /* Detail */
                }
             }
          }  
       } while (i0);
 
 #ifdef WAVELET_DBG
       if (axis)
          printf("VERT INV D4: "
             "(%dx%d) V-approx of H-approx, (%dx%d) V-approx of H-detail,\n"
             "(%dx%d) V-detail of H-approx, (%dx%d) V-detail of V-detail\n"
             "-> (%dx%d) H-approx, (%dx%d) H-detail\n",
             yLen, yLen, yLen, yLen, yLen, yLen, yLen, yLen, 
             yLen, ytLen, yLen, ytLen
          );
       else 
          printf(
             "HORIZ INV D4: (%dx%d) H-approx, (%dx%d) H-detail -> (%dx%d)\n",
             yLen, ytLen, yLen, ytLen,
             ytLen, ytLen
          );
 
       dumpCoeffs2(y, yt, yTotalLen, yTotalLen, 0);
       putchar('\n');
 #endif
 
       axis ^= 1;
    }
 
    return ytLen;
 }
 
 
 
 unsigned padWaveletXform2 (WaveletMatrix2 y, WaveletMatrix2 yt, 
    unsigned yLen0, unsigned *numNonZero
 )
 /* Perform full 2D multiresolution forward wavelet transform with padded
    boundary coefficients on array y containing (yLen0 x yLen0) samples of
    3-tuples.  Note no intra-tuple transform occurs.  An additional array yt
    of identical dimension to y is required as buffer for intermediate
    results.
    
    The wavelet coefficients are returned in array y, containing the coarsest
    approximation coefficients in y [0][0] followed by horizontal/vertical
    detail coefficients in order of increasing resolution/frequency.
    
    NOTE: Both y and yt must be large enough to accommodate the additional
    padding coefficients introduced at the array boundaries during the
    transform.  It is the caller's responsibility to preallocate and
    dimension these arrays accordingly with allocWaveletMatrix2().  The
    dimension of the output array (which generally exceeds the input
    dimension yLen0) must be interrogated beforehand by calling the function
    with y or yt set to NULL.
    
    The returned value is the output dimension, or 0 if the transform failed. 
    In addition, the number of nonzero coefficients is returned in
    numNonZero (if not NULL), which is a subset of the number of output
    coefficients, i.e. (output dimension)^2.
 */
 {
    /* Output size (number of coeffs, including boundary padding coeffs),
       output array offset and pointers */
    unsigned       yLen, ytLen, ytOffset, totalLen = 0, totalNZ = 0;
 
    /* Transform if enough samples, else just return output size.
       NOTE: the D4 transform will be skipped for input sizes yLen0 < 4. */
    if (yLen0 >= 2) {
       /* Figure out total number of coefficients (= maximum size of output
          array including boundary padding) by summing the number of detail
          coeffs returned per dimension by each xform iteration.  Also sum
          number of nonzero detail coefficients.  */
       for (yLen = yLen0, totalLen = 0;
          yLen > 3; 
          totalLen += yLen = padD4Step2(NULL, NULL, yLen, 0),
          totalNZ += yLen * yLen
       );
       /* Finally add number of approx coeffs at coarsest resolution */
       totalLen += yLen;
       totalNZ = 3 * totalNZ + yLen * yLen;
 
       if (y && yt) {
          /* Apply Daubechies D4 transform, accounting for boundary padding */
          for (yLen = yLen0, ytOffset = totalLen; 
             yLen > 3;
             yLen = ytLen
          ) {
             /* Get num of coeff pairs returned by the next xform iteration,
                and calculate the offset of detail coeffs in output array */
             ytOffset -= ytLen = padD4Step2(NULL, NULL, yLen, 0);
             /* Apply horizontal & vertical Daubechies D4 transform, swapping
                input and transposed output array.  The resulting detail
                coefficients are prepended to those from the previous
                iteration from position ytOffset; consequently, the returned
                number of coefficient pairs is the input size for next
                iteration */
             if (!padD4Step2(y, yt, yLen, ytOffset) || 
                !padD4Step2(yt, y, yLen, ytOffset)
             )
                return 0;
          }
 #ifdef WAVELET_FINAL_HAAR
          /* Apply horizontal & vertical Haar transform in penultimate and
             final iterations (since yLen is odd) to decompose to a single
             approximation coefficient y [0] [0] at the coarsest resolution;
             all other coeffs are details.  Obviously no boundary padding is
             necessary here.  */
          for (; yLen >= 2; yLen--) {
             if (!padHaarStep2(y, yt, yLen) || !padHaarStep2(yt, y, yLen))
                return 0;
          }
 #endif
       }
    }
 
    /* Optionally return number of nonzero coeffs */
    if (numNonZero)
       *numNonZero = totalNZ;
       
    /* All coeffs now in y; return output size */
    return totalLen;
 }
 
 
 
 unsigned padWaveletInvXform2 (WaveletMatrix2 y, WaveletMatrix2 yt,
    unsigned yLen0, unsigned yLenOrg
 )
 /* Perform full 2D multiresolution inverse wavelet transform on array y
    containing (yLen0 x yLen0) boundary padded wavelet coefficients as
    3-tuples.  The original number of samples (yLenOrg x yLenOrg) is required
    for the reconstruction.  An additional array yt of identical dimension to
    y is required as buffer for intermediate results.
  
    The reconstructed samples are returned in y.  This is smaller than (yLen0
    x yLen0) since the boundary coefficients from the forward transform are
    collapsed.
  
    The returned value is the reconstructed number of samples per dimension
    (yLenOrg), or 0 if the inverse transform failed.
  */
 {
    unsigned    yLen = 0, nextyLen, ytLen, ytOffset, i, j, d4Iter;
 
    if (yLenOrg < 2)
       /* Too few coefficients for reconstruction */
       return 0;
       
    /* Figure out num Daubechies iterations based on original num coeffs */
    for (d4Iter = 0, yLen = yLenOrg; 
       yLen > 3; 
       d4Iter++, yLen = padD4Step2(NULL, NULL, yLen, 0)
    );
 #ifdef WAVELET_FINAL_HAAR
    /* Invert horizontal & vertical Haar transform at two coarsest levels */
    for (yLen = 2; yLen <= min(3, yLen0); yLen++)
       if (!padHaarInvStep2(y, yt, yLen) || !padHaarInvStep2(yt, y, yLen))
          return 0;
 #endif
    /* Invert horiz & vert Daubechies D4 transform on subsequent levels */
    for (i = d4Iter; i; i--) {
       /* Get num detail coeffs and their offset for current iteration */
       for (j = 0, ytOffset = yLen0, yLen = nextyLen = yLenOrg; j < i; j++) {
          nextyLen = yLen;
          ytOffset -= yLen = padD4Step2(NULL, NULL, yLen, 0);
       }
 
       if (!(ytLen = padD4InvStep2(y, yt, yLen, ytOffset)) ||
          !(ytLen = padD4InvStep2(yt, y, yLen, ytOffset))
       )
          return 0;
    
       /* XXX: Drop excess reconstructed approximation coeffs to match number
          of detail coeffs in next iteration. */
       yLen = min(ytLen, nextyLen);
    }
    
    return yLenOrg;
 }
 
 
 
 #ifdef WAVELET_TEST_2DPAD
    #include <stdio.h>   
    
    #ifdef WAVELET_TEST_mRGBE
       #include "mrgbe.h"
    #endif
    
    #define WAVELET_TEST_INIT 6
    
    
    
    static void avgDetailCoeffs (const WaveletMatrix2 y, unsigned yLen, 
       WaveletCoeff3 avg
    )
    {
       unsigned i, j, k, numCoeffs = 0;
       
       avg [0] = avg [1] = avg [2] = 0;
       for (i = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++)
             /* Skip zero and cleared coeffs */
             if ((i > 2 || j > 2) && !coeffIsZero(y [i][j]) && 
                !coeffIsEmpty(y [i][j])
             ) {
                for (k = 0; k < 3; k++)
                   avg [k] += y [i][j][k];
                numCoeffs++;
             }
       
       for (k = 0; k < 3; k++)
          avg [k] /= numCoeffs;
    }
          
    
    
    int main (int argc, char *argv [])
    {
       int            i, j, k;
       unsigned       y0Len, yLen, numThresh = 0, numNonzero = 0;
       WaveletMatrix2  y0 = NULL, y = NULL, yt = NULL;
       FILE           *dataFile = NULL;
       WAVELET_COEFF  inData, thresh = 0;
       WaveletCoeff3  avgDetail;
    #ifdef WAVELET_TEST_mRGBE
       #define        HUGE 1e10
       mRGBE          mrgbeCoeff;
       mRGBERange     mrgbeRange;
       WaveletMatrix2  ymrgbe = NULL;
    #endif
       
       if (argc < 2) {
          fprintf(stderr, "%s <dim> [threshold] [dataFile]\n", argv [0]);
          fputs("Missing array dimension dim >= 2, "
             "compression threshold >= 0\n", stderr
          );
          return -1;
       }
       if (!(y0Len = atoi(argv [1])) || y0Len < 2) {
          fprintf(stderr, "Invalid array dimension %d\n", y0Len);
          return -1;
       }
       if (argc > 2 && (thresh = atof(argv [2])) < 0) {
          fprintf(stderr, "Invalid threshold %.3f\n", thresh);
          return -1;
       }
       
       /* Get size of (padded) output array */
       if (!(yLen = padWaveletXform2(NULL, NULL, y0Len, &numNonzero))) {
          fputs("Can't determine number of coefficients "
             "(input too short?)\n", stderr
          );
          return -1;
       }
       
       /* Allocate & clear arrays for original and reconstruction */
       if (!(y0 = allocWaveletMatrix2(y0Len)) || 
          !(y = allocWaveletMatrix2(yLen)) ||
          !(yt = allocWaveletMatrix2(yLen))
    #ifdef WAVELET_TEST_mRGBE
          || !(ymrgbe = allocWaveletMatrix2(yLen))
    #endif
       ) {
          fprintf(stderr, "Failed allocating %dx%d array\n", yLen, yLen);
          return -1;
       }
       clearCoeffs2(y0, y0Len);
       clearCoeffs2(y, yLen);
       clearCoeffs2(yt, yLen);
       
       if (argc > 3) {
          /* Load data from file; length must not exceed allocated */
          if (!(dataFile = fopen(argv [3], "r"))) {
             fprintf(stderr, "Failed opening data file %s\n", argv [3]);
             return -1;
          }
       
          for (i = 0; i < y0Len; i++) {
             for (j = 0; j < y0Len; j++) {
                if (feof(dataFile)) {
                   fprintf(stderr, 
                      "Premature end of file reading data from %s\n",
                      argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
             
                /* Read next float, skipping any leading whitespace */
                if (fscanf(dataFile, " %lf", &inData)) {
                   y0 [i][j][0] = y0 [i][j][1] = y0 [i][j][2] = 
                      y [i][j][0] = y [i][j][1] = y [i][j][2] = inData;
                }
                else {
                   fprintf(stderr, 
                      "Error reading from data file %s\n", argv [2]
                   );
                   fclose(dataFile);
                   return -1;
                }
             }
          }
          
          fclose(dataFile);
       }
       else {
          /* Init input */
          srand48(111);
          for (i = 0; i < y0Len; i++) {
             for (j = 0; j < y0Len; j++) {
                for (k = 0; k < 3; k++) {
                   y0 [i][j][k] = y [i][j][k] =                  
    #if WAVELET_TEST_INIT == 0
                      /* Random data, channel-independent */
                      drand48();
    #elif WAVELET_TEST_INIT == 1
                      /* Random data, indentical for all channels */
                      k ? y [i][j][k - 1] : drand48();
    #elif WAVELET_TEST_INIT == 2
                      /* Monotonically increasing along axis 0 */
                      i;
    #elif WAVELET_TEST_INIT == 3
                      /* Monotonically increasing along axis 1 */
                      j;
    #elif WAVELET_TEST_INIT == 4
                      /* Monotonically increasing along both axes */
                      i * j;
    #elif WAVELET_TEST_INIT == 5
                      /* Monotonically increasing by linear index */
                      i * y0Len + j;
    #elif WAVELET_TEST_INIT == 6
                      /* Symmetric "bump" function */
                      (1. - fabs(j - y0Len/2. + 0.5) / (y0Len/2. - 0.5)) *
                      (1. - fabs(i - y0Len/2. + 0.5) / (y0Len/2. - 0.5));
    #endif
                }
             }
          }
       }
 
    #ifdef WAVELET_DBG
       puts("----------------------- FWD XFORM -------------------------\n");
    #endif
       
       /* Forward xform */
       if (!padWaveletXform2(y, yt, y0Len, NULL)) {
          fputs("Forward xform failed\n", stderr);
          return -1;
       }
 
       /* Threshold coefficients; we use hard thresholding as it's easier to
        * implement than soft thresholding, which requires sorting the
        * coefficients.  NOTE: the coarsest approximation coefficient at
        * y [0][0] is omitted as it is essential for reconstruction! */
       for (i = numThresh = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++) {
             if (coeffThresh(y, i, j, thresh)) {
                y [i][j][0] = y [i][j][1] = y [i][j][2] = 0;
                numThresh++;
    #if 0
                /* Replace thresholded values with random noise in range
                   [-threshold, threshold] */
                y [i][j][0] = y [i][j][1] = y [i][j][2] = thresh * (
                   2 * drand48() - 1
                );
    #endif
             }
          }
 
    #ifdef WAVELET_DBG
       /* Dump coefficients */
       puts("----------------------- COEFFICIENTS -------------------------\n");
       dumpCoeffs2(y, NULL, yLen, 0, thresh);
       printf("\n%d/%d nonzero coefficients", numNonzero, yLen * yLen);
       
       avgDetailCoeffs(y, yLen, avgDetail);
       printf("\nAvg = [%.3g, %.3g, %.3g]", 
          avgDetail [0], avgDetail [1], avgDetail [2]
       );
       
       if (numThresh)
          printf("\n%d/%d coefficients thresholded = %.1f%% compression", 
             numThresh, yLen * yLen, 100. * numThresh / (yLen * yLen)
          );      
    #endif
    
    #ifdef WAVELET_TEST_mRGBE
       /* Test mRGBE coefficient encoding */
       mrgbeRange.min [0] = mrgbeRange.min [1] = mrgbeRange.min [2] = HUGE;
       mrgbeRange.max [0] = mrgbeRange.max [1] = mrgbeRange.max [2] = 0;
       clearCoeffs2(ymrgbe, yLen);   
       
-      /* Find min/max coeff and init mRGBE range */
+      /* Find min/max coeff and init mRGBE range, omitting upper left 3x3
+       * approximation coefficients, since these aren't encoded */
       for (i = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++)
-            if (!coeffIsEmpty(y [i][j]) && !coeffIsZero(y [i][j]))
+            if ((i > 2 || j > 2) && 
+               !coeffIsEmpty(y [i][j]) && !coeffIsZero(y [i][j])
+            )
                for (k = 0; k < 3; k++) {
                   /* Skip empty (NaN) coeffs */
                   inData = fabs(y [i][j][k]);
                   if (inData < mrgbeRange.min [k])
                      mrgbeRange.min [k] = inData;
                   if (inData > mrgbeRange.max [k])
                      mrgbeRange.max [k] = inData;
                }
       mRGBEinit(&mrgbeRange, mrgbeRange.min, mrgbeRange.max);      
       
-      /* Encode coeffs to mRGBE and back, but preserve approximation
-       * coefficients */
+      /* Encode coeffs to mRGBE and back, but preserve upper 3x3
+       * approximation coefficients */
       for (i = 0; i < yLen; i++)
          for (j = 0; j < yLen; j++)
-            if (!coeffIsEmpty(y [i][j]) && !coeffIsZero(y [i][j])) {
+            if ((i > 2 || j > 2) && 
+               !coeffIsEmpty(y [i][j]) && !coeffIsZero(y [i][j])
+            ) {
                mrgbeCoeff = mRGBEencode(y [i][j], &mrgbeRange, 0);
                mRGBEdecode(mrgbeCoeff, &mrgbeRange, ymrgbe [i][j]);
             }
             else for (k = 0; k < 3; k++)
                ymrgbe [i][j][k] = y [i][j][k];
          
       #ifdef WAVELET_DBG
          /* Dump mRGBE-decoded coefficients */
          puts("\n\n-------------------- mRGBE COEFFICIENTS ----------------------\n");           
          dumpCoeffs2(ymrgbe, NULL, yLen, yLen, 0);
          
          avgDetailCoeffs(y, yLen, avgDetail);
          printf("\nAvg = [%.3g, %.3g, %.3g]", 
             avgDetail [0], avgDetail [1], avgDetail [2]
          );
       #endif
    #endif
 
    #ifdef WAVELET_DBG
       puts("\n----------------------- INV XFORM -------------------------\n");
    #endif
    
       /* Inverse xform (also using mRGBE coeffs if enabled) */
       if (!padWaveletInvXform2(y, yt, yLen, y0Len)
    #ifdef WAVELET_TEST_mRGBE
          || !padWaveletInvXform2(ymrgbe, yt, yLen, y0Len)
    #endif
       ) {
          fputs("\nInverse xform failed\n", stderr);
          return -1;
       }
    
    #ifdef WAVELET_DBG
       puts("\n--------------------- ORIG vs. INV XFORM ------------------------\n");
       dumpCoeffs2(y0, y, y0Len, y0Len, 0);
    #endif
       printf("\nAvg RMSE = %.2f\n", rmseCoeffs2(y0, y, y0Len));
       
    #ifdef WAVELET_TEST_mRGBE
       #ifdef WAVELET_DBG
          puts("\n------------------ ORIG vs. INV XFORM + mRGBE -------------------\n");
          dumpCoeffs2(y0, ymrgbe, y0Len, y0Len, 0);
       #endif
       printf("\nAvg RMSE with mRGBE enc = %.2f\n", 
          rmseCoeffs2(y0, ymrgbe, y0Len)
       );
    #endif
       
       freeWaveletMatrix2(y0, y0Len);
       freeWaveletMatrix2(y, yLen);
       freeWaveletMatrix2(yt, yLen);
    #ifdef WAVELET_TEST_mRGBE
       freeWaveletMatrix2(ymrgbe, yLen);
    #endif
       return 0;
    }
 #endif /* WAVELET_TEST_2DPAD */