diff --git a/src/KSPACE/ewald.cpp b/src/KSPACE/ewald.cpp
index 4ec4eed9e..af4803f57 100644
--- a/src/KSPACE/ewald.cpp
+++ b/src/KSPACE/ewald.cpp
@@ -1,967 +1,964 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
      per-atom energy/virial added by German Samolyuk (ORNL), Stan Moore (BYU)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "stdlib.h"
 #include "stdio.h"
 #include "string.h"
 #include "math.h"
 #include "ewald.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "pair.h"
 #include "domain.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 #define SMALL 0.00001
 
 /* ---------------------------------------------------------------------- */
 
 Ewald::Ewald(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
 {
   if (narg != 1) error->all(FLERR,"Illegal kspace_style ewald command");
 
   accuracy_relative = atof(arg[0]);
 
   kmax = 0;
   kxvecs = kyvecs = kzvecs = NULL;
   ug = NULL;
   eg = vg = NULL;
   sfacrl = sfacim = sfacrl_all = sfacim_all = NULL;
 
   nmax = 0;
   ek = NULL;
   cs = sn = NULL;
 
   kcount = 0;
 }
 
 /* ----------------------------------------------------------------------
    free all memory 
 ------------------------------------------------------------------------- */
 
 Ewald::~Ewald()
 {
   deallocate();
   memory->destroy(ek);
   memory->destroy3d_offset(cs,-kmax_created);
   memory->destroy3d_offset(sn,-kmax_created);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Ewald::init()
 {
   if (comm->me == 0) {
     if (screen) fprintf(screen,"Ewald initialization ...\n");
     if (logfile) fprintf(logfile,"Ewald initialization ...\n");
   }
 
   // error check
 
   if (domain->triclinic) 
     error->all(FLERR,"Cannot use Ewald with triclinic box");
   if (domain->dimension == 2) 
     error->all(FLERR,"Cannot use Ewald with 2d simulation");
 
   if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
 
   if (slabflag == 0 && domain->nonperiodic > 0)
     error->all(FLERR,"Cannot use nonperiodic boundaries with Ewald");
   if (slabflag == 1) {
     if (domain->xperiodic != 1 || domain->yperiodic != 1 || 
 	domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
       error->all(FLERR,"Incorrect boundaries with slab Ewald");
   }
 
   // extract short-range Coulombic cutoff from pair style
 
   scale = 1.0;
 
   if (force->pair == NULL)
     error->all(FLERR,"KSpace style is incompatible with Pair style");
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   if (p_cutoff == NULL)
     error->all(FLERR,"KSpace style is incompatible with Pair style");
   double cutoff = *p_cutoff;
 
   qsum = qsqsum = 0.0;
   for (int i = 0; i < atom->nlocal; i++) {
     qsum += atom->q[i];
     qsqsum += atom->q[i]*atom->q[i];
   }
 
   double tmp;
   MPI_Allreduce(&qsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
   qsum = tmp;
   MPI_Allreduce(&qsqsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
   qsqsum = tmp;
 
   if (qsqsum == 0.0)
     error->all(FLERR,"Cannot use kspace solver on system with no charge");
   if (fabs(qsum) > SMALL && comm->me == 0) {
     char str[128];
     sprintf(str,"System is not charge neutral, net charge = %g",qsum);
     error->warning(FLERR,str);
   }
 
   // set accuracy (force units) from accuracy_relative or accuracy_absolute
   
   if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
   else accuracy = accuracy_relative * two_charge_force;
   
   // setup K-space resolution
 
   q2 = qsqsum * force->qqrd2e / force->dielectric;
   bigint natoms = atom->natoms;
 
   // use xprd,yprd,zprd even if triclinic so grid size is the same
   // adjust z dimension for 2d slab Ewald
   // 3d Ewald just uses zprd since slab_volfactor = 1.0
 
   double xprd = domain->xprd;
   double yprd = domain->yprd;
   double zprd = domain->zprd;
   double zprd_slab = zprd*slab_volfactor;
   
   // make initial g_ewald estimate
   // based on desired accuracy and real space cutoff
   // fluid-occupied volume used to estimate real-space error
   // zprd used rather than zprd_slab
 
   if (!gewaldflag)
     g_ewald = sqrt(-log(accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / 
                         (2.0*q2))) / cutoff;
 
   // setup Ewald coefficients so can print stats
 
   setup();
 
- // final RMS accuracy
+  // final RMS accuracy
 
   double lprx = rms(kxmax,xprd,natoms,q2);
   double lpry = rms(kymax,yprd,natoms,q2);
   double lprz = rms(kzmax,zprd_slab,natoms,q2);
   double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
   double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / 
     sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
-
+  
   // stats
 
   if (comm->me == 0) {
     if (screen) {
       fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
       fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
 	      MAX(lpr,spr));
       fprintf(screen,"  estimated relative force accuracy = %g\n",
 	      MAX(lpr,spr)/two_charge_force);
       fprintf(screen,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
 	      kcount,kmax,kmax3d);
     }
     if (logfile) {
       fprintf(logfile,"  G vector (1/distnace) = %g\n",g_ewald);
       fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
 	      MAX(lpr,spr));
       fprintf(logfile,"  estimated relative force accuracy = %g\n",
 	      MAX(lpr,spr)/two_charge_force);
       fprintf(logfile,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
 	      kcount,kmax,kmax3d);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    adjust Ewald coeffs, called initially and whenever volume has changed 
 ------------------------------------------------------------------------- */
 
 void Ewald::setup()
 {
   // volume-dependent factors
 
   double xprd = domain->xprd;
   double yprd = domain->yprd;
   double zprd = domain->zprd;
   
   // adjustment of z dimension for 2d slab Ewald
   // 3d Ewald just uses zprd since slab_volfactor = 1.0
 
   double zprd_slab = zprd*slab_volfactor;
   volume = xprd * yprd * zprd_slab;
 
   unitk[0] = 2.0*MY_PI/xprd;
   unitk[1] = 2.0*MY_PI/yprd;
   unitk[2] = 2.0*MY_PI/zprd_slab;
 
   // determine kmax
   // function of current box size, accuracy, G_ewald (short-range cutoff)
 
   bigint natoms = atom->natoms;
   double err;
   kxmax = 1;
   kymax = 1;
   kzmax = 1;
   
   err = rms(kxmax,xprd,natoms,q2);
   while (err > accuracy) {
     kxmax++;
     err = rms(kxmax,xprd,natoms,q2);
   }
 
   err = rms(kymax,yprd,natoms,q2);
   while (err > accuracy) {
     kymax++;
     err = rms(kymax,yprd,natoms,q2);
   }
 
   err = rms(kzmax,zprd_slab,natoms,q2);
   while (err > accuracy) {
     kzmax++;
     err = rms(kzmax,zprd_slab,natoms,q2);
   } 
 
   int kmax_old = kmax;
   kmax = MAX(kxmax,kymax);
   kmax = MAX(kmax,kzmax);
   kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
 
   double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
   double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
   double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
   gsqmx = MAX(gsqxmx,gsqymx);
   gsqmx = MAX(gsqmx,gsqzmx);
   
   // if size has grown, reallocate k-dependent and nlocal-dependent arrays
 
   if (kmax > kmax_old) {
     deallocate();
     allocate();
 
     memory->destroy(ek);
     memory->destroy3d_offset(cs,-kmax_created);
     memory->destroy3d_offset(sn,-kmax_created);
     nmax = atom->nmax;
     memory->create(ek,nmax,3,"ewald:ek");
     memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald:cs");
     memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald:sn");
     kmax_created = kmax;
   }
 
   // pre-compute Ewald coefficients
 
   coeffs();
 }
 
 /* ----------------------------------------------------------------------
    compute RMS accuracy for a dimension
 ------------------------------------------------------------------------- */
 
 double Ewald::rms(int km, double prd, bigint natoms, double q2)
 {
   double value = 2.0*q2*g_ewald/prd * 
     sqrt(1.0/(MY_PI*km*natoms)) * 
     exp(-MY_PI*MY_PI*km*km/(g_ewald*g_ewald*prd*prd));
 
   return value;
 }
 
 /* ----------------------------------------------------------------------
    compute the Ewald long-range force, energy, virial 
 ------------------------------------------------------------------------- */
 
 void Ewald::compute(int eflag, int vflag)
 {
   int i,j,k;
 
   // set energy/virial flags
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = evflag_atom = eflag_global = vflag_global = 
 	 eflag_atom = vflag_atom = 0;
 
   // extend size of per-atom arrays if necessary
 
   if (atom->nlocal > nmax) {
     memory->destroy(ek);
     memory->destroy3d_offset(cs,-kmax_created);
     memory->destroy3d_offset(sn,-kmax_created);
     nmax = atom->nmax;
     memory->create(ek,nmax,3,"ewald:ek");
     memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald:cs");
     memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald:sn");
     kmax_created = kmax;
   }
 
   // partial structure factors on each processor
   // total structure factor by summing over procs
 
   eik_dot_r();
   MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
 
   // K-space portion of electric field
   // double loop over K-vectors and local atoms
   // perform per-atom calculations if needed
 
   double **f = atom->f;
   double *q = atom->q;
   int nlocal = atom->nlocal;
 
   int kx,ky,kz;
   double cypz,sypz,exprl,expim,partial,partial_peratom;
 
   for (i = 0; i < nlocal; i++) {
     ek[i][0] = 0.0;
     ek[i][1] = 0.0;
     ek[i][2] = 0.0;
   }
 
   for (k = 0; k < kcount; k++) {
     kx = kxvecs[k];
     ky = kyvecs[k];
     kz = kzvecs[k];
 
     for (i = 0; i < nlocal; i++) {
       cypz = cs[ky][1][i]*cs[kz][2][i] - sn[ky][1][i]*sn[kz][2][i];
       sypz = sn[ky][1][i]*cs[kz][2][i] + cs[ky][1][i]*sn[kz][2][i];
       exprl = cs[kx][0][i]*cypz - sn[kx][0][i]*sypz;
       expim = sn[kx][0][i]*cypz + cs[kx][0][i]*sypz;
       partial = expim*sfacrl_all[k] - exprl*sfacim_all[k];
       ek[i][0] += partial*eg[k][0];
       ek[i][1] += partial*eg[k][1];
       ek[i][2] += partial*eg[k][2];
 
       if (evflag_atom) {
         partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];		
     	if (eflag_atom) eatom[i] += q[i]*ug[k]*partial_peratom;
         if (vflag_atom)
 	  for (j = 0; j < 6; j++)
 	    vatom[i][j] += ug[k]*vg[k][j]*partial_peratom;
       }
     }
   }
 
   // convert E-field to force
 
   const double qscale = force->qqrd2e * scale;
 
   for (i = 0; i < nlocal; i++) {
     f[i][0] += qscale * q[i]*ek[i][0];
     f[i][1] += qscale * q[i]*ek[i][1];
     f[i][2] += qscale * q[i]*ek[i][2];
   }
  
   // global energy
 
   if (eflag_global) {
     for (k = 0; k < kcount; k++)
       energy += ug[k] * (sfacrl_all[k]*sfacrl_all[k] + 
 			 sfacim_all[k]*sfacim_all[k]);
     energy -= g_ewald*qsqsum/MY_PIS +
       MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
     energy *= qscale;
   }
 
   // global virial
 
   if (vflag_global) {
     double uk;
     for (k = 0; k < kcount; k++) {
       uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
       for (j = 0; j < 6; j++) virial[j] += uk*vg[k][j];
     }
     for (j = 0; j < 6; j++) virial[j] *= qscale;
   }
 
   // per-atom energy/virial
   // energy includes self-energy correction
 
   if (evflag_atom) {
     if (eflag_atom) {
       for (i = 0; i < nlocal; i++) {
         eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / 
 	  (g_ewald*g_ewald*volume);
         eatom[i] *= qscale;
       }
     }
       
     if (vflag_atom)
       for (i = 0; i < nlocal; i++)
 	for (j = 0; j < 6; j++) vatom[i][j] *= q[i]*qscale;
   }
 
   // 2d slab correction
 
   if (slabflag) slabcorr();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Ewald::eik_dot_r()
 {
   int i,k,l,m,n,ic;
   double cstr1,sstr1,cstr2,sstr2,cstr3,sstr3,cstr4,sstr4;
   double sqk,clpm,slpm;
 
   double **x = atom->x;
   double *q = atom->q;
   int nlocal = atom->nlocal;
 
   n = 0;
 
   // (k,0,0), (0,l,0), (0,0,m)
 
   for (ic = 0; ic < 3; ic++) {
     sqk = unitk[ic]*unitk[ic];
     if (sqk <= gsqmx) {
       cstr1 = 0.0;
       sstr1 = 0.0;
       for (i = 0; i < nlocal; i++) {
 	cs[0][ic][i] = 1.0;
 	sn[0][ic][i] = 0.0;
 	cs[1][ic][i] = cos(unitk[ic]*x[i][ic]);
 	sn[1][ic][i] = sin(unitk[ic]*x[i][ic]);
 	cs[-1][ic][i] = cs[1][ic][i];
 	sn[-1][ic][i] = -sn[1][ic][i];
 	cstr1 += q[i]*cs[1][ic][i];
 	sstr1 += q[i]*sn[1][ic][i];
       }
       sfacrl[n] = cstr1;
       sfacim[n++] = sstr1;
     }
   }
 
   for (m = 2; m <= kmax; m++) {
     for (ic = 0; ic < 3; ic++) {
       sqk = m*unitk[ic] * m*unitk[ic];
       if (sqk <= gsqmx) {
 	cstr1 = 0.0;
 	sstr1 = 0.0;
 	for (i = 0; i < nlocal; i++) {
 	  cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] - 
 	    sn[m-1][ic][i]*sn[1][ic][i];
 	  sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] + 
 	    cs[m-1][ic][i]*sn[1][ic][i];
 	  cs[-m][ic][i] = cs[m][ic][i];
 	  sn[-m][ic][i] = -sn[m][ic][i];
 	  cstr1 += q[i]*cs[m][ic][i];
 	  sstr1 += q[i]*sn[m][ic][i];
 	}
 	sfacrl[n] = cstr1;
 	sfacim[n++] = sstr1;
       }
     }
   }
 
   // 1 = (k,l,0), 2 = (k,-l,0)
 
   for (k = 1; k <= kxmax; k++) {
     for (l = 1; l <= kymax; l++) {
       sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]);
       if (sqk <= gsqmx) {
 	cstr1 = 0.0;
 	sstr1 = 0.0;
 	cstr2 = 0.0;
 	sstr2 = 0.0;
 	for (i = 0; i < nlocal; i++) {
 	  cstr1 += q[i]*(cs[k][0][i]*cs[l][1][i] - sn[k][0][i]*sn[l][1][i]);
 	  sstr1 += q[i]*(sn[k][0][i]*cs[l][1][i] + cs[k][0][i]*sn[l][1][i]);
 	  cstr2 += q[i]*(cs[k][0][i]*cs[l][1][i] + sn[k][0][i]*sn[l][1][i]);
 	  sstr2 += q[i]*(sn[k][0][i]*cs[l][1][i] - cs[k][0][i]*sn[l][1][i]);
 	}
 	sfacrl[n] = cstr1;
 	sfacim[n++] = sstr1;
 	sfacrl[n] = cstr2;
 	sfacim[n++] = sstr2;
       }
     }
   }
 
   // 1 = (0,l,m), 2 = (0,l,-m)
 
   for (l = 1; l <= kymax; l++) {
     for (m = 1; m <= kzmax; m++) {
       sqk = (l*unitk[1] * l*unitk[1]) + (m*unitk[2] * m*unitk[2]);
       if (sqk <= gsqmx) {
 	cstr1 = 0.0;
 	sstr1 = 0.0;
 	cstr2 = 0.0;
 	sstr2 = 0.0;
 	for (i = 0; i < nlocal; i++) {
 	  cstr1 += q[i]*(cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i]);
 	  sstr1 += q[i]*(sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i]);
 	  cstr2 += q[i]*(cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i]);
 	  sstr2 += q[i]*(sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i]);
 	}
 	sfacrl[n] = cstr1;
 	sfacim[n++] = sstr1;
 	sfacrl[n] = cstr2;
 	sfacim[n++] = sstr2;
       }
     }
   }
 
   // 1 = (k,0,m), 2 = (k,0,-m)
 
   for (k = 1; k <= kxmax; k++) {
     for (m = 1; m <= kzmax; m++) {
       sqk = (k*unitk[0] * k*unitk[0]) + (m*unitk[2] * m*unitk[2]);
       if (sqk <= gsqmx) {
 	cstr1 = 0.0;
 	sstr1 = 0.0;
 	cstr2 = 0.0;
 	sstr2 = 0.0;
 	for (i = 0; i < nlocal; i++) {
 	  cstr1 += q[i]*(cs[k][0][i]*cs[m][2][i] - sn[k][0][i]*sn[m][2][i]);
 	  sstr1 += q[i]*(sn[k][0][i]*cs[m][2][i] + cs[k][0][i]*sn[m][2][i]);
 	  cstr2 += q[i]*(cs[k][0][i]*cs[m][2][i] + sn[k][0][i]*sn[m][2][i]);
 	  sstr2 += q[i]*(sn[k][0][i]*cs[m][2][i] - cs[k][0][i]*sn[m][2][i]);
 	}
 	sfacrl[n] = cstr1;
 	sfacim[n++] = sstr1;
 	sfacrl[n] = cstr2;
 	sfacim[n++] = sstr2;
       }
     }
   }
 
   // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
 
   for (k = 1; k <= kxmax; k++) {
     for (l = 1; l <= kymax; l++) {
       for (m = 1; m <= kzmax; m++) {
 	sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]) +
 	  (m*unitk[2] * m*unitk[2]);
 	if (sqk <= gsqmx) {
 	  cstr1 = 0.0;
 	  sstr1 = 0.0;
 	  cstr2 = 0.0;
 	  sstr2 = 0.0;
 	  cstr3 = 0.0;
 	  sstr3 = 0.0;
 	  cstr4 = 0.0;
 	  sstr4 = 0.0;
 	  for (i = 0; i < nlocal; i++) {
 	    clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
 	    slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
 	    cstr1 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
 	    sstr1 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
 	    
 	    clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
 	    slpm = -sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
 	    cstr2 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
 	    sstr2 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
 	    
 	    clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
 	    slpm = sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
 	    cstr3 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
 	    sstr3 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
 	    
 	    clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
 	    slpm = -sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
 	    cstr4 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
 	    sstr4 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
 	  }
 	  sfacrl[n] = cstr1;
 	  sfacim[n++] = sstr1;
 	  sfacrl[n] = cstr2;
 	  sfacim[n++] = sstr2;
 	  sfacrl[n] = cstr3;
 	  sfacim[n++] = sstr3;
 	  sfacrl[n] = cstr4;
 	  sfacim[n++] = sstr4;
 	}
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    pre-compute coefficients for each Ewald K-vector 
 ------------------------------------------------------------------------- */
 
 void Ewald::coeffs()
 {
   int k,l,m;
   double sqk,vterm;
 
-  double unitkx = unitk[0];
-  double unitky = unitk[1];
-  double unitkz = unitk[2];
   double g_ewald_sq_inv = 1.0 / (g_ewald*g_ewald);
   double preu = 4.0*MY_PI/volume;
 
   kcount = 0;
 
   // (k,0,0), (0,l,0), (0,0,m)
 
   for (m = 1; m <= kmax; m++) {
-    sqk = (m*unitkx) * (m*unitkx);
+    sqk = (m*unitk[0]) * (m*unitk[0]);
     if (sqk <= gsqmx) {
       kxvecs[kcount] = m;
       kyvecs[kcount] = 0;
       kzvecs[kcount] = 0;
       ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-      eg[kcount][0] = 2.0*unitkx*m*ug[kcount];
+      eg[kcount][0] = 2.0*unitk[0]*m*ug[kcount];
       eg[kcount][1] = 0.0;
       eg[kcount][2] = 0.0;
       vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
-      vg[kcount][0] = 1.0 + vterm*(unitkx*m)*(unitkx*m);
+      vg[kcount][0] = 1.0 + vterm*(unitk[0]*m)*(unitk[0]*m);
       vg[kcount][1] = 1.0;
       vg[kcount][2] = 1.0;
       vg[kcount][3] = 0.0;
       vg[kcount][4] = 0.0;
       vg[kcount][5] = 0.0;
       kcount++;
     }
-    sqk = (m*unitky) * (m*unitky);
+    sqk = (m*unitk[1]) * (m*unitk[1]);
     if (sqk <= gsqmx) {
       kxvecs[kcount] = 0;
       kyvecs[kcount] = m;
       kzvecs[kcount] = 0;
       ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
       eg[kcount][0] = 0.0;
-      eg[kcount][1] = 2.0*unitky*m*ug[kcount];
+      eg[kcount][1] = 2.0*unitk[1]*m*ug[kcount];
       eg[kcount][2] = 0.0;
       vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
       vg[kcount][0] = 1.0;
-      vg[kcount][1] = 1.0 + vterm*(unitky*m)*(unitky*m);
+      vg[kcount][1] = 1.0 + vterm*(unitk[1]*m)*(unitk[1]*m);
       vg[kcount][2] = 1.0;
       vg[kcount][3] = 0.0;
       vg[kcount][4] = 0.0;
       vg[kcount][5] = 0.0;
       kcount++;
     }
-    sqk = (m*unitkz) * (m*unitkz);
+    sqk = (m*unitk[2]) * (m*unitk[2]);
     if (sqk <= gsqmx) {
       kxvecs[kcount] = 0;
       kyvecs[kcount] = 0;
       kzvecs[kcount] = m;
       ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
       eg[kcount][0] = 0.0;
       eg[kcount][1] = 0.0;
-      eg[kcount][2] = 2.0*unitkz*m*ug[kcount];
+      eg[kcount][2] = 2.0*unitk[2]*m*ug[kcount];
       vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
       vg[kcount][0] = 1.0;
       vg[kcount][1] = 1.0;
-      vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
+      vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
       vg[kcount][3] = 0.0;
       vg[kcount][4] = 0.0;
       vg[kcount][5] = 0.0;
       kcount++;
     }
   }
 
   // 1 = (k,l,0), 2 = (k,-l,0)
 
   for (k = 1; k <= kxmax; k++) {
     for (l = 1; l <= kymax; l++) {
-      sqk = (unitkx*k) * (unitkx*k) + (unitky*l) * (unitky*l);
+      sqk = (unitk[0]*k) * (unitk[0]*k) + (unitk[1]*l) * (unitk[1]*l);
       if (sqk <= gsqmx) {
 	kxvecs[kcount] = k;
 	kyvecs[kcount] = l;
 	kzvecs[kcount] = 0;
 	ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	eg[kcount][0] = 2.0*unitkx*k*ug[kcount];
-	eg[kcount][1] = 2.0*unitky*l*ug[kcount];
+	eg[kcount][0] = 2.0*unitk[0]*k*ug[kcount];
+	eg[kcount][1] = 2.0*unitk[1]*l*ug[kcount];
 	eg[kcount][2] = 0.0;
 	vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
-	vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
-	vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
+	vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
+	vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
 	vg[kcount][2] = 1.0;
-	vg[kcount][3] = vterm*unitkx*k*unitky*l;
+	vg[kcount][3] = vterm*unitk[0]*k*unitk[1]*l;
 	vg[kcount][4] = 0.0;
 	vg[kcount][5] = 0.0;
 	kcount++;
 
 	kxvecs[kcount] = k;
 	kyvecs[kcount] = -l;
 	kzvecs[kcount] = 0;
 	ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	eg[kcount][0] = 2.0*unitkx*k*ug[kcount];
-	eg[kcount][1] = -2.0*unitky*l*ug[kcount];
+	eg[kcount][0] = 2.0*unitk[0]*k*ug[kcount];
+	eg[kcount][1] = -2.0*unitk[1]*l*ug[kcount];
 	eg[kcount][2] = 0.0;
-	vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
-	vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
+	vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
+	vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
 	vg[kcount][2] = 1.0;
-	vg[kcount][3] = -vterm*unitkx*k*unitky*l;
+	vg[kcount][3] = -vterm*unitk[0]*k*unitk[1]*l;
 	vg[kcount][4] = 0.0;
 	vg[kcount][5] = 0.0;
 	kcount++;;
       }
     }
   }
 
   // 1 = (0,l,m), 2 = (0,l,-m)
 
   for (l = 1; l <= kymax; l++) {
     for (m = 1; m <= kzmax; m++) {
-      sqk = (unitky*l) * (unitky*l) + (unitkz*m) * (unitkz*m);
+      sqk = (unitk[1]*l) * (unitk[1]*l) + (unitk[2]*m) * (unitk[2]*m);
       if (sqk <= gsqmx) {
 	kxvecs[kcount] = 0;
 	kyvecs[kcount] = l;
 	kzvecs[kcount] = m;
 	ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
 	eg[kcount][0] =  0.0;
-	eg[kcount][1] =  2.0*unitky*l*ug[kcount];
-	eg[kcount][2] =  2.0*unitkz*m*ug[kcount];
+	eg[kcount][1] =  2.0*unitk[1]*l*ug[kcount];
+	eg[kcount][2] =  2.0*unitk[2]*m*ug[kcount];
 	vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
 	vg[kcount][0] = 1.0;
-	vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
-	vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
+	vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
+	vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
 	vg[kcount][3] = 0.0;
 	vg[kcount][4] = 0.0;
-	vg[kcount][5] = vterm*unitky*l*unitkz*m;
+	vg[kcount][5] = vterm*unitk[1]*l*unitk[2]*m;
 	kcount++;
 
 	kxvecs[kcount] = 0;
 	kyvecs[kcount] = l;
 	kzvecs[kcount] = -m;
 	ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
 	eg[kcount][0] =  0.0;
-	eg[kcount][1] =  2.0*unitky*l*ug[kcount];
-	eg[kcount][2] = -2.0*unitkz*m*ug[kcount];
+	eg[kcount][1] =  2.0*unitk[1]*l*ug[kcount];
+	eg[kcount][2] = -2.0*unitk[2]*m*ug[kcount];
 	vg[kcount][0] = 1.0;
-	vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
-	vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
+	vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
+	vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
 	vg[kcount][3] = 0.0;
 	vg[kcount][4] = 0.0;
-	vg[kcount][5] = -vterm*unitky*l*unitkz*m;
+	vg[kcount][5] = -vterm*unitk[1]*l*unitk[2]*m;
 	kcount++;
       }
     }
   }
 
   // 1 = (k,0,m), 2 = (k,0,-m)
 
   for (k = 1; k <= kxmax; k++) {
     for (m = 1; m <= kzmax; m++) {
-      sqk = (unitkx*k) * (unitkx*k) + (unitkz*m) * (unitkz*m);
+      sqk = (unitk[0]*k) * (unitk[0]*k) + (unitk[2]*m) * (unitk[2]*m);
       if (sqk <= gsqmx) {
 	kxvecs[kcount] = k;
 	kyvecs[kcount] = 0;
 	kzvecs[kcount] = m;
 	ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	eg[kcount][0] =  2.0*unitkx*k*ug[kcount];
+	eg[kcount][0] =  2.0*unitk[0]*k*ug[kcount];
 	eg[kcount][1] =  0.0;
-	eg[kcount][2] =  2.0*unitkz*m*ug[kcount];
+	eg[kcount][2] =  2.0*unitk[2]*m*ug[kcount];
 	vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
-	vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
+	vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
 	vg[kcount][1] = 1.0;
-	vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
+	vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
 	vg[kcount][3] = 0.0;
-	vg[kcount][4] = vterm*unitkx*k*unitkz*m;
+	vg[kcount][4] = vterm*unitk[0]*k*unitk[2]*m;
 	vg[kcount][5] = 0.0;
 	kcount++;
 
 	kxvecs[kcount] = k;
 	kyvecs[kcount] = 0;
 	kzvecs[kcount] = -m;
 	ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	eg[kcount][0] =  2.0*unitkx*k*ug[kcount];
+	eg[kcount][0] =  2.0*unitk[0]*k*ug[kcount];
 	eg[kcount][1] =  0.0;
-	eg[kcount][2] = -2.0*unitkz*m*ug[kcount];
-	vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
+	eg[kcount][2] = -2.0*unitk[2]*m*ug[kcount];
+	vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
 	vg[kcount][1] = 1.0;
-	vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
+	vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
 	vg[kcount][3] = 0.0;
-	vg[kcount][4] = -vterm*unitkx*k*unitkz*m;
+	vg[kcount][4] = -vterm*unitk[0]*k*unitk[2]*m;
 	vg[kcount][5] = 0.0;
 	kcount++;
       }
     }
   }
 
   // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
 
   for (k = 1; k <= kxmax; k++) {
     for (l = 1; l <= kymax; l++) {
       for (m = 1; m <= kzmax; m++) {
-	sqk = (unitkx*k) * (unitkx*k) + (unitky*l) * (unitky*l) + 
-	  (unitkz*m) * (unitkz*m);
+	sqk = (unitk[0]*k) * (unitk[0]*k) + (unitk[1]*l) * (unitk[1]*l) + 
+	  (unitk[2]*m) * (unitk[2]*m);
 	if (sqk <= gsqmx) {
 	  kxvecs[kcount] = k;
 	  kyvecs[kcount] = l;
 	  kzvecs[kcount] = m;
 	  ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	  eg[kcount][0] = 2.0*unitkx*k*ug[kcount];
-	  eg[kcount][1] = 2.0*unitky*l*ug[kcount];
-	  eg[kcount][2] = 2.0*unitkz*m*ug[kcount];
+	  eg[kcount][0] = 2.0*unitk[0]*k*ug[kcount];
+	  eg[kcount][1] = 2.0*unitk[1]*l*ug[kcount];
+	  eg[kcount][2] = 2.0*unitk[2]*m*ug[kcount];
 	  vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
-	  vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
-	  vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
-	  vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
-	  vg[kcount][3] = vterm*unitkx*k*unitky*l;
-	  vg[kcount][4] = vterm*unitkx*k*unitkz*m;
-	  vg[kcount][5] = vterm*unitky*l*unitkz*m;
+	  vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
+	  vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
+	  vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
+	  vg[kcount][3] = vterm*unitk[0]*k*unitk[1]*l;
+	  vg[kcount][4] = vterm*unitk[0]*k*unitk[2]*m;
+	  vg[kcount][5] = vterm*unitk[1]*l*unitk[2]*m;
 	  kcount++;
 
 	  kxvecs[kcount] = k;
 	  kyvecs[kcount] = -l;
 	  kzvecs[kcount] = m;
 	  ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	  eg[kcount][0] = 2.0*unitkx*k*ug[kcount];
-	  eg[kcount][1] = -2.0*unitky*l*ug[kcount];
-	  eg[kcount][2] = 2.0*unitkz*m*ug[kcount];
-	  vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
-	  vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
-	  vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
-	  vg[kcount][3] = -vterm*unitkx*k*unitky*l;
-	  vg[kcount][4] = vterm*unitkx*k*unitkz*m;
-	  vg[kcount][5] = -vterm*unitky*l*unitkz*m;
+	  eg[kcount][0] = 2.0*unitk[0]*k*ug[kcount];
+	  eg[kcount][1] = -2.0*unitk[1]*l*ug[kcount];
+	  eg[kcount][2] = 2.0*unitk[2]*m*ug[kcount];
+	  vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
+	  vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
+	  vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
+	  vg[kcount][3] = -vterm*unitk[0]*k*unitk[1]*l;
+	  vg[kcount][4] = vterm*unitk[0]*k*unitk[2]*m;
+	  vg[kcount][5] = -vterm*unitk[1]*l*unitk[2]*m;
 	  kcount++;
 
 	  kxvecs[kcount] = k;
 	  kyvecs[kcount] = l;
 	  kzvecs[kcount] = -m;
 	  ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	  eg[kcount][0] = 2.0*unitkx*k*ug[kcount];
-	  eg[kcount][1] = 2.0*unitky*l*ug[kcount];
-	  eg[kcount][2] = -2.0*unitkz*m*ug[kcount];
-	  vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
-	  vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
-	  vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
-	  vg[kcount][3] = vterm*unitkx*k*unitky*l;
-	  vg[kcount][4] = -vterm*unitkx*k*unitkz*m;
-	  vg[kcount][5] = -vterm*unitky*l*unitkz*m;
+	  eg[kcount][0] = 2.0*unitk[0]*k*ug[kcount];
+	  eg[kcount][1] = 2.0*unitk[1]*l*ug[kcount];
+	  eg[kcount][2] = -2.0*unitk[2]*m*ug[kcount];
+	  vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
+	  vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
+	  vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
+	  vg[kcount][3] = vterm*unitk[0]*k*unitk[1]*l;
+	  vg[kcount][4] = -vterm*unitk[0]*k*unitk[2]*m;
+	  vg[kcount][5] = -vterm*unitk[1]*l*unitk[2]*m;
 	  kcount++;
 
 	  kxvecs[kcount] = k;
 	  kyvecs[kcount] = -l;
 	  kzvecs[kcount] = -m;
 	  ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
-	  eg[kcount][0] = 2.0*unitkx*k*ug[kcount];
-	  eg[kcount][1] = -2.0*unitky*l*ug[kcount];
-	  eg[kcount][2] = -2.0*unitkz*m*ug[kcount];
-	  vg[kcount][0] = 1.0 + vterm*(unitkx*k)*(unitkx*k);
-	  vg[kcount][1] = 1.0 + vterm*(unitky*l)*(unitky*l);
-	  vg[kcount][2] = 1.0 + vterm*(unitkz*m)*(unitkz*m);
-	  vg[kcount][3] = -vterm*unitkx*k*unitky*l;
-	  vg[kcount][4] = -vterm*unitkx*k*unitkz*m;
-	  vg[kcount][5] = vterm*unitky*l*unitkz*m;
+	  eg[kcount][0] = 2.0*unitk[0]*k*ug[kcount];
+	  eg[kcount][1] = -2.0*unitk[1]*l*ug[kcount];
+	  eg[kcount][2] = -2.0*unitk[2]*m*ug[kcount];
+	  vg[kcount][0] = 1.0 + vterm*(unitk[0]*k)*(unitk[0]*k);
+	  vg[kcount][1] = 1.0 + vterm*(unitk[1]*l)*(unitk[1]*l);
+	  vg[kcount][2] = 1.0 + vterm*(unitk[2]*m)*(unitk[2]*m);
+	  vg[kcount][3] = -vterm*unitk[0]*k*unitk[1]*l;
+	  vg[kcount][4] = -vterm*unitk[0]*k*unitk[2]*m;
+	  vg[kcount][5] = vterm*unitk[1]*l*unitk[2]*m;
 	  kcount++;;
 	}
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    allocate memory that depends on # of K-vectors 
 ------------------------------------------------------------------------- */
 
 void Ewald::allocate()
 {
   kxvecs = new int[kmax3d];
   kyvecs = new int[kmax3d];
   kzvecs = new int[kmax3d];
 
   ug = new double[kmax3d];
   memory->create(eg,kmax3d,3,"ewald:eg");
   memory->create(vg,kmax3d,6,"ewald:vg");
 
   sfacrl = new double[kmax3d];
   sfacim = new double[kmax3d];
   sfacrl_all = new double[kmax3d];
   sfacim_all = new double[kmax3d];
 }
 
 /* ----------------------------------------------------------------------
    deallocate memory that depends on # of K-vectors 
 ------------------------------------------------------------------------- */
 
 void Ewald::deallocate()
 {
   delete [] kxvecs;
   delete [] kyvecs;
   delete [] kzvecs;
   
   delete [] ug;
   memory->destroy(eg);
   memory->destroy(vg);
 
   delete [] sfacrl;
   delete [] sfacim;
   delete [] sfacrl_all;
   delete [] sfacim_all;
 }
 
 /* ----------------------------------------------------------------------
    Slab-geometry correction term to dampen inter-slab interactions between
    periodically repeating slabs.  Yields good approximation to 2-D Ewald if 
    adequate empty space is left between repeating slabs (J. Chem. Phys. 
    111, 3155).  Slabs defined here to be parallel to the xy plane. 
 ------------------------------------------------------------------------- */
 
 void Ewald::slabcorr()
 {
   // compute local contribution to global dipole moment
   
   double *q = atom->q;
   double **x = atom->x;
   int nlocal = atom->nlocal;
 
   double dipole = 0.0;
   for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2];
   
   // sum local contributions to get global dipole moment
 
   double dipole_all;
   MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
 
   // compute corrections
   
   const double e_slabcorr = 2.0*MY_PI*dipole_all*dipole_all/volume;
   const double qscale = force->qqrd2e * scale;
   
   if (eflag_global) energy += qscale * e_slabcorr;
   
   // per-atom energy
     
   if (eflag_atom) {
     double efact = 2.0*MY_PI*dipole_all/volume; 
     for (int i = 0; i < nlocal; i++) eatom[i] += qscale * q[i]*x[i][2]*efact;
   }
 
   // add on force corrections
 
   double ffact = -4.0*MY_PI*dipole_all/volume; 
   double **f = atom->f;
 
   for (int i = 0; i < nlocal; i++) f[i][2] += qscale * q[i]*ffact;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local arrays 
 ------------------------------------------------------------------------- */
 
 double Ewald::memory_usage()
 {
   double bytes = 3 * kmax3d * sizeof(int);
   bytes += (1 + 3 + 6) * kmax3d * sizeof(double);
   bytes += 4 * kmax3d * sizeof(double);
   bytes += nmax*3 * sizeof(double);
   bytes += 2 * (2*kmax+1)*3*nmax * sizeof(double);
   return bytes;
 }
diff --git a/src/REPLICA/verlet_split.cpp b/src/REPLICA/verlet_split.cpp
index d1f6eaef0..4272cc247 100644
--- a/src/REPLICA/verlet_split.cpp
+++ b/src/REPLICA/verlet_split.cpp
@@ -1,550 +1,551 @@
 /* -------------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Yuxing Peng and Chris Knight (U Chicago)
 ------------------------------------------------------------------------- */
 
 #include "string.h"
 #include "verlet_split.h"
 #include "universe.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "force.h"
 #include "pair.h"
 #include "bond.h"
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
 #include "kspace.h"
 #include "output.h"
 #include "update.h"
 #include "modify.h"
 #include "timer.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 VerletSplit::VerletSplit(LAMMPS *lmp, int narg, char **arg) :
   Verlet(lmp, narg, arg)
 {
   // error checks on partitions
 
   if (universe->nworlds != 2) 
     error->universe_all(FLERR,"Verlet/split requires 2 partitions");
   if (universe->procs_per_world[0] % universe->procs_per_world[1])
     error->universe_all(FLERR,"Verlet/split requires Rspace partition "
 			"size be multiple of Kspace partition size");
 
   // master = 1 for Rspace procs, 0 for Kspace procs
 
   if (universe->iworld == 0) master = 1;
   else master = 0;
 
   ratio = universe->procs_per_world[0] / universe->procs_per_world[1];
 
   // Kspace root proc broadcasts info about Kspace proc layout to Rspace procs
 
   int kspace_procgrid[3];
 
   if (universe->me == universe->root_proc[1]) {
     kspace_procgrid[0] = comm->procgrid[0];
     kspace_procgrid[1] = comm->procgrid[1];
     kspace_procgrid[2] = comm->procgrid[2];
   }
   MPI_Bcast(kspace_procgrid,3,MPI_INT,universe->root_proc[1],universe->uworld);
 
   int ***kspace_grid2proc;
   memory->create(kspace_grid2proc,kspace_procgrid[0],
 		 kspace_procgrid[1],kspace_procgrid[2],
 		 "verlet/split:kspace_grid2proc");
 
   if (universe->me == universe->root_proc[1]) {
     for (int i = 0; i < comm->procgrid[0]; i++)
       for (int j = 0; j < comm->procgrid[1]; j++)
 	for (int k = 0; k < comm->procgrid[2]; k++)
 	  kspace_grid2proc[i][j][k] = comm->grid2proc[i][j][k];
   }
   MPI_Bcast(&kspace_grid2proc[0][0][0],
 	    kspace_procgrid[0]*kspace_procgrid[1]*kspace_procgrid[2],MPI_INT,
 	    universe->root_proc[1],universe->uworld);
 
   // Rspace partition must be multiple of Kspace partition in each dim
   // so atoms of one Kspace proc coincide with atoms of several Rspace procs
 
   if (master) {
     int flag = 0;
     if (comm->procgrid[0] % kspace_procgrid[0]) flag = 1;
     if (comm->procgrid[1] % kspace_procgrid[1]) flag = 1;
     if (comm->procgrid[2] % kspace_procgrid[2]) flag = 1;
     if (flag)
       error->one(FLERR,
 		 "Verlet/split requires Rspace partition layout be "
 		 "multiple of Kspace partition layout in each dim");
   }
 
   // block = 1 Kspace proc with set of Rspace procs it overlays
   // me_block = 0 for Kspace proc
   // me_block = 1 to ratio for Rspace procs
   // block = MPI communicator for that set of procs
 
   int iblock,key;
 
   if (!master) {
     iblock = comm->me;
     key = 0;
   } else {
     int kpx = comm->myloc[0] / (comm->procgrid[0]/kspace_procgrid[0]);
     int kpy = comm->myloc[1] / (comm->procgrid[1]/kspace_procgrid[1]);
     int kpz = comm->myloc[2] / (comm->procgrid[2]/kspace_procgrid[2]);
     iblock = kspace_grid2proc[kpx][kpy][kpz];
     key = 1;
   }
 
   MPI_Comm_split(universe->uworld,iblock,key,&block);
   MPI_Comm_rank(block,&me_block);
 
   // output block groupings to universe screen/logfile
   // bmap is ordered by block and then by proc within block
 
   int *bmap = new int[universe->nprocs];
   for (int i = 0; i < universe->nprocs; i++) bmap[i] = -1;
   bmap[iblock*(ratio+1)+me_block] = universe->me;
 
   int *bmapall = new int[universe->nprocs];
   MPI_Allreduce(bmap,bmapall,universe->nprocs,MPI_INT,MPI_MAX,universe->uworld);
 
   if (universe->me == 0) {
     if (universe->uscreen) {
       fprintf(universe->uscreen,
 	      "Per-block Rspace/Kspace proc IDs (original proc IDs):\n");
       int m = 0;
       for (int i = 0; i < universe->nprocs/(ratio+1); i++) {
 	fprintf(universe->uscreen,"  block %d:",i);
 	int kspace_proc = bmapall[m];
 	for (int j = 1; j <= ratio; j++)
 	  fprintf(universe->uscreen," %d",bmapall[m+j]);
 	fprintf(universe->uscreen," %d",kspace_proc);
 	kspace_proc = bmapall[m];
 	for (int j = 1; j <= ratio; j++) {
 	  if (j == 1) fprintf(universe->uscreen," (");
 	  else fprintf(universe->uscreen," ");
 	  fprintf(universe->uscreen,"%d",
 		  universe->uni2orig[bmapall[m+j]]);
 	}
 	fprintf(universe->uscreen," %d)\n",universe->uni2orig[kspace_proc]);
 	m += ratio + 1;
       }
     }
     if (universe->ulogfile) {
       fprintf(universe->ulogfile,
 	      "Per-block Rspace/Kspace proc IDs (original proc IDs):\n");
       int m = 0;
       for (int i = 0; i < universe->nprocs/(ratio+1); i++) {
 	fprintf(universe->ulogfile,"  block %d:",i);
 	int kspace_proc = bmapall[m];
 	for (int j = 1; j <= ratio; j++)
 	  fprintf(universe->ulogfile," %d",bmapall[m+j]);
 
 	fprintf(universe->ulogfile," %d",kspace_proc);
 	kspace_proc = bmapall[m];
 	for (int j = 1; j <= ratio; j++) {
 	  if (j == 1) fprintf(universe->ulogfile," (");
 	  else fprintf(universe->ulogfile," ");
 	  fprintf(universe->ulogfile,"%d",
 		  universe->uni2orig[bmapall[m+j]]);
 	}
 	fprintf(universe->ulogfile," %d)\n",universe->uni2orig[kspace_proc]);
 	m += ratio + 1;
       }
     }
   }
 
   memory->destroy(kspace_grid2proc);
   delete [] bmap;
   delete [] bmapall;
 
   // size/disp = vectors for MPI gather/scatter within block
 
   qsize = new int[ratio+1];
   qdisp = new int[ratio+1];
   xsize = new int[ratio+1];
   xdisp = new int[ratio+1];
 
   // f_kspace = Rspace copy of Kspace forces
   // allocate dummy version for Kspace partition
 
   maxatom = 0;
   f_kspace = NULL;
   if (!master) memory->create(f_kspace,1,1,"verlet/split:f_kspace");
 }
 
 /* ---------------------------------------------------------------------- */
 
 VerletSplit::~VerletSplit()
 {
   delete [] qsize;
   delete [] qdisp;
   delete [] xsize;
   delete [] xdisp;
   memory->destroy(f_kspace);
   MPI_Comm_free(&block);
 }
 
 /* ----------------------------------------------------------------------
    initialization before run
 ------------------------------------------------------------------------- */
 
 void VerletSplit::init()
 {
   if (!force->kspace && comm->me == 0) 
     error->warning(FLERR,"No Kspace calculation with verlet/split");
 
   if (force->kspace_match("tip4p",0)) tip4p_flag = 1;
   else tip4p_flag = 0;
 
   Verlet::init();
 }
 
 /* ----------------------------------------------------------------------
    setup before run
    servant partition only sets up KSpace calculation
 ------------------------------------------------------------------------- */
 
 void VerletSplit::setup()
 {
   if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");
 
   if (!master) force->kspace->setup();
   else Verlet::setup();
 }
 
 /* ----------------------------------------------------------------------
    setup without output
    flag = 0 = just force calculation
    flag = 1 = reneighbor and force calculation
    servant partition only sets up KSpace calculation
 ------------------------------------------------------------------------- */
 
 void VerletSplit::setup_minimal(int flag)
 {
   if (!master) force->kspace->setup();
   else Verlet::setup_minimal(flag);
 }
 
 /* ----------------------------------------------------------------------
    run for N steps
    master partition does everything but Kspace
    servant partition does just Kspace
    communicate back and forth every step:
      atom coords from master -> servant
      kspace forces from servant -> master
      also box bounds from master -> servant if necessary
 ------------------------------------------------------------------------- */
 
 void VerletSplit::run(int n)
 {
   int nflag,ntimestep,sortflag;
 
   // sync both partitions before start timer
 
   MPI_Barrier(universe->uworld);
   timer->init();
   timer->barrier_start(TIME_LOOP);
 
   // setup initial Rspace <-> Kspace comm params
 
   rk_setup();
 
   // flags for timestepping iterations
 
   int n_post_integrate = modify->n_post_integrate;
   int n_pre_exchange = modify->n_pre_exchange;
   int n_pre_neighbor = modify->n_pre_neighbor;
   int n_pre_force = modify->n_pre_force;
   int n_post_force = modify->n_post_force;
   int n_end_of_step = modify->n_end_of_step;
 
   if (atom->sortfreq > 0) sortflag = 1;
   else sortflag = 0;
 
   for (int i = 0; i < n; i++) {
 
     ntimestep = ++update->ntimestep;
     ev_set(ntimestep);
 
     // initial time integration
  
     if (master) {
       modify->initial_integrate(vflag);
       if (n_post_integrate) modify->post_integrate();
     }
 
     // regular communication vs neighbor list rebuild
 
     if (master) nflag = neighbor->decide();
     MPI_Bcast(&nflag,1,MPI_INT,1,block);
 
     if (master) {
       if (nflag == 0) {
 	timer->stamp();
 	comm->forward_comm();
 	timer->stamp(TIME_COMM);
       } else {
 	if (n_pre_exchange) modify->pre_exchange();
 	if (triclinic) domain->x2lamda(atom->nlocal);
 	domain->pbc();
 	if (domain->box_change) {
 	  domain->reset_box();
 	  comm->setup();
 	  if (neighbor->style) neighbor->setup_bins();
 	}
 	timer->stamp();
 	comm->exchange();
 	if (sortflag && ntimestep >= atom->nextsort) atom->sort();
 	comm->borders();
 	if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
 	timer->stamp(TIME_COMM);
 	if (n_pre_neighbor) modify->pre_neighbor();
 	neighbor->build();
 	timer->stamp(TIME_NEIGHBOR);
       }
     }
     
     // if reneighboring occurred, re-setup Rspace <-> Kspace comm params
     // comm Rspace atom coords to Kspace procs
 
     if (nflag) rk_setup();
     r2k_comm();
   
     // force computations
 
     force_clear();
 
     if (master) {
       if (n_pre_force) modify->pre_force(vflag);
 
       timer->stamp();
       if (force->pair) {
         force->pair->compute(eflag,vflag);
         timer->stamp(TIME_PAIR);
       }
       
       if (atom->molecular) {
         if (force->bond) force->bond->compute(eflag,vflag);
         if (force->angle) force->angle->compute(eflag,vflag);
         if (force->dihedral) force->dihedral->compute(eflag,vflag);
         if (force->improper) force->improper->compute(eflag,vflag);
         timer->stamp(TIME_BOND);
       }
 
       if (force->newton) {
 	comm->reverse_comm();
 	timer->stamp(TIME_COMM);
       }
 
     } else {
       if (force->kspace) {
         timer->stamp();
         force->kspace->compute(eflag,vflag);
         timer->stamp(TIME_KSPACE);
       }
 
       // TIP4P PPPM puts forces on ghost atoms, so must reverse_comm()
 
       if (tip4p_flag && force->newton) {
 	comm->reverse_comm();
 	timer->stamp(TIME_COMM);
       }
     }
 
     // comm and sum Kspace forces back to Rspace procs
 
     k2r_comm();
 
     // force modifications, final time integration, diagnostics
     // all output
 
     if (master) {
       if (n_post_force) modify->post_force(vflag);
       modify->final_integrate();
       if (n_end_of_step) modify->end_of_step();
 
       if (ntimestep == output->next) {
 	timer->stamp();
 	output->write(ntimestep);
 	timer->stamp(TIME_OUTPUT);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    setup params for Rspace <-> Kspace communication
    called initially and after every reneighbor
    also communcicate atom charges from Rspace to KSpace since static
 ------------------------------------------------------------------------- */
 
 void VerletSplit::rk_setup()
 {
   // grow f_kspace array on master procs if necessary
 
   if (master) {
     if (atom->nlocal > maxatom) {
       memory->destroy(f_kspace);
       maxatom = atom->nmax;
       memory->create(f_kspace,maxatom,3,"verlet/split:f_kspace");
     }
   }
 
   // qsize = # of atoms owned by each master proc in block
 
   int n = 0;
   if (master) n = atom->nlocal;
   MPI_Gather(&n,1,MPI_INT,qsize,1,MPI_INT,0,block);
 
   // setup qdisp, xsize, xdisp based on qsize
   // only needed by Kspace proc
   // set Kspace nlocal to sum of Rspace nlocals
   // insure Kspace atom arrays are large enough
 
   if (!master) { 
     qsize[0] = qdisp[0] = xsize[0] = xdisp[0] = 0;
     for (int i = 1; i <= ratio; i++) {
       qdisp[i] = qdisp[i-1]+qsize[i-1];
       xsize[i] = 3*qsize[i];
       xdisp[i] = xdisp[i-1]+xsize[i-1];
     }
 
     atom->nlocal = qdisp[ratio] + qsize[ratio];
     while (atom->nmax <= atom->nlocal) atom->avec->grow(0);
     atom->nghost = 0;
   }
   
   // one-time gather of Rspace atom charges to Kspace proc
 
   MPI_Gatherv(atom->q,n,MPI_DOUBLE,atom->q,qsize,qdisp,MPI_DOUBLE,0,block);
 
   // for TIP4P also need to send atom type and tag
   // KSpace procs need to acquire ghost atoms and map all their atoms
   // map_clear() call is in lieu of comm->exchange() which performs map_clear
   // borders() call acquires ghost atoms and maps them
-  // NOTE: don't atom coords need to be communicated here before borders() ??
+  // NOTE: do atom coords need to be communicated here before borders() call?
   //   could do this by calling r2k_comm() here and not again from run()
+  //   except that forward_comm() in r2k_comm() is wrong
 
   if (tip4p_flag) {
     //r2k_comm();
     MPI_Gatherv(atom->type,n,MPI_INT,atom->type,qsize,qdisp,MPI_INT,0,block);
     MPI_Gatherv(atom->tag,n,MPI_INT,atom->tag,qsize,qdisp,MPI_INT,0,block);
     if (!master) {
       if (triclinic) domain->x2lamda(atom->nlocal);
       if (domain->box_change) comm->setup();
       timer->stamp();
       atom->map_clear();
       comm->borders();
       if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
       timer->stamp(TIME_COMM);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    communicate Rspace atom coords to Kspace
    also eflag,vflag and box bounds if needed
 ------------------------------------------------------------------------- */
 
 void VerletSplit::r2k_comm()
 {
   MPI_Status status;
 
   int n = 0;
   if (master) n = atom->nlocal;
   MPI_Gatherv(atom->x[0],n*3,MPI_DOUBLE,atom->x[0],xsize,xdisp,
 	      MPI_DOUBLE,0,block);
 
   // send eflag,vflag from Rspace to Kspace
 
   if (me_block == 1) {
     int flags[2];
     flags[0] = eflag; flags[1] = vflag;
     MPI_Send(flags,2,MPI_INT,0,0,block);
   } else if (!master) {
     int flags[2];
     MPI_Recv(flags,2,MPI_DOUBLE,1,0,block,&status);
     eflag = flags[0]; vflag = flags[1];
   }
 
   // send box bounds from Rspace to Kspace if simulation box is dynamic
 
   if (domain->box_change) {
     if (me_block == 1) {
       MPI_Send(domain->boxlo,3,MPI_DOUBLE,0,0,block);
       MPI_Send(domain->boxhi,3,MPI_DOUBLE,0,0,block);
     } else if (!master) {
       MPI_Recv(domain->boxlo,3,MPI_DOUBLE,1,0,block,&status);
       MPI_Recv(domain->boxhi,3,MPI_DOUBLE,1,0,block,&status);
       domain->set_global_box();
       domain->set_local_box();
       force->kspace->setup();
     }
   }
 
   // for TIP4P, Kspace partition needs to update its ghost atoms
 
   if (tip4p_flag && !master) {
     timer->stamp();
     comm->forward_comm();
     timer->stamp(TIME_COMM);
   }
 }
 
 /* ----------------------------------------------------------------------
    communicate and sum Kspace atom forces back to Rspace
 ------------------------------------------------------------------------- */
 
 void VerletSplit::k2r_comm()
 {
   if (eflag) MPI_Bcast(&force->kspace->energy,1,MPI_DOUBLE,0,block);
   if (vflag) MPI_Bcast(force->kspace->virial,6,MPI_DOUBLE,0,block);
 
   int n = 0;
   if (master) n = atom->nlocal;
   MPI_Scatterv(atom->f[0],xsize,xdisp,MPI_DOUBLE,
 	       f_kspace[0],n*3,MPI_DOUBLE,0,block);
   
   if (master) {
     double **f = atom->f;
     int nlocal = atom->nlocal;
     for (int i = 0; i < nlocal; i++) {
       f[i][0] += f_kspace[i][0];
       f[i][1] += f_kspace[i][1];
       f[i][2] += f_kspace[i][2];
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    memory usage of Kspace force array on master procs
 ------------------------------------------------------------------------- */
 
 bigint VerletSplit::memory_usage()
 {
   bigint bytes = maxatom*3 * sizeof(double);
   return bytes;
 }