diff --git a/src/DIPOLE/pair_lj_long_dipole_long.cpp b/src/DIPOLE/pair_lj_long_dipole_long.cpp
index 383e3814c..b476cfcee 100644
--- a/src/DIPOLE/pair_lj_long_dipole_long.cpp
+++ b/src/DIPOLE/pair_lj_long_dipole_long.cpp
@@ -1,683 +1,682 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Pieter J. in 't Veld and Stan Moore (Sandia)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "math_const.h"
 #include "math_vector.h"
 #include "pair_lj_long_dipole_long.h"
 #include "atom.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "force.h"
 #include "kspace.h"
 #include "update.h"
 #include "integrate.h"
 #include "respa.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 #define EWALD_F   1.12837917
 #define EWALD_P   0.3275911
 #define A1        0.254829592
 #define A2       -0.284496736
 #define A3        1.421413741
 #define A4       -1.453152027
 #define A5        1.061405429
 
 // ----------------------------------------------------------------------
 
 PairLJLongDipoleLong::PairLJLongDipoleLong(LAMMPS *lmp) : Pair(lmp)
 {
   // this style always flags dispersion, Ewald and dipole contributions
   dispersionflag = 1;
   ewaldflag = 1;
   dipoleflag = 1;
 
   // rRESPA and single() are both switched off for this style
   respa_enable = 0;
   single_enable = 0;
 }
 
 // ----------------------------------------------------------------------
 // global settings
 // ----------------------------------------------------------------------
 
 void PairLJLongDipoleLong::options(char **arg, int order)
 {
   const char *option[] = {"long", "cut", "off", NULL};
   int i;
 
   if (!*arg) error->all(FLERR,"Illegal pair_style lj/long/dipole/long command");
   for (i=0; option[i]&&strcmp(arg[0], option[i]); ++i);
   switch (i) {
     default: error->all(FLERR,"Illegal pair_style lj/long/dipole/long command");
     case 0: ewald_order |= 1<<order; break;		// set kspace r^-order
     case 2: ewald_off |= 1<<order;			// turn r^-order off
     case 1: break;
   }
 }
 
 void PairLJLongDipoleLong::settings(int narg, char **arg)
 {
   if (narg != 3 && narg != 4) error->all(FLERR,"Illegal pair_style command");
 
   ewald_off = 0;
   ewald_order = 0;
   options(arg, 6);
   options(++arg, 3);
   options(arg, 1);
   if (!comm->me && ewald_order&(1<<6))
     error->warning(FLERR,"Geometric mixing assumed for 1/r^6 coefficients");
   if (!comm->me && ewald_order==((1<<3)|(1<<6)))
     error->warning(FLERR,
                    "Using largest cut-off for lj/long/dipole/long long long");
   if (!*(++arg))
     error->all(FLERR,"Cut-offs missing in pair_style lj/long/dipole/long");
   if (!((ewald_order^ewald_off)&(1<<3)))
     error->all(FLERR,
                "Coulombic cut not supported in pair_style lj/long/dipole/long");
   cut_lj_global = force->numeric(FLERR,*(arg++));
   if (narg == 4 && (ewald_order==74))
     error->all(FLERR,"Only one cut-off allowed when requesting all long");
   if (narg == 4) cut_coul = force->numeric(FLERR,*(arg++));
   else cut_coul = cut_lj_global;
 
   if (allocated) {					// reset explicit cuts
     int i,j;
     for (i = 1; i <= atom->ntypes; i++)
       for (j = i+1; j <= atom->ntypes; j++)
 	if (setflag[i][j]) cut_lj[i][j] = cut_lj_global;
   }
 }
 
 // ----------------------------------------------------------------------
 // free all arrays
 // ----------------------------------------------------------------------
 
 PairLJLongDipoleLong::~PairLJLongDipoleLong()
 {
   // the per-type tables only exist once allocate() has run
   if (!allocated) return;
 
   memory->destroy(setflag);
   memory->destroy(cutsq);
 
   // values as read from pair_coeff
   memory->destroy(cut_lj_read);
   memory->destroy(epsilon_read);
   memory->destroy(sigma_read);
 
   // values derived in init_one()
   memory->destroy(cut_lj);
   memory->destroy(cut_ljsq);
   memory->destroy(epsilon);
   memory->destroy(sigma);
   memory->destroy(lj1);
   memory->destroy(lj2);
   memory->destroy(lj3);
   memory->destroy(lj4);
   memory->destroy(offset);
 
   // table cleanup is disabled: //if (ftable) free_tables();
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::allocate()
 {
   // all tables are (ntypes+1) x (ntypes+1) so they can be indexed by
   // 1-based atom type
   const int ntypes = atom->ntypes;
   const int dim = ntypes+1;
   allocated = 1;
 
   memory->create(setflag,dim,dim,"pair:setflag");
   // only the upper triangle (j >= i) is cleared here
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       setflag[itype][jtype] = 0;
     }
   }
 
   memory->create(cutsq,dim,dim,"pair:cutsq");
 
   memory->create(cut_lj_read,dim,dim,"pair:cut_lj_read");
   memory->create(cut_lj,dim,dim,"pair:cut_lj");
   memory->create(cut_ljsq,dim,dim,"pair:cut_ljsq");
   memory->create(epsilon_read,dim,dim,"pair:epsilon_read");
   memory->create(epsilon,dim,dim,"pair:epsilon");
   memory->create(sigma_read,dim,dim,"pair:sigma_read");
   memory->create(sigma,dim,dim,"pair:sigma");
   memory->create(lj1,dim,dim,"pair:lj1");
   memory->create(lj2,dim,dim,"pair:lj2");
   memory->create(lj3,dim,dim,"pair:lj3");
   memory->create(lj4,dim,dim,"pair:lj4");
   memory->create(offset,dim,dim,"pair:offset");
 }
 
 /* ----------------------------------------------------------------------
    extract protected data from object
 ------------------------------------------------------------------------- */
 
 void *PairLJLongDipoleLong::extract(const char *id, int &dim)
 {
   // Look up an internal quantity by name.  dim is set to 2 for the
   // per-type-pair tables and to 0 for everything else, including an
   // unknown id, for which NULL is returned.
   dim = 2;
   if (strcmp(id,"B") == 0) return lj4;
   if (strcmp(id,"sigma") == 0) return sigma;
   if (strcmp(id,"epsilon") == 0) return epsilon;
 
   dim = 0;
   if (strcmp(id,"ewald_order") == 0) return &ewald_order;
   if (strcmp(id,"ewald_cut") == 0) return &cut_coul;
   if (strcmp(id,"ewald_mix") == 0) return &mix_flag;
   if (strcmp(id,"cut_coul") == 0) return &cut_coul;
   if (strcmp(id,"cut_vdwl") == 0) return &cut_lj_global;
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::coeff(int narg, char **arg)
 {
   if (narg < 4 || narg > 5) 
     error->all(FLERR,"Incorrect args for pair coefficients");
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   double epsilon_one = force->numeric(FLERR,arg[2]);
   double sigma_one = force->numeric(FLERR,arg[3]);
 
   double cut_lj_one = cut_lj_global;
   if (narg == 5) cut_lj_one = force->numeric(FLERR,arg[4]);
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       epsilon_read[i][j] = epsilon_one;
       sigma_read[i][j] = sigma_one;
       cut_lj_read[i][j] = cut_lj_one;
       setflag[i][j] = 1;
       count++;
     }
   }
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::init_style()
 {
   // the only KSpace style accepted for the long-range r^-3 and r^-6 terms
   const char *required_kspace = "ewald/disp";
   const int order1_long = ewald_order&(1<<1);
   const int order3_long = ewald_order&(1<<3);
   const int order6_long = ewald_order&(1<<6);
 
   if (strcmp(update->unit_style,"electron") == 0)
     error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
 
   // per-atom attributes needed by the requested long-range terms
 
   if (order1_long && !atom->q_flag)
     error->all(FLERR,
 	"Invoking coulombic in pair style lj/long/dipole/long requires atom attribute q");
   if (order3_long && (!atom->mu || !atom->torque))
     error->all(FLERR,"Pair lj/long/dipole/long requires atom attributes mu, torque");
 
   neighbor->request(this);
 
   cut_coulsq = cut_coul * cut_coul;
 
   // any long-range r^-3 or r^-6 term needs the matching KSpace solver
 
   if (order3_long || order6_long) {
     if (force->kspace == NULL ||
         strcmp(force->kspace_style,required_kspace) != 0)
       error->all(FLERR,"Pair style is incompatible with KSpace style");
   }
 
   // pick up g_ewald from whatever solver is present
 
   if (force->kspace) g_ewald = force->kspace->g_ewald;
 }
 
 /* ----------------------------------------------------------------------
    neighbor callback to inform pair style of neighbor list to use
    regular or rRESPA
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::init_list(int id, NeighList *ptr)
 {
   // remember the list under the slot the neighbor code assigned to it
   switch (id) {
     case 0: list = ptr; break;
     case 1: listinner = ptr; break;
     case 2: listmiddle = ptr; break;
     case 3: listouter = ptr; break;
     default: break;
   }
 
   // any id other than 0 is rejected
   if (id != 0)
     error->all(FLERR,"Pair style lj/long/dipole/long does not currently support respa");
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairLJLongDipoleLong::init_one(int i, int j)
 {
   if ((ewald_order&(1<<6))||(setflag[i][j] == 0)) {
     epsilon[i][j] = mix_energy(epsilon_read[i][i],epsilon_read[j][j],
 			       sigma_read[i][i],sigma_read[j][j]);
     sigma[i][j] = mix_distance(sigma_read[i][i],sigma_read[j][j]);
     if (ewald_order&(1<<6))
       cut_lj[i][j] = cut_lj_global;
     else
       cut_lj[i][j] = mix_distance(cut_lj_read[i][i],cut_lj_read[j][j]);
   }
   else {
     sigma[i][j] = sigma_read[i][j];
     epsilon[i][j] = epsilon_read[i][j];
     cut_lj[i][j] = cut_lj_read[i][j];
   }
 
   double cut = MAX(cut_lj[i][j], cut_coul);
   cutsq[i][j] = cut*cut;
   cut_ljsq[i][j] = cut_lj[i][j] * cut_lj[i][j];
 
   lj1[i][j] = 48.0 * epsilon[i][j] * pow(sigma[i][j],12.0);
   lj2[i][j] = 24.0 * epsilon[i][j] * pow(sigma[i][j],6.0);
   lj3[i][j] = 4.0 * epsilon[i][j] * pow(sigma[i][j],12.0);
   lj4[i][j] = 4.0 * epsilon[i][j] * pow(sigma[i][j],6.0);
 
   // check interior rRESPA cutoff
 
   //if (cut_respa && MIN(cut_lj[i][j],cut_coul) < cut_respa[3])
     //error->all(FLERR,"Pair cutoff < Respa interior cutoff");
  
   if (offset_flag) {
     double ratio = sigma[i][j] / cut_lj[i][j];
     offset[i][j] = 4.0 * epsilon[i][j] * (pow(ratio,12.0) - pow(ratio,6.0));
   } else offset[i][j] = 0.0;
 
   cutsq[j][i] = cutsq[i][j];
   cut_ljsq[j][i] = cut_ljsq[i][j];
   lj1[j][i] = lj1[i][j];
   lj2[j][i] = lj2[i][j];
   lj3[j][i] = lj3[i][j];
   lj4[j][i] = lj4[i][j];
   offset[j][i] = offset[i][j];
 
   return cut;
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::write_restart(FILE *fp)
 {
   // global settings first, then one record per type pair with j >= i
   write_restart_settings(fp);
 
   const int ntypes = atom->ntypes;
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       fwrite(&setflag[itype][jtype],sizeof(int),1,fp);
       // coefficients are stored only for pairs that were explicitly set
       if (!setflag[itype][jtype]) continue;
       fwrite(&epsilon_read[itype][jtype],sizeof(double),1,fp);
       fwrite(&sigma_read[itype][jtype],sizeof(double),1,fp);
       fwrite(&cut_lj_read[itype][jtype],sizeof(double),1,fp);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::read_restart(FILE *fp)
 {
   // mirrors write_restart(): settings, then per-pair records; only rank 0
   // touches the file and every value is broadcast to the other ranks
   read_restart_settings(fp);
 
   allocate();
 
   const bool reader = (comm->me == 0);
   const int ntypes = atom->ntypes;
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       if (reader) fread(&setflag[itype][jtype],sizeof(int),1,fp);
       MPI_Bcast(&setflag[itype][jtype],1,MPI_INT,0,world);
       if (!setflag[itype][jtype]) continue;
 
       if (reader) {
         fread(&epsilon_read[itype][jtype],sizeof(double),1,fp);
         fread(&sigma_read[itype][jtype],sizeof(double),1,fp);
         fread(&cut_lj_read[itype][jtype],sizeof(double),1,fp);
       }
       MPI_Bcast(&epsilon_read[itype][jtype],1,MPI_DOUBLE,0,world);
       MPI_Bcast(&sigma_read[itype][jtype],1,MPI_DOUBLE,0,world);
       MPI_Bcast(&cut_lj_read[itype][jtype],1,MPI_DOUBLE,0,world);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::write_restart_settings(FILE *fp)
 {
   // record layout (must match read_restart_settings):
   //   two doubles: LJ cutoff, coulomb cutoff
   //   three ints : offset flag, mixing flag, ewald order mask
 
   // cutoffs
   fwrite(&cut_lj_global,sizeof(double),1,fp);
   fwrite(&cut_coul,sizeof(double),1,fp);
 
   // flags
   fwrite(&offset_flag,sizeof(int),1,fp);
   fwrite(&mix_flag,sizeof(int),1,fp);
   fwrite(&ewald_order,sizeof(int),1,fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::read_restart_settings(FILE *fp)
 {
   // rank 0 reads the record written by write_restart_settings() ...
   const bool reader = (comm->me == 0);
   if (reader) {
     fread(&cut_lj_global,sizeof(double),1,fp);
     fread(&cut_coul,sizeof(double),1,fp);
     fread(&offset_flag,sizeof(int),1,fp);
     fread(&mix_flag,sizeof(int),1,fp);
     fread(&ewald_order,sizeof(int),1,fp);
   }
 
   // ... and every field is then broadcast from rank 0
   MPI_Bcast(&cut_lj_global,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&cut_coul,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
   MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
   MPI_Bcast(&ewald_order,1,MPI_INT,0,world);
 }
 
 /* ----------------------------------------------------------------------
    compute pair interactions
 ------------------------------------------------------------------------- */
 
 void PairLJLongDipoleLong::compute(int eflag, int vflag)
 {
   // Walks the neighbor list `list` and, for every pair inside the cutoff,
   // accumulates the real-space dipole term (when bit 3 of ewald_order is
   // set) and the Lennard-Jones term (long-range form when bit 6 is set,
   // plain cut form otherwise) into atom->f and atom->torque.
   //   eflag, vflag : passed to ev_setup(); eflag also gates the energy
   //                  evaluations below
   double evdwl,ecoul,fpair;
   evdwl = ecoul = 0.0;
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
   
   // flat base pointers: x, f and torque are indexed with stride 3 below,
   // mu with stride 4 (i<<2)
   double **x = atom->x, *x0 = x[0];
   double **mu = atom->mu, *mu0 = mu[0], *imu, *jmu;
   double **tq = atom->torque, *tq0 = tq[0], *tqi;
   double **f = atom->f, *f0 = f[0], *fi = f0, fx, fy, fz;
   double *q = atom->q, qi = 0, qj;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   double *special_coul = force->special_coul;
   double *special_lj = force->special_lj;
   int newton_pair = force->newton_pair;
   double qqrd2e = force->qqrd2e;
 
   int i, j;
-  int order1 = ewald_order&(1<<1), order3 = ewald_order&(1<<3),
-      order6 = ewald_order&(1<<6);
+  int order3 = ewald_order&(1<<3), order6 = ewald_order&(1<<6);
   int *ineigh, *ineighn, *jneigh, *jneighn, typei, typej, ni;
   double *cutsqi, *cut_ljsqi, *lj1i, *lj2i, *lj3i, *lj4i, *offseti;
   double rsq, r2inv, force_coul, force_lj;
   double g2 = g_ewald*g_ewald, g6 = g2*g2*g2, g8 = g6*g2;
   double B0, B1, B2, B3, G0, G1, G2, mudi, mudj, muij;
   vector force_d = VECTOR_NULL, ti = VECTOR_NULL, tj = VECTOR_NULL;
   vector mui, muj, xi, d;
   
   // prefactors of the exp(-x*x) terms in the B1, B2, B3 recursion below
   double C1 = 2.0 * g_ewald / MY_PIS;
   double C2 = 2.0 * g2 * C1;
   double C3 = 2.0 * g2 * C2;
 
   ineighn = (ineigh = list->ilist)+list->inum;
 
   for (; ineigh<ineighn; ++ineigh) {			// loop over all neighs
     i = *ineigh; fi = f0+3*i; tqi = tq0+3*i;
     qi = q[i];				// initialize constants
     offseti = offset[typei = type[i]];
     lj1i = lj1[typei]; lj2i = lj2[typei]; lj3i = lj3[typei]; lj4i = lj4[typei];
     cutsqi = cutsq[typei]; cut_ljsqi = cut_ljsq[typei];
     memcpy(xi, x0+(i+(i<<1)), sizeof(vector));
     memcpy(mui, imu = mu0+(i<<2), sizeof(vector));
     
     jneighn = (jneigh = list->firstneigh[i])+list->numneigh[i];
 
     for (; jneigh<jneighn; ++jneigh) {			// loop over neighbors
       j = *jneigh;
       ni = sbmask(j);					// special index
       j &= NEIGHMASK;
       
       { register double *xj = x0+(j+(j<<1));
 	d[0] = xi[0] - xj[0];				// pair vector
 	d[1] = xi[1] - xj[1];
 	d[2] = xi[2] - xj[2]; }
 
       if ((rsq = vec_dot(d, d)) >= cutsqi[typej = type[j]]) continue;
       r2inv = 1.0/rsq;
 
       if (order3 && (rsq < cut_coulsq)) {		// dipole
 	memcpy(muj, jmu = mu0+(j<<2), sizeof(vector));
 	{						// series real space
 	  register double r = sqrt(rsq);
 	  register double x = g_ewald*r;
 	  register double f = exp(-x*x)*qqrd2e;
 
 	  B0 = 1.0/(1.0+EWALD_P*x);			// eqn 2.8
 	  B0 *= ((((A5*B0+A4)*B0+A3)*B0+A2)*B0+A1)*f/r;
 	  B1 = (B0 + C1 * f) * r2inv;
 	  B2 = (3.0*B1 + C2 * f) * r2inv;
 	  B3 = (5.0*B2 + C3 * f) * r2inv;
 
 	  mudi = mui[0]*d[0]+mui[1]*d[1]+mui[2]*d[2];
 	  mudj = muj[0]*d[0]+muj[1]*d[1]+muj[2]*d[2];
 	  muij = mui[0]*muj[0]+mui[1]*muj[1]+mui[2]*muj[2];
 	  G0 = qi*(qj = q[j]);				// eqn 2.10
 	  G1 = qi*mudj-qj*mudi+muij;
 	  G2 = -mudi*mudj;
 	  force_coul = G0*B1+G1*B2+G2*B3;
 	  
 	  // from here on mudi and mudj carry a factor of B2
 	  mudi *= B2; mudj *= B2;			// torque contribs
 	  ti[0] = mudj*d[0]+(qj*d[0]-muj[0])*B1;
 	  ti[1] = mudj*d[1]+(qj*d[1]-muj[1])*B1;
 	  ti[2] = mudj*d[2]+(qj*d[2]-muj[2])*B1;
 
 	  // tj is only needed when atom j is updated by this processor
 	  if (newton_pair || j < nlocal) {
 	    tj[0] = mudi*d[0]-(qi*d[0]+mui[0])*B1;
 	    tj[1] = mudi*d[1]-(qi*d[1]+mui[1])*B1;
 	    tj[2] = mudi*d[2]-(qi*d[2]+mui[2])*B1;
 	  }
 
 	  if (eflag) ecoul = G0*B0+G1*B1+G2*B2;
 	  if (ni > 0) {					// adj part, eqn 2.13
 	    // f is reused here: it now holds qqrd2e*(1-special_coul[ni])/r
 	    force_coul -= (f = qqrd2e*(1.0-special_coul[ni])/r)*(
 	       	(3.0*G1+15.0*G2*r2inv)*r2inv+G0)*r2inv;
 	    if (eflag)
 	      ecoul -= f*((G1+3.0*G2*r2inv)*r2inv+G0);
 	    B1 -= f*r2inv;
 	  }
 	  // B0 and B3 are reused as scratch coefficients for force_d
 	  B0 = mudj+qj*B1; B3 = -qi*B1+mudi;		// position independent
       // dividing by B2 undoes the scaling applied to mudi/mudj above
       if (ni > 0) B0 -= f*3.0*mudj*r2inv*r2inv/B2;
       if (ni > 0) B3 -= f*3.0*mudi*r2inv*r2inv/B2;
 	  force_d[0] = B0*mui[0]+B3*muj[0];		// force contribs
 	  force_d[1] = B0*mui[1]+B3*muj[1];
 	  force_d[2] = B0*mui[2]+B3*muj[2];
       // special-pair correction of the torque vectors
       if (ni > 0) {
 	    ti[0] -= f*(3.0*mudj*r2inv*r2inv*d[0]/B2+(qj*r2inv*d[0]-muj[0]*r2inv));
 	    ti[1] -= f*(3.0*mudj*r2inv*r2inv*d[1]/B2+(qj*r2inv*d[1]-muj[1]*r2inv));
 	    ti[2] -= f*(3.0*mudj*r2inv*r2inv*d[2]/B2+(qj*r2inv*d[2]-muj[2]*r2inv));
 	    if (newton_pair || j < nlocal) {
 	      tj[0] -= f*(3.0*mudi*r2inv*r2inv*d[0]/B2-(qi*r2inv*d[0]+mui[0]*r2inv));
 	      tj[1] -= f*(3.0*mudi*r2inv*r2inv*d[1]/B2-(qi*r2inv*d[1]+mui[1]*r2inv));
 	      tj[2] -= f*(3.0*mudi*r2inv*r2inv*d[2]/B2-(qi*r2inv*d[2]+mui[2]*r2inv));
 	    }
       }
 	}						// table real space
       } else {
 	// NOTE(review): only force_coul, ecoul and force_d are cleared here;
 	// ti, tj and muj keep whatever an earlier pair left in them, yet the
 	// torque updates below still use them -- confirm this is intended
 	force_coul = ecoul = 0.0;
 	memset(force_d, 0, 3*sizeof(double));
       }
 
       if (rsq < cut_ljsqi[typej]) {			// lj
        	if (order6) {					// long-range lj
 	  register double rn = r2inv*r2inv*r2inv;
 	  register double x2 = g2*rsq, a2 = 1.0/x2;
 	  x2 = a2*exp(-x2)*lj4i[typej];
 	  if (ni < 0) {
 	    force_lj =
 	      (rn*=rn)*lj1i[typej]-g8*(((6.0*a2+6.0)*a2+3.0)*a2+1.0)*x2*rsq;
 	    if (eflag) evdwl = rn*lj3i[typej]-g6*((a2+1.0)*a2+0.5)*x2;
 	  }
 	  else {					// special case
 	    // t is formed from r^-6 before rn is squared to r^-12
 	    register double f = special_lj[ni], t = rn*(1.0-f);
 	    force_lj = f*(rn *= rn)*lj1i[typej]-
 	      g8*(((6.0*a2+6.0)*a2+3.0)*a2+1.0)*x2*rsq+t*lj2i[typej];
 	    if (eflag) evdwl = 
 		f*rn*lj3i[typej]-g6*((a2+1.0)*a2+0.5)*x2+t*lj4i[typej];
 	  }
 	}
 	else {						// cut lj
 	  register double rn = r2inv*r2inv*r2inv;
 	  if (ni < 0) {
 	    force_lj = rn*(rn*lj1i[typej]-lj2i[typej]);
 	    if (eflag) evdwl = rn*(rn*lj3i[typej]-lj4i[typej])-offseti[typej];
 	  }
 	  else {					// special case
 	    register double f = special_lj[ni];
 	    force_lj = f*rn*(rn*lj1i[typej]-lj2i[typej]);
 	    if (eflag) evdwl = f*(
 		rn*(rn*lj3i[typej]-lj4i[typej])-offseti[typej]);
 	  }
 	}
 	force_lj *= r2inv;
       }
       else force_lj = evdwl = 0.0;
 
       // radial part scales d; force_d is the position-independent dipole part
       fpair = force_coul+force_lj;			// force
       if (newton_pair || j < nlocal) {
 	register double *fj = f0+(j+(j<<1));
 	fi[0] += fx = d[0]*fpair+force_d[0]; fj[0] -= fx;
 	fi[1] += fy = d[1]*fpair+force_d[1]; fj[1] -= fy;
 	fi[2] += fz = d[2]*fpair+force_d[2]; fj[2] -= fz;
 	tqi[0] += mui[1]*ti[2]-mui[2]*ti[1];		// torque
 	tqi[1] += mui[2]*ti[0]-mui[0]*ti[2];
 	tqi[2] += mui[0]*ti[1]-mui[1]*ti[0];
 	register double *tqj = tq0+(j+(j<<1));
 	tqj[0] += muj[1]*tj[2]-muj[2]*tj[1];
 	tqj[1] += muj[2]*tj[0]-muj[0]*tj[2];
 	tqj[2] += muj[0]*tj[1]-muj[1]*tj[0];
       }
       else {
 	// only atom i is updated in this branch
 	fi[0] += fx = d[0]*fpair+force_d[0];		// force
 	fi[1] += fy = d[1]*fpair+force_d[1];
 	fi[2] += fz = d[2]*fpair+force_d[2];
 	tqi[0] += mui[1]*ti[2]-mui[2]*ti[1];		// torque
 	tqi[1] += mui[2]*ti[0]-mui[0]*ti[2];
 	tqi[2] += mui[0]*ti[1]-mui[1]*ti[0];
       }
 
       if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
 			   evdwl,ecoul,fx,fy,fz,d[0],d[1],d[2]);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ---------------------------------------------------------------------- */
 
 /*
 double PairLJLongDipoleLong::single(int i, int j, int itype, int jtype,
 			    double rsq, double factor_coul, double factor_lj,
 			    double &fforce)
 {
   double r6inv, force_coul, force_lj;
   double g2 = g_ewald*g_ewald, g6 = g2*g2*g2, g8 = g6*g2, *q = atom->q;
 
   double eng = 0.0;
   double r2inv = 1.0/rsq;
 
   if ((ewald_order&(1<<3)) && (rsq < cut_coulsq)) {	// coulombic
     double *mui = atom->mu[i], *muj = atom->mu[j];
     double *xi = atom->x[i], *xj = atom->x[j];
     double qi = q[i], qj = q[j];
     double G0, G1, G2, B0, B1, B2, B3, mudi, mudj, muij;
     vector d = {xi[0]-xj[0], xi[1]-xj[1], xi[2]-xj[2]};
     {							// series real space
       register double r = sqrt(rsq);
       register double x = g_ewald*r;
       register double f = exp(-x*x)*qqrd2e;
 
       B0 = 1.0/(1.0+EWALD_P*x);			// eqn 2.8
       B0 *= ((((A5*B0+A4)*B0+A3)*B0+A2)*B0+A1)*f/r;
       B1 = (B0 + C1 * f) * r2inv;
       B2 = (3.0*B1 + C2 * f) * r2inv;
       B3 = (5.0*B2 + C3 * f) * r2inv;
 
       mudi = mui[0]*d[0]+mui[1]*d[1]+mui[2]*d[2];
       mudj = muj[0]*d[0]+muj[1]*d[1]+muj[2]*d[2];
       muij = mui[0]*muj[0]+mui[1]*muj[1]+mui[2]*muj[2];
       G0 = qi*(qj = q[j]);				// eqn 2.10
       G1 = qi*mudj-qj*mudi+muij;
       G2 = -mudi*mudj;
       force_coul = G0*B1+G1*B2+G2*B3;
 	  
       eng += G0*B0+G1*B1+G2*B2;	
       if (factor_coul < 1.0) {			      	// adj part, eqn 2.13
 	force_coul -= (f = force->qqrd2e*(1.0-factor_coul)/r)*(
 	    (3.0*G1+6.0*muij+15.0*G2*r2inv)*r2inv+G0);
 	eng -= f*((G1+3.0*G2*r2inv)*r2inv+G0);
 	B1 -= f*r2inv;
       }
       B0 = mudj*B2-qj*B1; B3 = qi*B1+mudi*B2;		// position independent
       //force_d[0] = B0*mui[0]+B3*muj[0];		// force contributions
       //force_d[1] = B0*mui[1]+B3*muj[1];
       //force_d[2] = B0*mui[2]+B3*muj[2];
     }							// table real space
   }
   else force_coul = 0.0;
 
   if (rsq < cut_ljsq[itype][jtype]) {			// lennard-jones
     r6inv = r2inv*r2inv*r2inv;
     if (ewald_order&0x40) {				// long-range
       register double x2 = g2*rsq, a2 = 1.0/x2, t = r6inv*(1.0-factor_lj);
       x2 = a2*exp(-x2)*lj4[itype][jtype];
       force_lj = factor_lj*(r6inv *= r6inv)*lj1[itype][jtype]-
        	g8*(((6.0*a2+6.0)*a2+3.0)*a2+a2)*x2*rsq+t*lj2[itype][jtype];
       eng += factor_lj*r6inv*lj3[itype][jtype]-
 	g6*((a2+1.0)*a2+0.5)*x2+t*lj4[itype][jtype];
     }
     else {						// cut
       force_lj = factor_lj*r6inv*(lj1[itype][jtype]*r6inv-lj2[itype][jtype]);
       eng += factor_lj*(r6inv*(r6inv*lj3[itype][jtype]-
 	    lj4[itype][jtype])-offset[itype][jtype]);
     }
   } 
   else force_lj = 0.0;
 
   fforce = (force_coul+force_lj)*r2inv;
   return eng;
 }
 */
 
diff --git a/src/KSPACE/fix_tune_kspace.cpp b/src/KSPACE/fix_tune_kspace.cpp
index 44b2bfc55..05f4b6aa5 100644
--- a/src/KSPACE/fix_tune_kspace.cpp
+++ b/src/KSPACE/fix_tune_kspace.cpp
@@ -1,543 +1,542 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Paul Crozier (SNL)
 ------------------------------------------------------------------------- */
 
 #include "string.h"
 #include "stdlib.h"
 #include "fix_tune_kspace.h"
 #include "update.h"
 #include "domain.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "kspace.h"
 #include "pair.h"
 #include "error.h"
 #include "memory.h"
 #include "timer.h"
 #include "neighbor.h"
 #include "modify.h"
 #include "compute.h"
 #include <iostream>
 #include <cmath>
 #include <limits>
 #define SWAP(a,b) {temp=(a);(a)=(b);(b)=temp;}
 #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
 #define GOLD 1.618034
 
 using namespace std;
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 /* ---------------------------------------------------------------------- */
 
 FixTuneKspace::FixTuneKspace(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg)
 {
   // arg[3] (the tuning interval) is read below, so at least four arguments
   // must be present; the previous "narg < 3" test let narg == 3 through and
   // then read one element past the end of arg
   if (narg < 4) error->all(FLERR,"Illegal fix tune/kspace command");
 
   // state of the tuning procedure
   global_freq = 1;
   firststep = 0;
   niter = 0;
   niter_adjust_rcut = 0;
   keep_bracketing = true;
   first_brent_pass = true;
   converged = false;
   need_fd2_brent = false;
 
   // per-style timings filled in by pre_exchange()
   ewald_time = pppm_time = msm_time = 0.0;
 
   // parse arguments
 
   nevery = force->inumeric(FLERR,arg[3]);
 
   // set up reneighboring: first tuning pass happens on the next timestep
 
   force_reneighbor = 1;
   next_reneighbor = update->ntimestep + 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixTuneKspace::setmask()
 {
   // this fix asks to be called at the pre-exchange and pre-neighbor stages
   return PRE_EXCHANGE | PRE_NEIGHBOR;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixTuneKspace::init()
 {
   if (!force->kspace) 
     error->all(FLERR,"Cannot use fix tune/kspace without a kspace style");
   if (!force->pair) 
     error->all(FLERR,"Cannot use fix tune/kspace without a pair style");
 
   double old_acc = force->kspace->accuracy/force->kspace->two_charge_force;
   char old_acc_str[12];
   sprintf(old_acc_str,"%g",old_acc);
   strcpy(new_acc_str,old_acc_str);
 
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   pair_cut_coul = *p_cutoff;
 }
 
 /* ----------------------------------------------------------------------
    perform dynamic kspace parameter optimization
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::pre_exchange()
 {
   if (!nevery) return;
   if (!force->kspace) return;
   if (!force->pair) return;
   if (next_reneighbor != update->ntimestep) return;
   next_reneighbor = update->ntimestep + nevery;
 
   double time = get_timing_info();
 
   if (strcmp(force->kspace_style,"ewald") == 0) ewald_time = time;
   if (strcmp(force->kspace_style,"pppm") == 0) pppm_time = time;
   if (strcmp(force->kspace_style,"msm") == 0) msm_time = time;
 
   niter++;
   if (niter == 1) {
     // test Ewald
     store_old_kspace_settings();
     strcpy(new_kspace_style,"ewald");
     sprintf(new_pair_style,"%s/long",base_pair_style);
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else if (niter == 2) {
     // test PPPM
     store_old_kspace_settings();
     strcpy(new_kspace_style,"pppm");
     sprintf(new_pair_style,"%s/long",base_pair_style);
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else if (niter == 3) {
     // test MSM
     store_old_kspace_settings();
     strcpy(new_kspace_style,"msm");
     sprintf(new_pair_style,"%s/msm",base_pair_style);
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else if (niter == 4) {
     store_old_kspace_settings();
     cout << "ewald_time = " << ewald_time << endl;
     cout << "pppm_time = " << pppm_time << endl;
     cout << "msm_time = " << msm_time << endl;
     // switch to fastest one
     strcpy(new_kspace_style,"ewald");
     sprintf(new_pair_style,"%s/long",base_pair_style);
     if (pppm_time < ewald_time && pppm_time < msm_time)
       strcpy(new_kspace_style,"pppm");
     else if (msm_time < pppm_time && msm_time < ewald_time) {
       strcpy(new_kspace_style,"msm");
       sprintf(new_pair_style,"%s/msm",base_pair_style);
     }
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else {
     adjust_rcut(time);
   }
 
   last_spcpu = timer->elapsed(Timer::TOTAL);
 }
 
 /* ----------------------------------------------------------------------
    figure out CPU time per timestep since last time checked
 ------------------------------------------------------------------------- */
 
 double FixTuneKspace::get_timing_info()
 {
   // Returns CPU seconds per timestep since the previous call, or 0.0 on
   // the very first call and whenever no step has elapsed.
   const int step_now = update->ntimestep;
   double cpu_now = 0.0;
   double per_step = 0.0;
 
   if (firststep != 0) {
     cpu_now = timer->elapsed(Timer::TOTAL);
     const double cpu_used = cpu_now - last_spcpu;
     const int steps_done = step_now - last_step;
     if (steps_done > 0) per_step = cpu_used/steps_done;
   } else {
     // first call: nothing to measure yet, just arm the timer
     firststep = 1;
   }
 
   last_step = step_now;
   last_spcpu = cpu_now;
 
   return per_step;
 }
 
 /* ----------------------------------------------------------------------
    store old kspace settings: style, accuracy, order, etc
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::store_old_kspace_settings()
 {
   int n = strlen(force->kspace_style) + 1;
   char *old_kspace_style = new char[n];
   strcpy(old_kspace_style,force->kspace_style);
   strcpy(new_kspace_style,old_kspace_style);
   double old_acc = force->kspace->accuracy_relative;
   char old_acc_str[12];
   sprintf(old_acc_str,"%g",old_acc);
   strcpy(new_pair_style,force->pair_style);
   strcpy(base_pair_style,force->pair_style);
   char *trunc;
   if ((trunc = strstr(base_pair_style, "/long")) != NULL) *trunc = '\0';
   if ((trunc = strstr(base_pair_style, "/msm" )) != NULL) *trunc = '\0';
 
   old_differentiation_flag = force->kspace->differentiation_flag;
   old_slabflag = force->kspace->slabflag;
   old_slab_volfactor = force->kspace->slab_volfactor;
 }
 
 /* ----------------------------------------------------------------------
    update the pair style if necessary, preserving the settings
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul)
 {
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   *p_cutoff = pair_cut_coul;
 
   // check to see if we need to change pair styles
   if (strcmp(new_pair_style,force->pair_style) == 0) return;
 
   // create a temporary file to store current pair settings
   FILE *p_pair_settings_file;
   p_pair_settings_file = tmpfile();
   force->pair->write_restart(p_pair_settings_file);
   rewind(p_pair_settings_file);
 
   cout << "Creating new pair style: " << new_pair_style << endl;
   // delete old pair style and create new one
   force->create_pair(new_pair_style,lmp->suffix);
 
   // restore current pair settings from temporary file
   force->pair->read_restart(p_pair_settings_file);
 
   double *pcutoff = (double *) force->pair->extract("cut_coul",itmp);
   double current_cutoff = *pcutoff;
   cout << "Coulomb cutoff for real space: " << current_cutoff << endl;
 
   // close temporary file
   fclose(p_pair_settings_file);
 }
 
 /* ----------------------------------------------------------------------
    update the kspace style if necessary
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::update_kspace_style(char *new_kspace_style, char *new_acc_str)
 {
   // new_kspace_style = kspace style name, passed as arg[0]
   // new_acc_str      = accuracy as a string, passed as arg[1]
   //
   // Each argument gets a private copy sized from its actual length, so long
   // style names or accuracy strings cannot overflow a fixed-size buffer.
   // The copies are released at the end of this function.
   // NOTE(review): assumes Force::create_kspace() does not keep the arg[]
   // string pointers after it returns - confirm against force.cpp.
 
   const int narg = 2;
   char *arg[narg];
   int n = strlen(new_kspace_style) + 1;
   arg[0] = new char[n];
   strcpy(arg[0],new_kspace_style);
   n = strlen(new_acc_str) + 1;
   arg[1] = new char[n];
   strcpy(arg[1],new_acc_str);
 
   // delete old kspace style and create new one
 
   force->create_kspace(narg,arg,lmp->suffix);
 
   // re-apply the settings saved from the previous kspace style
 
   force->kspace->differentiation_flag = old_differentiation_flag;
   force->kspace->slabflag = old_slabflag;
   force->kspace->slab_volfactor = old_slab_volfactor;
 
   // initialize new kspace style, pair style, molecular styles
 
   force->init();
 
   // set up grid
   force->kspace->setup_grid();
 
   // Re-init neighbor list. Probably only needed when redefining the pair
   // style. Should happen after pair->init() to get the pair style neighbor
   // list request registered
 
   neighbor->init();
 
   // Re-init computes to update pointers to virials, etc.
 
   for (int i = 0; i < modify->ncompute; i++) modify->compute[i]->init();
 
   // release the argument copies
 
   for (int i = 0; i < narg; i++) delete [] arg[i];
 }
 
 /* ----------------------------------------------------------------------
    find the optimal real space coulomb cutoff
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::adjust_rcut(double time)
 {
   // Advance the cutoff search by one step.
   // time = cost measured for the cutoff that is currently active; it is
   //        stored as the function value (fa/fb/fc/fd_brent) for that cutoff
   // Does nothing for the "msm" kspace style or once converged is set.
   // Otherwise a new pair_cut_coul is chosen, written into the pair style,
   // and the pair and kspace styles are re-created with it.
 
   if (strcmp(force->kspace_style,"msm") == 0) return;
   if (converged) return;
 
   // NOTE(review): temp is not referenced by name in this function;
   // presumably it is the scratch variable used by the SWAP macro (defined
   // outside this section) - confirm before removing it
   double temp;
   const double TINY = 1.0e-20;
 
   // get the current cutoff
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   double current_cutoff = *p_cutoff;
   cout << "Old Coulomb cutoff for real space: " << current_cutoff << endl;
 
   // use Brent's method from Numerical Recipes to find optimal real space cutoff
 
   // bracketing phase: niter_adjust_rcut counts the calls made so far
   // call 0 only halves the cutoff; its timing is not recorded
   // first time through, get ax_brent and fa_brent, and adjust cutoff
   if (keep_bracketing) {
     if (niter_adjust_rcut == 0) {
       pair_cut_coul /= 2;
     } else if (niter_adjust_rcut == 1) {
       ax_brent = current_cutoff;
       fa_brent = time;
       pair_cut_coul *= 2;
 
     // second time through, get bx_brent and fb_brent, and adjust cutoff
     // if b is more expensive than a, swap the two points and step the other way
     } else if (niter_adjust_rcut == 2) {
       bx_brent = current_cutoff;
       fb_brent = time;
       if (fb_brent > fa_brent) {
         SWAP(ax_brent,bx_brent);
         SWAP(fb_brent,fa_brent);
         pair_cut_coul /= 4;
       } else {
         pair_cut_coul *= 2;
       }
 
     // third time through, get cx_brent and fc_brent, and adjust cutoff if needed
     // fc > fb means a, b, c already bracket a minimum; otherwise the next
     // trial cutoff comes from the parabolic formula (TINY keeps the
     // denominator away from zero)
     } else if (niter_adjust_rcut == 3) {
       cx_brent = current_cutoff;
       fc_brent = time;
       if (fc_brent > fb_brent) keep_bracketing = false;
       else {
         double r = (bx_brent - ax_brent)*(fb_brent - fc_brent);
         double q = (bx_brent - cx_brent)*(fb_brent - fa_brent);
         dx_brent = bx_brent - ((bx_brent - cx_brent)*q - (bx_brent - ax_brent)*r)/
          (2.0*SIGN(MAX(fabs(q - r),TINY),q - r));
         pair_cut_coul = dx_brent;
       }
 
     // after third time through, bracket the minimum, and adjust cutoff
     // the timing goes to fd2_brent when mnbrak() asked for a second
     // evaluation on its previous call, otherwise to fd_brent
     } else if (niter_adjust_rcut > 3) {
       dx_brent = current_cutoff;
       if (need_fd2_brent) fd2_brent = time;
       else fd_brent = time;
       mnbrak();
       pair_cut_coul = dx_brent;
     }
   }
 
   // minimization phase; also entered in the same call in which
   // keep_bracketing was just cleared above
   // brent0() runs on the first pass, brent2() on later passes;
   // brent1() then proposes the next trial point in dx_brent
   if (!keep_bracketing) {
     dx_brent = current_cutoff;
     fd_brent = time;
     if (first_brent_pass) brent0();
     else brent2();
     brent1();
     pair_cut_coul = dx_brent;
   }
 
   niter_adjust_rcut++;
 
   // replace a non-positive proposal by half the magnitude of the smallest
   // bracket point, plus TINY
   if (pair_cut_coul <= 0.0) pair_cut_coul = fabs(MIN(ax_brent,MIN(bx_brent,(MIN(cx_brent,dx_brent))))/2.0) + TINY;
 
   // x != x holds only for NaN
   if (pair_cut_coul != pair_cut_coul)
     error->all(FLERR,"Bad real space Coulomb cutoff in fix tune/kspace");
 
   // change the cutoff to pair_cut_coul
   *p_cutoff = pair_cut_coul;
 
   // report the new cutoff
   double *new_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   current_cutoff = *new_cutoff;
   cout << "Adjusted Coulomb cutoff for real space: " << current_cutoff << endl;
 
   // re-create the pair and kspace styles so the new cutoff takes effect
   store_old_kspace_settings();
   update_pair_style(new_pair_style,pair_cut_coul);
   update_kspace_style(new_kspace_style,new_acc_str);
 }
 
 /* ----------------------------------------------------------------------
    bracket a minimum using parabolic extrapolation
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::mnbrak()
 {
   // One step of the bracketing search, operating on the *_brent members.
   // A call ends in one of three ways:
   //   - keep_bracketing is cleared and the function returns (bracket found)
   //   - need_fd2_brent is set and the function returns with a new trial
   //     point in dx_brent, whose function value is still missing
   //   - the (a,b,c) points and values are shifted by one at the bottom
   // GLIMIT limits how far beyond cx_brent a trial point may lie (see dxlim);
   // TINY keeps the denominator of the parabolic formula away from zero.
   const double GLIMIT = 100.0, TINY = 1.0e-20;
-  double temp,r,q;
+  double r,q;
   // parabolic extrapolation through a, b, c gives the trial point dx_brent
   r = (bx_brent - ax_brent)*(fb_brent - fc_brent);
   q = (bx_brent - cx_brent)*(fb_brent - fa_brent);
   dx_brent = bx_brent - ((bx_brent - cx_brent)*q - (bx_brent - ax_brent)*r)/
    (2.0*SIGN(MAX(fabs(q - r),TINY),q - r));
   dxlim = bx_brent + GLIMIT*(cx_brent - bx_brent);
 
   // case 1: trial point lies between bx_brent and cx_brent
   if ((bx_brent - dx_brent)*(dx_brent - cx_brent) > 0.0) {
     if (fd_brent < fc_brent) {
       // minimum between b and c: (b,d,c) becomes the bracket
       ax_brent = bx_brent;
       bx_brent = dx_brent;
       fa_brent = fb_brent;
       fb_brent = fd_brent;
       keep_bracketing = false;
       return;
     } else if (fd_brent > fb_brent) {
       // minimum between a and d: (a,b,d) becomes the bracket
       cx_brent = dx_brent;
       fc_brent = fd_brent;
       keep_bracketing = false;
       return;
     }
     // parabolic fit was of no use: step past cx_brent by the factor GOLD
     dx_brent = cx_brent + GOLD*(cx_brent - bx_brent);
     if (need_fd2_brent) {
       fd_brent = fd2_brent;
       need_fd2_brent = false;
     } else {
       need_fd2_brent = true;
       return;
     }
   // case 2: trial point lies between cx_brent and dxlim
   } else if ((cx_brent - dx_brent)*(dx_brent - dxlim) > 0.0) {
     if (fd_brent < fc_brent) {
       if (need_fd2_brent) {
         need_fd2_brent = false;
       } else {
         need_fd2_brent = true;
         dx_brent += GOLD*(dx_brent - cx_brent);
         return;
       }
       shft3(bx_brent,cx_brent,dx_brent,dx_brent + GOLD*(dx_brent - cx_brent));
       shft3(fb_brent,fc_brent,fd_brent,fd2_brent);
     }
   // case 3: trial point is at or beyond dxlim, so it is set to dxlim
   } else if ((dx_brent - dxlim)*(dxlim - cx_brent) >= 0.0) {
     dx_brent = dxlim;
     if (need_fd2_brent) {
       fd_brent = fd2_brent;
       need_fd2_brent = false;
     } else {
       need_fd2_brent = true;
       return;
     }
   // case 4: none of the above, step past cx_brent by the factor GOLD
   } else {
     dx_brent = cx_brent + GOLD*(cx_brent - bx_brent);
     if (need_fd2_brent) {
       fd_brent = fd2_brent;
       need_fd2_brent = false;
     } else {
       need_fd2_brent = true;
       return;
     }
   }
   // points and function values are shifted in the same way
   shft3(ax_brent,bx_brent,cx_brent,dx_brent);
   shft3(fa_brent,fb_brent,fc_brent,fd_brent);
 }
 
 /* ----------------------------------------------------------------------
    Brent's method from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::brent0()
 {
   // a_brent takes the smaller and b_brent the larger of ax_brent, cx_brent
 
   if (ax_brent < cx_brent) a_brent = ax_brent;
   else a_brent = cx_brent;
 
   if (ax_brent > cx_brent) b_brent = ax_brent;
   else b_brent = cx_brent;
 
   // the three tracked points v, w, x all start at bx_brent ...
 
   v_brent = bx_brent;
   w_brent = bx_brent;
   x_brent = bx_brent;
 
   // ... and their function values at fb_brent
 
   fx_brent = fb_brent;
   fv_brent = fb_brent;
   fw_brent = fb_brent;
 }
 
 /* ----------------------------------------------------------------------
    Brent's method from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::brent1()
 {
   // Propose the next trial point dx_brent inside [a_brent,b_brent], or set
   // converged (with dx_brent = x_brent) once the interval is small enough
   // relative to the tolerance. Clears first_brent_pass when a trial point
   // is proposed.
-  const int ITMAX=100;
   // CGOLD scales the golden-section step; ZEPS is a small absolute term
   // added to the tolerance
   const double CGOLD=0.3819660;
   const double ZEPS=numeric_limits<double>::epsilon()*1.0e-3;
   // NOTE(review): d and e are locals that start at 0.0 on every call, so
   // the test fabs(e) > tol1 below compares 0.0 with a positive tolerance and
   // the parabolic branch is not reachable as written; only the
   // golden-section step in the else branch runs. Carrying d and e across
   // calls would need member variables - confirm the intent.
   double d=0.0,etemp;
   double p,q,r,tol1,tol2,xm;
   double e=0.0;
   double tol=0.001;
 
   // xm = midpoint of the current interval
   xm=0.5*(a_brent+b_brent);
   tol2=2.0*(tol1=tol*fabs(x_brent)+ZEPS);
   // convergence test
   if (fabs(x_brent-xm) <= (tol2-0.5*(b_brent-a_brent))) {
     converged = true;
     dx_brent = x_brent;
     return;
   }
   if (fabs(e) > tol1) {
     // trial parabolic fit through x, v, w
     r=(x_brent-w_brent)*(fx_brent-fv_brent);
     q=(x_brent-v_brent)*(fx_brent-fw_brent);
     p=(x_brent-v_brent)*q-(x_brent-w_brent)*r;
     q=2.0*(q-r);
     if (q > 0.0) p = -p;
     q=fabs(q);
     etemp=e;
     e=d;
     // reject the parabolic step if it is too large or leaves (a,b);
     // fall back to a golden-section step into the larger segment
     if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a_brent-x_brent) || p >= q*(b_brent-x_brent))
       d=CGOLD*(e=(x_brent >= xm ? a_brent-x_brent : b_brent-x_brent));
     else {
       d=p/q;
       dx_brent=x_brent+d;
       // do not evaluate closer than tol2 to either end of the interval
       if (dx_brent-a_brent < tol2 || b_brent-dx_brent < tol2)
         d=SIGN(tol1,xm-x_brent);
     }
   } else {
     // golden-section step into the larger of the two segments
     d=CGOLD*(e=(x_brent >= xm ? a_brent-x_brent : b_brent-x_brent));
   }
   // the step away from x_brent is at least tol1 in magnitude
   dx_brent=(fabs(d) >= tol1 ? x_brent+d : x_brent+SIGN(tol1,d));
 
   first_brent_pass = false;
 
   return;
 }
 
 /* ----------------------------------------------------------------------
    Brent's method from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::brent2()
 {
   // fold the latest trial point (dx_brent,fd_brent) into the search state
 
   if (fd_brent <= fx_brent) {
 
     // trial point is at least as good as x: x becomes an interval end
     // and the trial point becomes the new x
 
     (dx_brent >= x_brent ? a_brent : b_brent) = x_brent;
     shft3(v_brent,w_brent,x_brent,dx_brent);
     shft3(fv_brent,fw_brent,fx_brent,fd_brent);
     return;
   }
 
   // trial point is worse than x: it becomes an interval end
 
   (dx_brent < x_brent ? a_brent : b_brent) = dx_brent;
 
   // it may still replace w (pushing w down to v) or, failing that, v
 
   const bool replaces_w = (fd_brent <= fw_brent || w_brent == x_brent);
   if (replaces_w) {
     v_brent = w_brent;
     fv_brent = fw_brent;
     w_brent = dx_brent;
     fw_brent = fd_brent;
     return;
   }
 
   const bool replaces_v =
     (fd_brent <= fv_brent || v_brent == x_brent || v_brent == w_brent);
   if (replaces_v) {
     v_brent = dx_brent;
     fv_brent = fd_brent;
   }
 }
 
diff --git a/src/RIGID/fix_rigid_nh_small.cpp b/src/RIGID/fix_rigid_nh_small.cpp
index 29223847c..342b4dc50 100644
--- a/src/RIGID/fix_rigid_nh_small.cpp
+++ b/src/RIGID/fix_rigid_nh_small.cpp
@@ -1,1534 +1,1532 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Trung Dac Nguyen (ORNL)
    references: Kamberaj et al., J. Chem. Phys. 122, 224114 (2005)
                Miller et al., J Chem Phys. 116, 8649-8659 (2002)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "string.h"
 #include "fix_rigid_nh_small.h"
 #include "math_extra.h"
 #include "atom.h"
 #include "compute.h"
 #include "domain.h"
 #include "update.h"
 #include "modify.h"
 #include "fix_deform.h"
 #include "group.h"
 #include "comm.h"
 #include "force.h"
 #include "kspace.h"
 #include "output.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 enum{NONE,XYZ,XY,YZ,XZ};     // same as in FixRigid
 enum{ISO,ANISO,TRICLINIC};   // same as in FixRigid
 
 #define EPSILON 1.0e-7
 
 enum{FULL_BODY,INITIAL,FINAL,FORCE_TORQUE,VCM_ANGMOM,XCM_MASS,ITENSOR,DOF};
 
 /* ---------------------------------------------------------------------- */
 
 FixRigidNHSmall::FixRigidNHSmall(LAMMPS *lmp, int narg, char **arg) :
   FixRigidSmall(lmp, narg, arg)
 {
   // check the pressure/temperature control settings for consistency,
   // then allocate and zero the thermostat/barostat chain variables
 
   const int dim = domain->dimension;
   int i;
 
   // period of every axis with p_flag == 1 must be positive
 
   bool bad_period = false;
   for (i = 0; i < 3; i++)
     if (p_flag[i] == 1 && p_period[i] <= 0.0) bad_period = true;
   if (bad_period)
     error->all(FLERR,"Fix rigid/small npt/nph period must be > 0.0");
 
   // no z pressure control or z coupling in 2d
 
   if (dim == 2) {
     if (p_flag[2])
       error->all(FLERR,"Invalid fix rigid/small npt/nph command for a 2d simulation");
     if (pcouple == YZ || pcouple == XZ)
       error->all(FLERR,"Invalid fix rigid/small npt/nph command for a 2d simulation");
   }
 
   // every coupled axis must have pressure control switched on
 
   const bool x_off = (p_flag[0] == 0);
   const bool y_off = (p_flag[1] == 0);
   const bool z_off = (p_flag[2] == 0);
 
   switch (pcouple) {
   case XYZ:
     if (x_off || y_off)
       error->all(FLERR,"Invalid fix rigid/small npt/nph command pressure settings");
     if (dim == 3 && z_off)
       error->all(FLERR,"Invalid fix rigid/small npt/nph command pressure settings");
     break;
   case XY:
     if (x_off || y_off)
       error->all(FLERR,"Invalid fix rigid/small npt/nph command pressure settings");
     break;
   case YZ:
     if (y_off || z_off)
       error->all(FLERR,"Invalid fix rigid/small npt/nph command pressure settings");
     break;
   case XZ:
     if (x_off || z_off)
       error->all(FLERR,"Invalid fix rigid/small npt/nph command pressure settings");
     break;
   default:
     break;
   }
 
   // require periodicity in tensile dimension
 
   const int periodic[3] =
     {domain->xperiodic, domain->yperiodic, domain->zperiodic};
   for (i = 0; i < 3; i++)
     if (p_flag[i] && periodic[i] == 0)
       error->all(FLERR,
                  "Cannot use fix rigid/small npt/nph on a non-periodic dimension");
 
   // coupled axes must share start, stop and period values
 
   const bool xy_differ = (p_start[0] != p_start[1] ||
                           p_stop[0] != p_stop[1] ||
                           p_period[0] != p_period[1]);
   const bool xz_differ = (p_start[0] != p_start[2] ||
                           p_stop[0] != p_stop[2] ||
                           p_period[0] != p_period[2]);
   const bool yz_differ = (p_start[1] != p_start[2] ||
                           p_stop[1] != p_stop[2] ||
                           p_period[1] != p_period[2]);
 
   switch (pcouple) {
   case XYZ:
     if (dim == 3 && (xy_differ || xz_differ))
       error->all(FLERR,"Invalid fix rigid/small npt/nph pressure settings");
     if (dim == 2 && xy_differ)
       error->all(FLERR,"Invalid fix rigid/small npt/nph pressure settings");
     break;
   case XY:
     if (xy_differ)
       error->all(FLERR,"Invalid fix rigid/small npt/nph pressure settings");
     break;
   case YZ:
     if (yz_differ)
       error->all(FLERR,"Invalid fix rigid/small npt/nph pressure settings");
     break;
   case XZ:
     if (xz_differ)
       error->all(FLERR,"Invalid fix rigid/small npt/nph pressure settings");
     break;
   default:
     break;
   }
 
   // damping parameters of the thermostat and of every active barostat axis
 
   bool bad_damping = (tstat_flag && t_period <= 0.0);
   for (i = 0; i < 3; i++)
     if (p_flag[i] && p_period[i] <= 0.0) bad_damping = true;
   if (bad_damping)
     error->all(FLERR,"Fix rigid/small nvt/npt/nph damping parameters must be > 0.0");
 
   // memory allocation and initialization
 
   if (tstat_flag || pstat_flag) {
     allocate_chain();
     allocate_order();
   }
 
   // thermostat chain: element 0 also has its force term zeroed
 
   if (tstat_flag) {
     eta_t[0] = eta_r[0] = 0.0;
     eta_dot_t[0] = eta_dot_r[0] = 0.0;
     f_eta_t[0] = f_eta_r[0] = 0.0;
 
     for (i = 1; i < t_chain; i++) {
       eta_t[i] = eta_r[i] = 0.0;
       eta_dot_t[i] = eta_dot_r[i] = 0.0;
     }
   }
 
   // barostat variables and the chain coupled to them
 
   if (pstat_flag) {
     epsilon_dot[0] = epsilon_dot[1] = epsilon_dot[2] = 0.0;
     eta_b[0] = eta_dot_b[0] = f_eta_b[0] = 0.0;
     for (i = 1; i < p_chain; i++) {
       eta_b[i] = eta_dot_b[i] = 0.0;
     }
   }
 
   // rigid body pointers
 
   nrigidfix = 0;
   rfix = NULL;
 
   // reference volume and temperature
 
   vol0 = 0.0;
   t0 = 1.0;
 
   // no temperature or pressure compute is flagged yet
 
   tcomputeflag = 0;
   pcomputeflag = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixRigidNHSmall::~FixRigidNHSmall()
 {
   // chain and order arrays exist only if a thermostat or barostat was set
 
   const bool has_chain = (tstat_flag || pstat_flag);
   if (has_chain) {
     deallocate_chain();
     deallocate_order();
   }
 
   // deleting a NULL pointer is a no-op, so no guard is needed
 
   delete [] rfix;
 
   // temperature compute and its ID string
 
   if (tcomputeflag) {
     modify->delete_compute(id_temp);
     delete [] id_temp;
   }
 
   // delete pressure if fix created it
   // the ID string is freed whenever the barostat is active
 
   if (pstat_flag) {
     if (pcomputeflag) {
       modify->delete_compute(id_press);
     }
     delete [] id_press;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixRigidNHSmall::setmask()
 {
   // start from the base-class mask and add THERMO_ENERGY when a
   // thermostat or barostat is active
 
   const int base_mask = FixRigidSmall::setmask();
   if (tstat_flag || pstat_flag) return base_mask | THERMO_ENERGY;
   return base_mask;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixRigidNHSmall::init()
 {
   FixRigidSmall::init();
 
   // recheck that dilate group has not been deleted
 
   if (allremap == 0) {
     int idilate = group->find(id_dilate);
     if (idilate == -1)
       error->all(FLERR,"Fix rigid npt/nph dilate group ID does not exist");
     dilate_group_bit = group->bitmask[idilate];
   }
 
   // initialize thermostats
   // set timesteps, constants 
   // store Yoshida-Suzuki integrator parameters
   
   dtv = update->dt;
   dtf = 0.5 * update->dt * force->ftm2v;
   dtq = 0.5 * update->dt;
   
   boltz = force->boltz;
   nktv2p = force->nktv2p;
   mvv2e = force->mvv2e;
   dimension = domain->dimension;
 
   if (force->kspace) kspace_flag = 1;
   else kspace_flag = 0;
   
   // see Table 1 in Kamberaj et al
   
   if (tstat_flag || pstat_flag) {
     if (t_order == 3) {
       w[0] = 1.0 / (2.0 - pow(2.0, 1.0/3.0));
       w[1] = 1.0 - 2.0*w[0];
       w[2] = w[0];
     } else if (t_order == 5) {
       w[0] = 1.0 / (4.0 - pow(4.0, 1.0/3.0));
       w[1] = w[0];
       w[2] = 1.0 - 4.0 * w[0];
       w[3] = w[0];
       w[4] = w[0];
     }
   }  
 
   int icompute;
   if (tcomputeflag) {  
     icompute = modify->find_compute(id_temp);
     if (icompute < 0) 
       error->all(FLERR,"Temp ID for fix rigid npt/nph does not exist");
     temperature = modify->compute[icompute];
   }
 
   if (pstat_flag) {
     if (domain->triclinic) 
       error->all(FLERR,"fix rigid npt/nph does not yet allow triclinic box");
   
     // ensure no conflict with fix deform
 
     for (int i = 0; i < modify->nfix; i++)
       if (strcmp(modify->fix[i]->style,"deform") == 0) {
       	int *dimflag = ((FixDeform *) modify->fix[i])->dimflag;
       	if ((p_flag[0] && dimflag[0]) || (p_flag[1] && dimflag[1]) || 
       	    (p_flag[2] && dimflag[2]))
           error->all(FLERR,"Cannot use fix rigid npt/nph and fix deform on "
                      "same component of stress tensor");
       }
 
     // set frequency
   
     p_freq_max = 0.0;
     p_freq_max = MAX(p_freq[0],p_freq[1]);
     p_freq_max = MAX(p_freq_max,p_freq[2]);
 
     // tally the number of dimensions that are barostatted
     // set initial volume and reference cell, if not already done
 
     pdim = p_flag[0] + p_flag[1] + p_flag[2];
     if (vol0 == 0.0) {
       if (dimension == 2) vol0 = domain->xprd * domain->yprd;
       else vol0 = domain->xprd * domain->yprd * domain->zprd;
     }
 
     // set pressure compute ptr
 
     icompute = modify->find_compute(id_press);
     if (icompute < 0) 
       error->all(FLERR,"Press ID for fix rigid npt/nph does not exist");
     pressure = modify->compute[icompute];
     
     // detect if any rigid fixes exist so rigid bodies move on remap
     // rfix[] = indices to each fix rigid
     // this will include self
 
     if (rfix) delete [] rfix;
     nrigidfix = 0;
     rfix = NULL;
 
     for (int i = 0; i < modify->nfix; i++)
       if (modify->fix[i]->rigid_flag) nrigidfix++;
     if (nrigidfix) {
       rfix = new int[nrigidfix];
       nrigidfix = 0;
       for (int i = 0; i < modify->nfix; i++)
         if (modify->fix[i]->rigid_flag) rfix[nrigidfix++] = i;
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixRigidNHSmall::setup(int vflag)
 {
   // Runs the base-class setup (vflag is passed through unchanged), then
   // counts degrees of freedom, builds conjqm for every local body, sums the
   // kinetic energy terms, and initializes the thermostat/barostat masses,
   // chain forces and timestep-dependent coefficients.
 
   FixRigidSmall::setup(vflag);
 
   // total translational and rotational degrees of freedom
   
   int k,ibody;
-  double *inertia;
 
   // start with 'dimension' dof of each kind per local body; drop one
   // rotational dof for every inertia component smaller than EPSILON
 
   nf_t = nf_r = dimension * nlocal_body;
   for (ibody = 0; ibody < nlocal_body; ibody++) {
-    inertia = body[ibody].inertia;
     for (k = 0; k < domain->dimension; k++) 
       if (fabs(body[ibody].inertia[k]) < EPSILON) nf_r--;
   }
 
   // sum both counts over all processes in one reduction
 
   double nf[2], nfall[2];
   nf[0] = nf_t;
   nf[1] = nf_r;
   MPI_Allreduce(nf,nfall,2,MPI_DOUBLE,MPI_SUM,world);
   nf_t = nfall[0];
   nf_r = nfall[1];
 
   g_f = nf_t + nf_r;  
   onednft = 1.0 + (double)(dimension) / (double)g_f;
   onednfr = (double) (dimension) / (double)g_f;
   
   // conjqm = 2 * quatvec(quat, angmom rotated by the transposed
   // ex/ey/ez_space matrix)
 
   double mbody[3];
   akin_t = akin_r = 0.0;
   for (int ibody = 0; ibody < nlocal_body; ibody++) {
     Body *b = &body[ibody];
     MathExtra::transpose_matvec(b->ex_space,b->ey_space,b->ez_space,
                                 b->angmom,mbody);
     MathExtra::quatvec(b->quat,mbody,b->conjqm);
     b->conjqm[0] *= 2.0;
     b->conjqm[1] *= 2.0;
     b->conjqm[2] *= 2.0;
     b->conjqm[3] *= 2.0;
     
     // akin_t sums mass*|vcm|^2 and akin_r sums angmom.omega
     // (no factor of 1/2 is applied here)
 
     if (tstat_flag || pstat_flag) {
       akin_t += b->mass*(b->vcm[0]*b->vcm[0] + b->vcm[1]*b->vcm[1] +
         b->vcm[2]*b->vcm[2]);
       akin_r += b->angmom[0]*b->omega[0] + b->angmom[1]*b->omega[1] +
         b->angmom[2]*b->omega[2];
     }
   }
   
   // accumulate translational and rotational kinetic energies
   
   if (tstat_flag || pstat_flag) {
     double ke[2],keall[2];
     ke[0] = akin_t;
     ke[1] = akin_r;
     MPI_Allreduce(ke,keall,2,MPI_DOUBLE,MPI_SUM,world);
     akin_t = keall[0];
     akin_r = keall[1];
   }
   
   // compute target temperature
   // barostat without thermostat: use the current temperature as target;
   // if that is exactly zero, fall back to 1.0 for lj units, else 300.0
   
   if (tstat_flag) compute_temp_target();
   else if (pstat_flag) {
     t0 = temperature->compute_scalar();
     if (t0 == 0.0) {
       if (strcmp(update->unit_style,"lj") == 0) t0 = 1.0;
       else t0 = 300.0;
     }
     t_target = t0;
   }
 
   // compute target pressure
   // compute current pressure
   // trigger virial computation on next timestep
     
   if (pstat_flag) { 
     compute_press_target();
     
     temperature->compute_scalar();
     if (pstyle == ISO) pressure->compute_scalar();
     else pressure->compute_vector();
     couple();
     pressure->addstep(update->ntimestep+1);
   }
   
   // initialize thermostat/barostat settings
   
   double kt, t_mass, tb_mass;
   kt = boltz * t_target;
 
   // chain masses: element 0 is scaled by the dof count, the rest use t_mass
   // chain forces for i >= 1 follow from the previous element
 
   if (tstat_flag) {
     t_mass = kt / (t_freq*t_freq);
     q_t[0] = nf_t * t_mass;
     q_r[0] = nf_r * t_mass;
     for (int i = 1; i < t_chain; i++) 
       q_t[i] = q_r[i] = t_mass;
 
     for (int i = 1; i < t_chain; i++) {
       f_eta_t[i] = (q_t[i-1] * eta_dot_t[i-1] * eta_dot_t[i-1] - kt)/q_t[i];
       f_eta_r[i] = (q_r[i-1] * eta_dot_r[i-1] * eta_dot_r[i-1] - kt)/q_r[i];
     }
   }
   
   // initial forces on barostat thermostat variables
   
   if (pstat_flag) {
     for (int i = 0; i < 3; i++)
       if (p_flag[i]) {
         epsilon_mass[i] = (g_f + dimension) * kt / (p_freq[i]*p_freq[i]);
         epsilon[i] = log(vol0)/dimension;
       } 
     
     // NOTE(review): the numerator below uses q_b[i], while the f_eta_t and
     // f_eta_r expressions above use the mass of element i-1 - confirm
     // whether q_b[i-1] was intended
 
     tb_mass = kt / (p_freq_max * p_freq_max);
     q_b[0] = dimension * dimension * tb_mass;
     for (int i = 1; i < p_chain; i++) {
       q_b[i] = tb_mass;
       f_eta_b[i] = (q_b[i] * eta_dot_b[i-1] * eta_dot_b[i-1] - kt)/q_b[i];
     }
   }
   
   // update order/timestep dependent coefficients
   // wdti1 = w*dtv/t_iter; wdti2 and wdti4 are one half and one quarter of it
   
   if (tstat_flag || pstat_flag) {
     for (int i = 0; i < t_order; i++) {
       wdti1[i] = w[i] * dtv / t_iter;
       wdti2[i] = wdti1[i] / 2.0;
       wdti4[i] = wdti1[i] / 4.0;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    perform preforce velocity Verlet integration
    see Kamberaj paper for step references
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::initial_integrate(int vflag)
 {
   // First half of the timestep for all local rigid bodies.
   // vflag is forwarded to v_setup() when non-zero; otherwise evflag is
   // cleared. The order of the steps below matters: the thermostat and
   // barostat are advanced first, their scale factors are then applied while
   // updating each body, and the box is remapped around set_xv().
 
   double tmp,scale_r,scale_t[3],scale_v[3];
   double dtfm,mbody[3],tbody[3],fquat[4];
   double dtf2 = dtf * 2.0;
   
   // compute target temperature
   // update thermostat chains coupled to particles
   
   if (tstat_flag) {
     compute_temp_target();
     nhc_temp_integrate();
   }
 
   // compute target pressure
   // update epsilon dot
   // update thermostat coupled to barostat
   
   if (pstat_flag) {
     nhc_press_integrate();
     
     if (pstyle == ISO) {
       temperature->compute_scalar();
       pressure->compute_scalar();
     } else {
       temperature->compute_vector();
       pressure->compute_vector();
     }
     couple();
     pressure->addstep(update->ntimestep+1);
   
     compute_press_target();
     nh_epsilon_dot();
   }  
   
   // compute scale variables
   // defaults of 1.0 leave velocities and momenta unscaled
 
   scale_t[0] = scale_t[1] = scale_t[2] = 1.0;
   scale_v[0] = scale_v[1] = scale_v[2] = 1.0;
   scale_r = 1.0;
 
   // thermostat: scale_t applies to vcm, scale_r to conjqm
 
   if (tstat_flag) {
     tmp = exp(-dtq * eta_dot_t[0]);
     scale_t[0] = scale_t[1] = scale_t[2] = tmp;
     tmp = exp(-dtq * eta_dot_r[0]);
     scale_r = tmp;
   } 
 
   // barostat: extra per-axis factors; scale_v replaces dtv in the
   // position update below
 
   if (pstat_flag) {
     scale_t[0] *= exp(-dtq * (epsilon_dot[0] + mtk_term2));
     scale_t[1] *= exp(-dtq * (epsilon_dot[1] + mtk_term2));
     scale_t[2] *= exp(-dtq * (epsilon_dot[2] + mtk_term2));
     scale_r *= exp(-dtq * (pdim * mtk_term2));
 
     tmp = dtq * epsilon_dot[0];
     scale_v[0] = dtv * exp(tmp) * maclaurin_series(tmp);
     tmp = dtq * epsilon_dot[1];
     scale_v[1] = dtv * exp(tmp) * maclaurin_series(tmp);
     tmp = dtq * epsilon_dot[2];
     scale_v[2] = dtv * exp(tmp) * maclaurin_series(tmp);
   }
     
   // update xcm, vcm, quat, conjqm and angmom
 
   for (int ibody = 0; ibody < nlocal_body; ibody++) {
     Body *b = &body[ibody];
     
     // step 1.1 - update vcm by 1/2 step
     
     dtfm = dtf / b->mass;
     b->vcm[0] += dtfm * b->fcm[0];
     b->vcm[1] += dtfm * b->fcm[1];
     b->vcm[2] += dtfm * b->fcm[2];
     
     if (tstat_flag || pstat_flag) {
       b->vcm[0] *= scale_t[0];
       b->vcm[1] *= scale_t[1];
       b->vcm[2] *= scale_t[2];
     }
     
     // step 1.2 - update xcm by full step
 
     if (!pstat_flag) {
       b->xcm[0] += dtv * b->vcm[0];
       b->xcm[1] += dtv * b->vcm[1];
       b->xcm[2] += dtv * b->vcm[2];
     } else {
       b->xcm[0] += scale_v[0] * b->vcm[0];
       b->xcm[1] += scale_v[1] * b->vcm[1];
       b->xcm[2] += scale_v[2] * b->vcm[2];
     }
     
     // step 1.3 - apply torque (body coords) to quaternion momentum
     
     MathExtra::transpose_matvec(b->ex_space,b->ey_space,b->ez_space,
                                 b->torque,tbody);
     MathExtra::quatvec(b->quat,tbody,fquat);
     
     b->conjqm[0] += dtf2 * fquat[0];
     b->conjqm[1] += dtf2 * fquat[1];
     b->conjqm[2] += dtf2 * fquat[2];
     b->conjqm[3] += dtf2 * fquat[3];
     
     if (tstat_flag || pstat_flag) {
       b->conjqm[0] *= scale_r;
       b->conjqm[1] *= scale_r;
       b->conjqm[2] *= scale_r;
       b->conjqm[3] *= scale_r;
     }
     
     // step 1.4 to 1.13 - use no_squish rotate to update p and q
     // symmetric sequence 3,2,1,2,3: the outer calls use dtq, the middle
     // call uses dtv
   
     no_squish_rotate(3,b->conjqm,b->quat,b->inertia,dtq);
     no_squish_rotate(2,b->conjqm,b->quat,b->inertia,dtq);
     no_squish_rotate(1,b->conjqm,b->quat,b->inertia,dtv);
     no_squish_rotate(2,b->conjqm,b->quat,b->inertia,dtq);
     no_squish_rotate(3,b->conjqm,b->quat,b->inertia,dtq);
   
     // update exyz_space
     // transform p back to angmom
     // update angular velocity
     
     MathExtra::q_to_exyz(b->quat,b->ex_space,b->ey_space,
                          b->ez_space);
     MathExtra::invquatvec(b->quat,b->conjqm,mbody);
     MathExtra::matvec(b->ex_space,b->ey_space,b->ez_space,
                       mbody,b->angmom);
     
     // undo the factor of 2 carried by conjqm
 
     b->angmom[0] *= 0.5;
     b->angmom[1] *= 0.5;
     b->angmom[2] *= 0.5;
     
     MathExtra::angmom_to_omega(b->angmom,b->ex_space,b->ey_space,
                                b->ez_space,b->inertia,b->omega);
   }
   
   // virial setup before call to set_xv
 
   if (vflag) v_setup(vflag);
   else evflag = 0;
 
   // forward communicate updated info of all bodies
 
   commflag = INITIAL;
   comm->forward_comm_variable_fix(this);
 
   // accumulate translational and rotational kinetic energies
   // local sums first, then one reduction over all processes
 
   if (tstat_flag || pstat_flag) {
 
     akin_t = akin_r = 0.0;
     for (int ibody = 0; ibody < nlocal_body; ibody++) {
       Body *b = &body[ibody];  
       akin_t += b->mass*(b->vcm[0]*b->vcm[0] + b->vcm[1]*b->vcm[1] +
         b->vcm[2]*b->vcm[2]);
       akin_r += b->angmom[0]*b->omega[0] + b->angmom[1]*b->omega[1] +
         b->angmom[2]*b->omega[2];
     }
     
     double ke[2],keall[2];
     ke[0] = akin_t;
     ke[1] = akin_r;
     MPI_Allreduce(ke,keall,2,MPI_DOUBLE,MPI_SUM,world);
     akin_t = keall[0];
     akin_r = keall[1];
   }
 
   // remap simulation box by 1/2 step
 
   if (pstat_flag) remap();
 
   // set coords/orient and velocity/rotation of atoms in rigid bodies
   // from quaternion and omega
 
   set_xv();
   
   // remap simulation box by full step
   // redo KSpace coeffs since volume has changed
 
   if (pstat_flag) {
     remap();
     if (kspace_flag) force->kspace->setup();
   }  
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixRigidNHSmall::final_integrate()
 {
   int i,ibody;
   double tmp,scale_t[3],scale_r;
-  double dtfm,xy,xz,yz;
+  double dtfm;
   double mbody[3],tbody[3],fquat[4];
   double dtf2 = dtf * 2.0;
 
   // compute scale variables
   
   scale_t[0] = scale_t[1] = scale_t[2] = 1.0;
   scale_r = 1.0;
 
   if (tstat_flag) {
     tmp = exp(-1.0 * dtq * eta_dot_t[0]);
     scale_t[0] = scale_t[1] = scale_t[2] = tmp;
     scale_r = exp(-1.0 * dtq * eta_dot_r[0]);
   } 
   
   if (pstat_flag) {
     scale_t[0] *= exp(-dtq * (epsilon_dot[0] + mtk_term2));
     scale_t[1] *= exp(-dtq * (epsilon_dot[1] + mtk_term2));
     scale_t[2] *= exp(-dtq * (epsilon_dot[2] + mtk_term2));
     scale_r *= exp(-dtq * (pdim * mtk_term2));
   }
   
   // sum over atoms to get force and torque on rigid body
   
   imageint *image = atom->image;
   double **x = atom->x;
   double **f = atom->f;
   int nlocal = atom->nlocal;
   
   double dx,dy,dz;
   double unwrap[3];
   double *xcm,*fcm,*tcm;
 
   for (ibody = 0; ibody < nlocal_body+nghost_body; ibody++) {
     fcm = body[ibody].fcm;
     fcm[0] = fcm[1] = fcm[2] = 0.0;
     tcm = body[ibody].torque;
     tcm[0] = tcm[1] = tcm[2] = 0.0;
   }
 
   for (i = 0; i < nlocal; i++) {
     if (atom2body[i] < 0) continue;
     Body *b = &body[atom2body[i]];
 
     fcm = b->fcm;
     fcm[0] += f[i][0];
     fcm[1] += f[i][1];
     fcm[2] += f[i][2];
 
     domain->unmap(x[i],image[i],unwrap);
     xcm = b->xcm;
     dx = unwrap[0] - xcm[0];
     dy = unwrap[1] - xcm[1];
     dz = unwrap[2] - xcm[2];
 
     tcm = b->torque;
     tcm[0] += dy*f[i][2] - dz*f[i][1];
     tcm[1] += dz*f[i][0] - dx*f[i][2];
     tcm[2] += dx*f[i][1] - dy*f[i][0];
   }
   
   // extended particles add their torque to torque of body
 
   if (extended) {
     double **torque = atom->torque;
 
     for (i = 0; i < nlocal; i++) {
       if (atom2body[i] < 0) continue;
 
       if (eflags[i] & TORQUE) {
         tcm = body[atom2body[i]].torque;
         tcm[0] += torque[i][0];
         tcm[1] += torque[i][1];
         tcm[2] += torque[i][2];
       }
     }
   }
 
   // reverse communicate fcm, torque of all bodies
 
   commflag = FORCE_TORQUE;
   comm->reverse_comm_variable_fix(this);
 
   // include Langevin thermostat forces and torques
 
   if (langflag) {
     for (int ibody = 0; ibody < nlocal_body; ibody++) {
       fcm = body[ibody].fcm;
       fcm[0] += langextra[ibody][0];
       fcm[1] += langextra[ibody][1];
       fcm[2] += langextra[ibody][2];
       tcm = body[ibody].torque;
       tcm[0] += langextra[ibody][3];
       tcm[1] += langextra[ibody][4];
       tcm[2] += langextra[ibody][5];
     }
   }
   
   // update vcm and angmom
   // include Langevin thermostat forces
   // fflag,tflag = 0 for some dimensions in 2d
 
   for (ibody = 0; ibody < nbody; ibody++) {
     Body *b = &body[ibody];
 
     // update vcm by 1/2 step
 
     dtfm = dtf / b->mass;
     if (tstat_flag || pstat_flag) {
       b->vcm[0] *= scale_t[0];
       b->vcm[1] *= scale_t[1];
       b->vcm[2] *= scale_t[2];
     }
 
     b->vcm[0] += dtfm * b->fcm[0];
     b->vcm[1] += dtfm * b->fcm[1];
     b->vcm[2] += dtfm * b->fcm[2];
 
     // update conjqm, then transform to angmom, set velocity again
     // virial is already setup from initial_integrate
     
     MathExtra::transpose_matvec(b->ex_space,b->ey_space,
                                 b->ez_space,b->torque,tbody);
     MathExtra::quatvec(b->quat,tbody,fquat);
     
     if (tstat_flag || pstat_flag) {
       b->conjqm[0] = scale_r * b->conjqm[0] + dtf2 * fquat[0];
       b->conjqm[1] = scale_r * b->conjqm[1] + dtf2 * fquat[1];
       b->conjqm[2] = scale_r * b->conjqm[2] + dtf2 * fquat[2];
       b->conjqm[3] = scale_r * b->conjqm[3] + dtf2 * fquat[3];
     } else {
       b->conjqm[0] += dtf2 * fquat[0];
       b->conjqm[1] += dtf2 * fquat[1];
       b->conjqm[2] += dtf2 * fquat[2];
       b->conjqm[3] += dtf2 * fquat[3];
     }
 
     MathExtra::invquatvec(b->quat,b->conjqm,mbody);
     MathExtra::matvec(b->ex_space,b->ey_space,b->ez_space,mbody,b->angmom);
     
     b->angmom[0] *= 0.5;
     b->angmom[1] *= 0.5;
     b->angmom[2] *= 0.5;  
     
     MathExtra::angmom_to_omega(b->angmom,b->ex_space,b->ey_space,
                                b->ez_space,b->inertia,b->omega);
   }
 
   // forward communicate updated info of all bodies
 
   commflag = FINAL;
   comm->forward_comm_variable_fix(this);
 
   // accumulate translational and rotational kinetic energies
 
   if (pstat_flag) {
 
     akin_t = akin_r = 0.0;
     for (int ibody = 0; ibody < nlocal_body; ibody++) {
       Body *b = &body[ibody];  
       akin_t += b->mass*(b->vcm[0]*b->vcm[0] + b->vcm[1]*b->vcm[1] +
         b->vcm[2]*b->vcm[2]);
       akin_r += b->angmom[0]*b->omega[0] + b->angmom[1]*b->omega[1] +
         b->angmom[2]*b->omega[2];
     }
     
     double ke[2],keall[2];
     ke[0] = akin_t;
     ke[1] = akin_r;
     MPI_Allreduce(ke,keall,2,MPI_DOUBLE,MPI_SUM,world);
     akin_t = keall[0];
     akin_r = keall[1];
   }
 
   // set velocity/rotation of atoms in rigid bodies
   // virial is already setup from initial_integrate
 
   set_v();
 
   // compute temperature and pressure tensor
   // couple to compute current pressure components
   // trigger virial computation on next timestep
   
   if (tcomputeflag) t_current = temperature->compute_scalar();
   if (pstat_flag) {
     if (pstyle == ISO) pressure->compute_scalar();
     else pressure->compute_vector();
     couple();
     pressure->addstep(update->ntimestep+1);
   }
 
   if (pstat_flag) nh_epsilon_dot();  
   
   // update eta_dot_t and eta_dot_r
   // update eta_dot_b
       
   if (tstat_flag) nhc_temp_integrate();
   if (pstat_flag) nhc_press_integrate();  
 }
 
 /* ----------------------------------------------------------------------
    advance the translational (eta_t) and rotational (eta_r) thermostat
    chains: refresh the chain masses from t_target, then run
    t_iter*t_order sub-steps that update eta_dot_*, eta_* and f_eta_*
    reads akin_t/akin_r; does not modify any rigid-body quantity itself
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::nhc_temp_integrate()
 {
   int i,j,k;
   double kt,gfkt_t,gfkt_r,tmp,ms,s,s2;
   
   // kt = target thermal energy, gfkt_* = nf_* times kt
   
   kt = boltz * t_target;
   gfkt_t = nf_t * kt;
   gfkt_r = nf_r * kt;
 
   // update thermostat masses
   // first chain element is weighted by nf_t/nf_r, the rest share t_mass
   
   double t_mass = boltz * t_target / (t_freq * t_freq);
   q_t[0] = nf_t * t_mass;
   q_r[0] = nf_r * t_mass;
   for (i = 1; i < t_chain; i++)
     q_t[i] = q_r[i] = t_mass;
   
   // update force of thermostats coupled to particles
   // driven by the difference between akin_* (converted by mvv2e) and gfkt_*
   
   f_eta_t[0] = (akin_t * mvv2e - gfkt_t) / q_t[0];
   f_eta_r[0] = (akin_r * mvv2e - gfkt_r) / q_r[0];
   
   // multiple timestep iteration
   // wdti1/wdti2/wdti4 are per-order step weights indexed by j
   
   for (i = 0; i < t_iter; i++) {
     for (j = 0; j < t_order; j++) {
   
       // update thermostat velocities half step
       // last chain element first, then sweep back toward element 0
   
       eta_dot_t[t_chain-1] += wdti2[j] * f_eta_t[t_chain-1];
       eta_dot_r[t_chain-1] += wdti2[j] * f_eta_r[t_chain-1];
       
       // each element is damped by exp(-2*tmp) where tmp depends on the
       // velocity of its successor, then kicked by its own force
       // NOTE(review): maclaurin_series() is defined elsewhere; presumably
       // a series for sinh(x)/x - confirm
       
       for (k = 1; k < t_chain; k++) {
         tmp = wdti4[j] * eta_dot_t[t_chain-k];
         ms = maclaurin_series(tmp);
         s = exp(-1.0 * tmp);
         s2 = s * s;
         eta_dot_t[t_chain-k-1] = eta_dot_t[t_chain-k-1] * s2 + 
           wdti2[j] * f_eta_t[t_chain-k-1] * s * ms;
 	
         tmp = wdti4[j] * eta_dot_r[t_chain-k];
         ms = maclaurin_series(tmp);
         s = exp(-1.0 * tmp);
         s2 = s * s;
         eta_dot_r[t_chain-k-1] = eta_dot_r[t_chain-k-1] * s2 + 
           wdti2[j] * f_eta_r[t_chain-k-1] * s * ms;
       }
       
       // update thermostat positions a full step
       
       for (k = 0; k < t_chain; k++) {
         eta_t[k] += wdti1[j] * eta_dot_t[k];
         eta_r[k] += wdti1[j] * eta_dot_r[k];
       }
       
       // update thermostat forces 
       // element k is driven by the kinetic term of element k-1
       // f_eta_*[0] is not recomputed here
       
       for (k = 1; k < t_chain; k++) {
         f_eta_t[k] = q_t[k-1] * eta_dot_t[k-1] * eta_dot_t[k-1] - kt;
         f_eta_t[k] /= q_t[k];
         f_eta_r[k] = q_r[k-1] * eta_dot_r[k-1] * eta_dot_r[k-1] - kt;
         f_eta_r[k] /= q_r[k];
       }
       
       // update thermostat velocities a full step
       // forward sweep: after element k is updated, the force on
       // element k+1 is refreshed from the new eta_dot[k]
       
       for (k = 0; k < t_chain-1; k++) {
         tmp = wdti4[j] * eta_dot_t[k+1];
         ms = maclaurin_series(tmp);
         s = exp(-1.0 * tmp);
         s2 = s * s;
         eta_dot_t[k] = eta_dot_t[k] * s2 + wdti2[j] * f_eta_t[k] * s * ms;
         tmp = q_t[k] * eta_dot_t[k] * eta_dot_t[k] - kt;
         f_eta_t[k+1] = tmp / q_t[k+1];
 	
         tmp = wdti4[j] * eta_dot_r[k+1];
         ms = maclaurin_series(tmp);
         s = exp(-1.0 * tmp);
         s2 = s * s;
         eta_dot_r[k] = eta_dot_r[k] * s2 + wdti2[j] * f_eta_r[k] * s * ms;
         tmp = q_r[k] * eta_dot_r[k] * eta_dot_r[k] - kt;
           f_eta_r[k+1] = tmp / q_r[k+1];
       }
       
       // last chain element has no successor: plain kick, no damping
       
       eta_dot_t[t_chain-1] += wdti2[j] * f_eta_t[t_chain-1];
       eta_dot_r[t_chain-1] += wdti2[j] * f_eta_r[t_chain-1];
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    advance the thermostat chain coupled to the barostat (eta_b)
    refreshes q_b and epsilon_mass from t_target, updates eta_dot_b,
    eta_b and f_eta_b, and rescales epsilon_dot by exp(-dtq*eta_dot_b[0])
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::nhc_press_integrate()
 {
   int i,k;
   double tmp,s,s2,ms,kecurrent;
   double kt = boltz * t_target;
   double lkt_press = kt;
   
   // update thermostat masses
   // the loop index below shadows the function-scope i declared above
   // forces on elements 1..p_chain-1 are refreshed in the same pass
   
   double tb_mass = kt / (p_freq_max * p_freq_max);
   q_b[0] = tb_mass;
   for (int i = 1; i < p_chain; i++) {
     q_b[i] = tb_mass;
     f_eta_b[i] = q_b[i-1] * eta_dot_b[i-1] * eta_dot_b[i-1] - kt;
     f_eta_b[i] /= q_b[i];
   }
      
   // update forces acting on thermostat
   // epsilon_mass is recomputed only for dimensions with p_flag set
   
   kecurrent = 0.0;
   for (i = 0; i < 3; i++) 
     if (p_flag[i]) {
       epsilon_mass[i] = (g_f + dimension) * kt / (p_freq[i] * p_freq[i]);
       kecurrent += epsilon_mass[i] * epsilon_dot[i] * epsilon_dot[i];
     }
 
   f_eta_b[0] = (kecurrent - lkt_press) / q_b[0];
   
   // update thermostat velocities a half step
   // last chain element first, then sweep back toward element 0
   
   eta_dot_b[p_chain-1] += 0.5 * dtq * f_eta_b[p_chain-1];
   
   // tmp already carries a factor 0.5, so s = exp(-0.25*dtq*eta_dot_b[..])
   // NOTE(review): maclaurin_series() is defined elsewhere; presumably
   // a series for sinh(x)/x - confirm
   
   for (k = 0; k < p_chain-1; k++) {
     tmp = 0.5 * dtq * eta_dot_b[p_chain-k-1];
     ms = maclaurin_series(tmp);
     s = exp(-0.5 * tmp);
     s2 = s * s;
     eta_dot_b[p_chain-k-2] = eta_dot_b[p_chain-k-2] * s2 + 
       dtq * f_eta_b[p_chain-k-2] * s * ms;
   }
   
   // update thermostat positions
   
   for (k = 0; k < p_chain; k++)
     eta_b[k] += dtv * eta_dot_b[k];
   
   // update epsilon dot
   // damped by the first chain element, only where p_flag is set
   
   s = exp(-1.0 * dtq * eta_dot_b[0]);
   for (i = 0; i < 3; i++) 
     if (p_flag[i]) epsilon_dot[i] *= s;
       
   // recompute the force on element 0 from the rescaled epsilon_dot
       
   kecurrent = 0.0;
   for (i = 0; i < 3; i++) 
     if (p_flag[i]) 
       kecurrent += epsilon_mass[i] * epsilon_dot[i] * epsilon_dot[i];
  
   f_eta_b[0] = (kecurrent - lkt_press) / q_b[0];
   
   // update thermostat velocities a full step
   // forward sweep: after element k is updated, the force on
   // element k+1 is refreshed from the new eta_dot_b[k]
   
   for (k = 0; k < p_chain-1; k++) {
     tmp = 0.5 * dtq * eta_dot_b[k+1];
     ms = maclaurin_series(tmp);
     s = exp(-0.5 * tmp);
     s2 = s * s;
     eta_dot_b[k] = eta_dot_b[k] * s2 + dtq * f_eta_b[k] * s * ms;
     tmp = q_b[k] * eta_dot_b[k] * eta_dot_b[k] - kt;
     f_eta_b[k+1] = tmp / q_b[k+1];
   }
   
   // last chain element has no successor: plain kick, no damping
   
   eta_dot_b[p_chain-1] += 0.5 * dtq * f_eta_b[p_chain-1];
 
 }
 
 /* ---------------------------------------------------------------------- 
    compute kinetic energy in the extended Hamiltonian
    conserved quantity = sum of returned energy and potential energy
    sums rigid-body translational and rotational terms over the local
    bodies, reduces them over all procs, then adds the thermostat
    (tstat_flag) and barostat (pstat_flag) contributions
 -----------------------------------------------------------------------*/
 
 double FixRigidNHSmall::compute_scalar()
 {
-  int i,k,ibody;
+  int i,k;
   double kt = boltz * t_target;
   double energy,ke_t,ke_q,tmp,Pkq[4];
   
-  double *vcm,*inertia,*quat;
+  double *vcm,*quat;
   
   // compute the kinetic parts of H_NVE in Kamberaj et al (JCP 2005, pp 224114)
   
   // translational and rotational kinetic energies
 
   ke_t = 0.0;
   ke_q = 0.0;
   
   // the loop index below shadows the function-scope i declared above
   // ke_t accumulates mass*|vcm|^2 with no factor of 1/2 applied here
   
   for (int i = 0; i < nlocal_body; i++) {
     vcm = body[i].vcm;
     quat = body[i].quat;
     ke_t += body[i].mass * (vcm[0]*vcm[0] + vcm[1]*vcm[1] +
       vcm[2]*vcm[2]);
     
     // one term per axis k = 1,2,3
     // Pkq is the same permutation of quat that no_squish_rotate() builds
     
     for (k = 1; k < 4; k++) {
       if (k == 1) {
         Pkq[0] = -quat[1];
         Pkq[1] =  quat[0];
         Pkq[2] =  quat[3];
         Pkq[3] = -quat[2];
       } else if (k == 2) {
         Pkq[0] = -quat[2];
         Pkq[1] = -quat[3];
         Pkq[2] =  quat[0];
         Pkq[3] =  quat[1];
       } else if (k == 3) {
         Pkq[0] = -quat[3];
         Pkq[1] =  quat[2];
         Pkq[2] = -quat[1];
         Pkq[3] =  quat[0];      
       }
    
       // square of the projection of conjqm onto Pkq
    
       tmp = body[i].conjqm[0]*Pkq[0] + body[i].conjqm[1]*Pkq[1] +
         body[i].conjqm[2]*Pkq[2] + body[i].conjqm[3]*Pkq[3];
       tmp *= tmp;
     
       // an axis whose moment of inertia is below 1e-6 in magnitude
       // contributes nothing, which also avoids dividing by ~0
     
       if (fabs(body[i].inertia[k-1]) < 1e-6) tmp = 0.0;
       else tmp /= (8.0 * body[i].inertia[k-1]); 
       ke_q += tmp;
     }
   }
   
   // sum both partial energies over all procs in a single reduction
   
   double ke[2],keall[2];
   ke[0] = ke_t;
   ke[1] = ke_q;
   MPI_Allreduce(ke,keall,2,MPI_DOUBLE,MPI_SUM,world);
   ke_t = keall[0];
   ke_q = keall[1];
   
   energy = (ke_t + ke_q) * mvv2e;
   
   if (tstat_flag) {
   
     // thermostat chain energy: from equation 12 in Kamberaj et al (JCP 2005)
     // first chain element is weighted by nf_t/nf_r, the others by 1
 
     energy += kt * (nf_t * eta_t[0] + nf_r * eta_r[0]);
   
     for (i = 1; i < t_chain; i++) 
       energy += kt * (eta_t[i] + eta_r[i]);
   
     for (i = 0;  i < t_chain; i++) {
       energy += 0.5 * q_t[i] * (eta_dot_t[i] * eta_dot_t[i]);
       energy += 0.5 * q_r[i] * (eta_dot_r[i] * eta_dot_r[i]);
     }
   }
   
   if (pstat_flag) {
 
     // using equation 22 in Kamberaj et al for H_NPT
     // NOTE(review): all three dimensions are summed here, whereas
     // nhc_press_integrate() only sets epsilon_mass where p_flag is set -
     // confirm the remaining entries are initialized elsewhere
 
     for (i = 0; i < 3; i++)
       energy += 0.5 * epsilon_mass[i] * epsilon_dot[i] * epsilon_dot[i];
   
     // box volume (area in 2d)
   
     double vol;
     if (dimension == 2) vol = domain->xprd * domain->yprd;
     else vol = domain->xprd * domain->yprd * domain->zprd;
 
     // mean of the three target pressures times volume, scaled by 1/nktv2p
 
     double p0 = (p_target[0] + p_target[1] + p_target[2]) / 3.0;
     energy += p0 * vol / nktv2p;
   
     for (i = 0;  i < p_chain; i++) {
       energy += kt * eta_b[i];
       energy += 0.5 * q_b[i] * (eta_dot_b[i] * eta_dot_b[i]);
     }
   }
   
   return energy;
 }
 
 /* ----------------------------------------------------------------------
    fill p_current[] from the pressure compute according to pstyle/pcouple
    ISO uses the scalar pressure for all three components
    XYZ, XY, YZ, XZ replace the coupled components by their average
    any other pcouple value copies pressure->vector[0..2] unchanged
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::couple()
 {
   const double *pt = pressure->vector;
 
   // isotropic style: one scalar for every dimension
 
   if (pstyle == ISO) {
     const double p_iso = pressure->scalar;
     p_current[0] = p_iso;
     p_current[1] = p_iso;
     p_current[2] = p_iso;
     return;
   }
 
   // start from the uncoupled values, then overwrite the coupled ones
 
   double px = pt[0];
   double py = pt[1];
   double pz = pt[2];
 
   if (pcouple == XYZ) {
     px = py = pz = 1.0/3.0 * (pt[0] + pt[1] + pt[2]);
   } else if (pcouple == XY) {
     px = py = 0.5 * (pt[0] + pt[1]);
   } else if (pcouple == YZ) {
     py = pz = 0.5 * (pt[1] + pt[2]);
   } else if (pcouple == XZ) {
     px = pz = 0.5 * (pt[0] + pt[2]);
   }
 
   p_current[0] = px;
   p_current[1] = py;
   p_current[2] = pz;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixRigidNHSmall::remap()
 {
   int i;
   double oldlo,oldhi,ctr,expfac;
   
   double **x = atom->x;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
   
   // epsilon is not used, except for book-keeping
   
   for (i = 0; i < 3; i++) epsilon[i] += dtq * epsilon_dot[i];
   
   // convert pertinent atoms and rigid bodies to lamda coords
   
   if (allremap) domain->x2lamda(nlocal);
   else {
     for (i = 0; i < nlocal; i++)
       if (mask[i] & dilate_group_bit)
         domain->x2lamda(x[i],x[i]);
   }
   
   if (nrigidfix)
     for (i = 0; i < nrigidfix; i++)
       modify->fix[rfix[i]]->deform(0);
   
   // reset global and local box to new size/shape
   
   for (i = 0; i < 3; i++) {
     if (p_flag[i]) {
       oldlo = domain->boxlo[i];
       oldhi = domain->boxhi[i];
       ctr = 0.5 * (oldlo + oldhi);
       expfac = exp(dtq * epsilon_dot[i]);
       domain->boxlo[i] = (oldlo-ctr)*expfac + ctr;
       domain->boxhi[i] = (oldhi-ctr)*expfac + ctr;
     }
   }
 
   domain->set_global_box();
   domain->set_local_box();
   
   // convert pertinent atoms and rigid bodies back to box coords
   
   if (allremap) domain->lamda2x(nlocal);
   else {
     for (i = 0; i < nlocal; i++)
       if (mask[i] & dilate_group_bit)
         domain->lamda2x(x[i],x[i]);
   }
   
   if (nrigidfix)
     for (i = 0; i< nrigidfix; i++)
       modify->fix[rfix[i]]->deform(1);
 }
 
 /* ----------------------------------------------------------------------
    set t_target by linear interpolation between t_start and t_stop
    the weight is the elapsed fraction of the run
    (ntimestep-beginstep)/(endstep-beginstep)
 -----------------------------------------------------------------------*/
 
 void FixRigidNHSmall::compute_temp_target()
 {
   const double elapsed = update->ntimestep - update->beginstep;
 
   // on the first step the fraction is 0 and the division is skipped
 
   double frac = 0.0;
   if (elapsed != 0.0)
     frac = elapsed / (update->endstep - update->beginstep);
 
   t_target = t_start + frac * (t_stop-t_start);
 }
 
 /* ----------------------------------------------------------------------
    set p_target[] by linear interpolation between p_start and p_stop
    for every dimension with p_flag set, and store their sum divided
    by pdim in p_hydro
 -----------------------------------------------------------------------*/
 
 void FixRigidNHSmall::compute_press_target()
 {
   const double elapsed = update->ntimestep - update->beginstep;
 
   // on the first step the fraction is 0 and the division is skipped
 
   double frac = 0.0;
   if (elapsed != 0.0)
     frac = elapsed / (update->endstep - update->beginstep);
 
   p_hydro = 0.0;
   for (int d = 0; d < 3; d++) {
     if (!p_flag[d]) continue;
     p_target[d] = p_start[d] + frac * (p_stop[d]-p_start[d]);
     p_hydro += p_target[d];
   }
   p_hydro /= pdim;
 }
 
 /* ----------------------------------------------------------------------
    apply evolution operators to quat, quat momentum
    see Miller paper cited in fix rigid/nvt and fix rigid/npt
    k = 1,2,3 selects the permutation and the entry inertia[k-1]
    p (quat momentum) and q (quat) are both updated in place
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::no_squish_rotate(int k, double *p, double *q,
                                        double *inertia, double dt)
 {
   double perm_p[4],perm_q[4];
 
   // apply permutation operator k to p and q
   // any other k leaves both arrays unset, as there is no such operator
 
   switch (k) {
   case 1:
     perm_q[0] = -q[1];  perm_q[1] =  q[0];
     perm_q[2] =  q[3];  perm_q[3] = -q[2];
     perm_p[0] = -p[1];  perm_p[1] =  p[0];
     perm_p[2] =  p[3];  perm_p[3] = -p[2];
     break;
   case 2:
     perm_q[0] = -q[2];  perm_q[1] = -q[3];
     perm_q[2] =  q[0];  perm_q[3] =  q[1];
     perm_p[0] = -p[2];  perm_p[1] = -p[3];
     perm_p[2] =  p[0];  perm_p[3] =  p[1];
     break;
   case 3:
     perm_q[0] = -q[3];  perm_q[1] =  q[2];
     perm_q[2] = -q[1];  perm_q[3] =  q[0];
     perm_p[0] = -p[3];  perm_p[1] =  p[2];
     perm_p[2] = -p[1];  perm_p[3] =  p[0];
     break;
   default:
     break;
   }
 
   // rotation rate: p . (Pk q) / (4 I_k), forced to zero when the
   // moment of inertia is below 1e-6 in magnitude
 
   double rate = p[0]*perm_q[0] + p[1]*perm_q[1] + p[2]*perm_q[2] +
     p[3]*perm_q[3];
   const double moment = inertia[k-1];
   if (fabs(moment) < 1e-6) rate *= 0.0;
   else rate /= 4.0 * moment;
 
   const double angle = dt * rate;
   const double cosine = cos(angle);
   const double sine = sin(angle);
 
   // advance p first, then q, each mixed with its permuted copy
 
   for (int m = 0; m < 4; m++)
     p[m] = cosine*p[m] + sine*perm_p[m];
 
   for (int m = 0; m < 4; m++)
     q[m] = cosine*q[m] + sine*perm_q[m];
 }
 
 /* ----------------------------------------------------------------------
    update epsilon_dot for every dimension with p_flag set
    also refreshes mtk_term1 (from akin_t+akin_r) and mtk_term2
    (sum of the updated epsilon_dot divided by g_f)
 -----------------------------------------------------------------------*/
 
 void FixRigidNHSmall::nh_epsilon_dot()
 {
   // box volume (area in 2d)
 
   double vol = domain->xprd*domain->yprd;
   if (dimension != 2) vol *= domain->zprd;
 
   // MTK terms
 
   mtk_term1 = (akin_t + akin_r) * mvv2e / g_f;
 
   // damping factor from the first element of the eta_b chain
 
   const double damp = exp(-1.0 * dtq * eta_dot_b[0]);
 
   for (int d = 0; d < 3; d++) {
     if (!p_flag[d]) continue;
 
     // force from the pressure mismatch plus mtk_term1, per unit mass
 
     double accel = (p_current[d]-p_hydro)*vol / nktv2p + mtk_term1;
     accel /= epsilon_mass[d];
 
     // kick first, then damp
 
     epsilon_dot[d] += dtq * accel;
     epsilon_dot[d] *= damp;
   }
 
   mtk_term2 = 0.0;
   for (int d = 0; d < 3; d++) {
     if (!p_flag[d]) continue;
     mtk_term2 += epsilon_dot[d];
   }
   mtk_term2 /= g_f;
 }
 
 /* ----------------------------------------------------------------------
    pack entire state of Fix into one write 
    layout: tstat_flag [t_chain, then eta_t,eta_r,eta_dot_t,eta_dot_r
    per chain element], pstat_flag [epsilon(3), epsilon_dot(3), p_chain,
    then eta_b,eta_dot_b per chain element]
    nothing is written when neither flag is set; only proc 0 writes
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::write_restart(FILE *fp)
 {
   if (!tstat_flag && !pstat_flag) return;
 
   // count the doubles: the two flags are always stored
 
   int nsize = 2;
   if (tstat_flag) nsize += 1 + 4*t_chain;    // t_chain + 4 values/element
   if (pstat_flag) nsize += 7 + 2*p_chain;    // eps(3), eps_dot(3), p_chain
                                              //   + 2 values/element
 
   double *list = NULL;
   memory->create(list,nsize,"rigid_nh:list");
 
   // out walks through the buffer as values are appended
 
   double *out = list;
 
   *out++ = tstat_flag;
   if (tstat_flag) {
     *out++ = t_chain;
     for (int ich = 0; ich < t_chain; ich++) {
       *out++ = eta_t[ich];
       *out++ = eta_r[ich];
       *out++ = eta_dot_t[ich];
       *out++ = eta_dot_r[ich];
     }
   }
 
   *out++ = pstat_flag;
   if (pstat_flag) {
     for (int d = 0; d < 3; d++) *out++ = epsilon[d];
     for (int d = 0; d < 3; d++) *out++ = epsilon_dot[d];
 
     *out++ = p_chain;
     for (int ich = 0; ich < p_chain; ich++) {
       *out++ = eta_b[ich];
       *out++ = eta_dot_b[ich];
     }
   }
 
   // record = byte count as an int, followed by the doubles
 
   if (comm->me == 0) {
     int nbytes = nsize * sizeof(double);
     fwrite(&nbytes,sizeof(int),1,fp);
     fwrite(list,sizeof(double),nsize,fp);
   }
 
   memory->destroy(list);
 }
 
 /* ----------------------------------------------------------------------
    use state info from restart file to restart the Fix 
    buf holds doubles in the order produced by write_restart()
    a stored chain is only adopted when the matching flag is set now
    and the stored chain length equals the current one; otherwise
    its values are skipped
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::restart(char *buf)
 {
   double *in = (double *) buf;
 
   // thermostat section
 
   const int had_tstat = static_cast<int> (*in++);
   if (had_tstat) {
     const int nstored = static_cast<int> (*in++);
     if (tstat_flag && nstored == t_chain) {
       for (int ich = 0; ich < t_chain; ich++) {
         eta_t[ich] = *in++;
         eta_r[ich] = *in++;
         eta_dot_t[ich] = *in++;
         eta_dot_r[ich] = *in++;
       }
     } else {
       in += 4*nstored;
     }
   }
 
   // barostat section: epsilon and epsilon_dot are restored whenever
   // the file contains them, the chain only if it matches
 
   const int had_pstat = static_cast<int> (*in++);
   if (had_pstat) {
     for (int d = 0; d < 3; d++) epsilon[d] = *in++;
     for (int d = 0; d < 3; d++) epsilon_dot[d] = *in++;
 
     const int nstored = static_cast<int> (*in++);
     if (pstat_flag && nstored == p_chain) {
       for (int ich = 0; ich < p_chain; ich++) {
         eta_b[ich] = *in++;
         eta_dot_b[ich] = *in++;
       }
     } else {
       in += 2*nstored;
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixRigidNHSmall::modify_param(int narg, char **arg)
 {
   if (strcmp(arg[0],"temp") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
     if (!pstat_flag) error->all(FLERR,"Illegal fix_modify command");
     if (tcomputeflag) {
       modify->delete_compute(id_temp);
       tcomputeflag = 0;
     }
     delete [] id_temp;
     int n = strlen(arg[1]) + 1;
     id_temp = new char[n];
     strcpy(id_temp,arg[1]);
 
     int icompute = modify->find_compute(arg[1]);
     if (icompute < 0)
       error->all(FLERR,"Could not find fix_modify temperature ID");
     temperature = modify->compute[icompute];
 
     if (temperature->tempflag == 0)
       error->all(FLERR,
                  "Fix_modify temperature ID does not compute temperature");
     if (temperature->igroup != 0 && comm->me == 0)
       error->warning(FLERR,"Temperature for fix modify is not for group all");
 
     // reset id_temp of pressure to new temperature ID
 
     if (pstat_flag) {
       icompute = modify->find_compute(id_press);
       if (icompute < 0)
         error->all(FLERR,"Pressure ID for fix modify does not exist");
       modify->compute[icompute]->reset_extra_compute_fix(id_temp);
     }
 
     return 2;
 
   } else if (strcmp(arg[0],"press") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
     if (!pstat_flag) error->all(FLERR,"Illegal fix_modify command");
     if (pcomputeflag) {
       modify->delete_compute(id_press);
       pcomputeflag = 0;
     }
     delete [] id_press;
     int n = strlen(arg[1]) + 1;
     id_press = new char[n];
     strcpy(id_press,arg[1]);
 
     int icompute = modify->find_compute(arg[1]);
     if (icompute < 0) error->all(FLERR,"Could not find fix_modify pressure ID");
     pressure = modify->compute[icompute];
 
     if (pressure->pressflag == 0)
       error->all(FLERR,"Fix_modify pressure ID does not compute pressure");
     return 2;
   }
 
   return 0;
 }
 
 /* ----------------------------------------------------------------------
    allocate the per-element chain arrays
    t_chain entries each for the thermostat arrays if tstat_flag is set
    p_chain entries each for the barostat arrays if pstat_flag is set
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::allocate_chain()
 {
   if (tstat_flag) {
     const int nt = t_chain;
 
     // masses, positions, velocities and forces of both chains
 
     q_t = new double[nt];
     q_r = new double[nt];
     eta_t = new double[nt];
     eta_r = new double[nt];
     eta_dot_t = new double[nt];
     eta_dot_r = new double[nt];
     f_eta_t = new double[nt];
     f_eta_r = new double[nt];
   }
 
   if (pstat_flag) {
     const int np = p_chain;
 
     q_b = new double[np];
     eta_b = new double[np];
     eta_dot_b = new double[np];
     f_eta_b = new double[np];
   }
 }
 
 /* ----------------------------------------------------------------------
    set both ends of the temperature ramp to t_new
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::reset_target(double t_new)
 {
   t_stop = t_new;
   t_start = t_stop;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixRigidNHSmall::allocate_order()
 {
   w = new double[t_order];
   wdti1 = new double[t_order];
   wdti2 = new double[t_order];
   wdti4 = new double[t_order];
 }
 
 /* ----------------------------------------------------------------------
    free the chain arrays created by allocate_chain()
    guarded by the same tstat_flag/pstat_flag tests
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::deallocate_chain()
 {
   if (tstat_flag) {
 
     // both thermostat chains
 
     delete[] q_t;
     delete[] q_r;
     delete[] eta_t;
     delete[] eta_r;
     delete[] eta_dot_t;
     delete[] eta_dot_r;
     delete[] f_eta_t;
     delete[] f_eta_r;
   }
 
   if (pstat_flag) {
 
     // barostat chain
 
     delete[] q_b;
     delete[] eta_b;
     delete[] eta_dot_b;
     delete[] f_eta_b;
   }
 }
 
 /* ----------------------------------------------------------------------
    free the per-order weight arrays created by allocate_order()
 ------------------------------------------------------------------------- */
 
 void FixRigidNHSmall::deallocate_order()
 {
   delete[] w;
   delete[] wdti1;
   delete[] wdti2;
   delete[] wdti4;
 }
 
diff --git a/src/USER-LB/fix_lb_fluid.cpp b/src/USER-LB/fix_lb_fluid.cpp
index b75452f26..0161fd658 100644
--- a/src/USER-LB/fix_lb_fluid.cpp
+++ b/src/USER-LB/fix_lb_fluid.cpp
@@ -1,3368 +1,3367 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 
 /* ----------------------------------------------------------------------
    Contributing authors: Frances Mackay, Santtu Ollila, Colin Denniston (UWO)
 ------------------------------------------------------------------------- */
 
 #include "fix_lb_fluid.h"
 #include "math.h"
 #include "mpi.h"
 #include "stdlib.h"
 #include "stdio.h"
 #include "string.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 #include "domain.h"
 #include "atom.h"
 #include <iostream>
 #include <iomanip>
 #include "group.h"
 #include "random_mars.h"
 #include "update.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 // file-local constant, fixed at zero
 // NOTE(review): its use sites are outside the visible part of this file;
 // presumably a lattice-Boltzmann model parameter - confirm before changing
 static const double kappa_lb=0.0;
-static const double sqrt2=1.41421356237310;
 
 FixLbFluid::FixLbFluid(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg)
 {
   //=====================================================================================================
   //  Sample inputfile call:
   // fix # group lb/fluid nevery typeLB viscosity densityinit_real
   //  
   //  where: nevery:            call this fix every nevery timesteps. 
   //		                 (nevery generally set to 1).
   //         typeLB:            there are two different integrators 
   //                             in the code labelled "1" and "2".
   //         viscosity:         the viscosity of the fluid. 
   //         densityinit_real:  the density of the fluid.
   //
   // optional arguments:
   //  "setArea" type node_area:                       set the surface area per node associated with a
   //                                                   given atom type.  By default the surface area 
   //                                                   is set at 1.0*dx_lb^2.
   //  "setGamma" gamma:                               specify a user-defined value for the force
   //                                                   coupling constant, instead of using the default
   //                                                   value.
   //  "scaleGamma" type scale_factor:                 scale the user provided force coupling constant
   //                                                   by the factor, scale_factor, for the given atom
   //                                                   type.
   //  "dx" dx_lb:                                     the lattice-Boltzmann grid spacing.
   //  "dm" dm_lb:                                     the lattice-Boltzmann mass unit.
   //  "a0" a_0_real:                                  the square of the sound speed in the fluid.
   //  "noise" Temperature seed:                       include noise in the system.  
   //                                                   Temperature is the temperature for the fluid.
   //                                                   seed is the seed for the random number generator.  
   //  "calcforce" N group:                            print the force acting on a given group every 
   //                                                   N timesteps.
   //  "trilinear":                                    use the trilinear interpolation stencil.
   //  "read_restart" restart_file:                    restart a fluid run from restart_file.
   //  "write_restart" N:                              write a fluid restart file every N timesteps.
   //  "zwall_velocity" velocity_bottom velocity_top:  assign velocities to the z-walls
   //                                                   in the system.
   //  "bodyforce" bodyforcex bodyforcey bodyforcez:   add a constant body force to the
   //                                                   fluid.
   //  "printfluid" N:                                 print the fluid density and velocity at each
   //                                                   grid point every N timesteps.
   //  "D3Q19":                                        use the 19 velocity D3Q19 model.  By default,
   //                                                   the 15 velocity D3Q15 model is used.
   //=====================================================================================================
 
   if(narg <7) error->all(FLERR,"Illegal fix lb/fluid command");
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
   nevery = atoi(arg[3]);
   typeLB = atoi(arg[4]);
   viscosity = atof(arg[5]);
   densityinit_real = atof(arg[6]);
   
   // Default values for optional arguments:
   force_diagnostic=0;
   noisestress = 0;
   trilinear_stencil = 0;
   readrestart = 0;
   printrestart = 0;
   bodyforcex = bodyforcey = bodyforcez = 0.0;
   vwtp = vwbt = 0.0;
   printfluid = 0;
   T = 300.0;
   dm_lb = 1.0;
   fixviscouslb = 0;
   setdx = 1;
   seta0 = 1;
   setGamma = 0;
   setArea = 0;
   numvel = 15;
 
   Gamma = NULL;
   NodeArea = NULL;
 
   int iarg = 7;
   while (iarg < narg){
     if(strcmp(arg[iarg],"setArea")==0){
       if(setGamma == 1)
 	error->all(FLERR,"Illegal fix lb/fluid command: cannot use a combination of default and user-specified gamma values");
       setArea = 1;
       int itype = atoi(arg[iarg+1]);
       double areafactor = atof(arg[iarg+2]);
       if(itype <= 0 || itype > atom->ntypes || areafactor < 0.0)
 	error->all(FLERR,"Illegal fix lb/fluid command: setArea");
       if(NodeArea == NULL){
 	NodeArea = new double[atom->ntypes+1];
 	for(int i=0; i<=atom->ntypes; i++) NodeArea[i] = -1.0;
       }
       NodeArea[itype] = areafactor;
       iarg += 3;
     }
     else if(strcmp(arg[iarg],"setGamma")==0){
       if(setArea == 1)
 	error->all(FLERR,"Illegal fix lb/fluid command: cannot use a combination of default and user-specified gamma values");
       setGamma = 1;
       double Gammaone;
       Gammaone = atof(arg[iarg+1]);
       if(Gamma == NULL)
 	Gamma = new double[atom->ntypes+1];
       for(int i=0; i<=atom->ntypes; i++) Gamma[i] = Gammaone;
       iarg += 2;
     }
     else if(strcmp(arg[iarg],"scaleGamma")==0){
       if(setGamma == 0)
 	error->all(FLERR,"Illegal fix lb/fluid command: must set a value for Gamma before scaling it");
       int itype = atoi(arg[iarg+1]);
       double scalefactor = atof(arg[iarg+2]);
       if(itype <= 0 || itype > atom->ntypes || scalefactor < 0.0)
 	error->all(FLERR,"Illegal fix lb/fluid command: scaleGamma");
       Gamma[itype] *= scalefactor;
       iarg += 3;
     }     
     else if(strcmp(arg[iarg],"dx")==0){
       dx_lb = atof(arg[iarg+1]);
       iarg += 2;
       setdx = 0;
     }
     else if(strcmp(arg[iarg],"dm")==0){
       dm_lb = atof(arg[iarg+1]);
       iarg += 2;
     }
     else if(strcmp(arg[iarg],"a0")==0){
       a_0_real = atof(arg[iarg+1]);
       iarg += 2;
       seta0 = 0;
     }
     else if(strcmp(arg[iarg],"noise")== 0){
       noisestress = 1;
       T = atof(arg[iarg+1]);
       seed = atoi(arg[iarg+2]);
       iarg += 3;
     }
     else if(strcmp(arg[iarg],"calcforce")==0){
       force_diagnostic = atoi(arg[iarg+1]);
       if(force_diagnostic % nevery != 0){
 	char str[200];
 	sprintf(str,"Requesting calcforce output every %i timesteps. Will only print output for those timesteps that are a multiple of nevery.",force_diagnostic);
 	error->warning(FLERR,str);	
       }
       igroupforce=group->find(arg[iarg+2]);
       iarg += 3;
     }
     else if(strcmp(arg[iarg],"trilinear")==0){
       trilinear_stencil = 1;
       iarg += 1;
     }
     else if(strcmp(arg[iarg],"read_restart")==0){
       readrestart = 1;  
       int nlength = strlen(arg[iarg+1]) + 16;
       char *filename = new char[nlength];
       strcpy(filename,arg[iarg+1]); 
       MPI_File_open(world,filename,MPI_MODE_RDONLY,MPI_INFO_NULL,&pFileRead);
       delete [] filename;
       iarg += 2;
     }
     else if(strcmp(arg[iarg],"write_restart")==0){
       printrestart = atoi(arg[iarg+1]);
       if(printrestart % nevery != 0){
 	char str[200];
 	sprintf(str,"Requesting restart files every %i timesteps. Will only print restart files for those timesteps that are a multiple of nevery.",printrestart);
 	error->warning(FLERR,str);	
       }
       iarg += 2;
     }
     else if(strcmp(arg[iarg],"zwall_velocity")==0){
       if(domain->periodicity[2]!=0) error->all(FLERR,"fix lb/fluid error: setting \
 a z wall velocity without implementing fixed BCs in z");
       vwbt = atof(arg[iarg+1]);
       vwtp = atof(arg[iarg+2]);
       iarg += 3;
     }
     else if(strcmp(arg[iarg],"bodyforce")==0){
       bodyforcex = atof(arg[iarg+1]);
       bodyforcey = atof(arg[iarg+2]);
       bodyforcez = atof(arg[iarg+3]);
       iarg += 4;
     }
     else if(strcmp(arg[iarg],"printfluid")==0){
       printfluid = atoi(arg[iarg+1]);
       iarg += 2;
     }
     else if(strcmp(arg[iarg],"D3Q19")==0){
       numvel = 19;
       iarg += 1;
     }
     else error->all(FLERR,"Illegal fix lb/fluid command");
   }
 
   //--------------------------------------------------------------------------
   //Choose between D3Q15 and D3Q19 functions:
   //--------------------------------------------------------------------------
   if(numvel == 15){
     initializeLB = &FixLbFluid::initializeLB15;
     equilibriumdist = &FixLbFluid::equilibriumdist15;
     update_full = &FixLbFluid::update_full15;
   }else{
     initializeLB = &FixLbFluid::initializeLB19;
     equilibriumdist = &FixLbFluid::equilibriumdist19;
     update_full = &FixLbFluid::update_full19; 
   }  
   
   //--------------------------------------------------------------------------
   // perform initial allocation of atom-based array register
   // with Atom class
   //--------------------------------------------------------------------------
   hydroF = NULL;
   grow_arrays(atom->nmax);
   atom->add_callback(0);
 
   for(int i=0; i<atom->nmax; i++)
     for(int j=0; j<3; j++)
     hydroF[i][j] = 0.0;
  
   Ng_lb = NULL;
   w_lb = NULL;
   mg_lb = NULL;
   e = NULL;
   feq = NULL;
   feqold = NULL;
   feqn = NULL;
   feqoldn = NULL;
   f_lb = NULL;
   fnew = NULL;
   density_lb = NULL;
   u_lb = NULL;
   altogether = NULL;
   buf = NULL;
   Ff = NULL;
   Fftempx = NULL;
   Fftempy = NULL;
   Fftempz = NULL;
 
   //--------------------------------------------------------------------------
   // Set the lattice Boltzmann dt. 
   //--------------------------------------------------------------------------
   dt_lb=nevery*(update->dt);
 
   //--------------------------------------------------------------------------
   // Set the lattice Boltzmann dx if it wasn't specified in the 
   // input.
   //--------------------------------------------------------------------------
   if(setdx == 1){
     double dx_lb1 = sqrt(3.0*viscosity*dt_lb/densityinit_real);
     double mindomain = std::min(std::min(domain->xprd/comm->procgrid[0],domain->yprd/comm->procgrid[1]),domain->zprd/comm->procgrid[2]);
     dx_lb = mindomain/floor(mindomain/dx_lb1);
 
     if(comm->me==0){
       char str[128];
       sprintf(str,"Setting the lattice-Boltzmann dx to %10.6f",dx_lb);
       error->message(FLERR,str);
     }   
   }
   //--------------------------------------------------------------------------
   // If the area per node has not been set by the user, set to the 
   // default value of dx_lb*dx_lb.
   //--------------------------------------------------------------------------
   if(setGamma == 0){
     if(setArea == 0){ 
       if(comm->me==0){
 	error->message(FLERR,"Assuming an area per node of dx*dx for all of the MD particles.  This should only be used if these all correspond to point particles; otherwise, change using the setArea keyword");
       }
       NodeArea = new double[atom->ntypes+1];
       for(int i=0; i<=atom->ntypes; i++) NodeArea[i] = -1.0;
     }
     for(int i=0; i<=atom->ntypes; i++)
       if(NodeArea[i] < 0.0) NodeArea[i] = dx_lb*dx_lb;
   }
   //--------------------------------------------------------------------------
   // Set a0 if it wasn't specified in the input
   //--------------------------------------------------------------------------
   if(seta0 == 1)
     a_0_real = 0.33333333*dx_lb*dx_lb/dt_lb/dt_lb;
 
   //--------------------------------------------------------------------------
   // Check to make sure that the total number of grid points in each direction
   // divides evenly among the processors in that direction.
   // Shrink-wrapped boundary conditions (which are not permitted by this fix)
   // might cause a problem, so check for this.  A full check of the boundary 
   // conditions is performed in the init routine, rather than here, as it is
   // possible to change the BCs between runs.
   //--------------------------------------------------------------------------
   double aa;
   double eps=1.0e-8;
   aa = (domain->xprd/comm->procgrid[0])/dx_lb;
   if(fabs(aa - floor(aa+0.5)) > eps){
     if(domain->boundary[0][0] != 0){
       error->all(FLERR,"the x-direction must be periodic");
     }
     char errormessage[200];
     sprintf(errormessage,"With dx= %f, and the simulation domain divided by %i processors in the x direction, the simulation domain in the x direction must be a multiple of %f",dx_lb,comm->procgrid[0],comm->procgrid[0]*dx_lb);
     error->all(FLERR,errormessage);
   }
   aa = (domain->yprd/comm->procgrid[1])/dx_lb;
   if(fabs(aa - floor(aa+0.5)) > eps){
     if(domain->boundary[1][0] != 0){
       error->all(FLERR,"the y-direction must be periodic");
     }
     char errormessage[200];
     sprintf(errormessage,"With dx= %f, and the simulation domain divided by %i processors in the y direction, the simulation domain in the y direction must be a multiple of %f",dx_lb,comm->procgrid[1],comm->procgrid[1]*dx_lb);
     error->all(FLERR,errormessage);
   }
   aa = (domain->zprd/comm->procgrid[2])/dx_lb;
   if(fabs(aa - floor(aa+0.5)) > eps){
     if(domain->boundary[2][0] == 2 || domain->boundary[2][0] == 3){
       error->all(FLERR,"the z-direction can not have shrink-wrap boundary conditions");
     }
     char errormessage[200];
     sprintf(errormessage,"With dx= %f, and the simulation domain divided by %i processors in the z direction, the simulation domain in the z direction must be a multiple of %f",dx_lb,comm->procgrid[2],comm->procgrid[2]*dx_lb);
     error->all(FLERR,errormessage);
   }
   
   //--------------------------------------------------------------------------
   // Set the total number of grid points in each direction.
   //--------------------------------------------------------------------------
   Nbx = (int)(domain->xprd/dx_lb + 0.5);
   Nby = (int)(domain->yprd/dx_lb + 0.5);
   Nbz = (int)(domain->zprd/dx_lb + 0.5);
 
   //--------------------------------------------------------------------------
   // Set the number of grid points in each dimension for the local subgrids.
   //--------------------------------------------------------------------------
   subNbx= Nbx/comm->procgrid[0] + 2;
   subNby= Nby/comm->procgrid[1] + 2;
   subNbz= Nbz/comm->procgrid[2] + 2;
  
   //--------------------------------------------------------------------------
   // In order to calculate the fluid forces correctly, need to have at least
   // 5 grid points in each direction per processor.
   //--------------------------------------------------------------------------
   if(subNbx<7 || subNby < 7 || subNbz<7)
     error->all(FLERR,"Need at least 5 grid points in each direction per processor");
 
   // If there are walls in the z-direction add an extra grid point.
   if(domain->periodicity[2]==0){
    Nbz += 1;
    if(comm->myloc[2]==comm->procgrid[2]-1)
      subNbz += 1;
   }
 
   if(comm->me==0){
     char str[128];
     if(setdx == 1){
       sprintf(str,"Using a lattice-Boltzmann grid of %i by %i by %i total grid points.  To change, use the dx keyword",Nbx,Nby,Nbz);
     }else{
       sprintf(str,"Using a lattice-Boltzmann grid of %i by %i by %i total grid points.",Nbx,Nby,Nbz);
     }
     error->message(FLERR,str);   
   }
 
   //--------------------------------------------------------------------------
   // Store the largest value of subNbz, which is needed for allocating the
   // buf array (since a processor with comm->myloc[2] == comm->procgrid[2]-1
   // may have an additional subNbz point as compared with the rest).
   //--------------------------------------------------------------------------
   int subNbzmax;
   MPI_Allreduce(&subNbz,&subNbzmax,1,MPI_INT,MPI_MAX,world);
 
   //--------------------------------------------------------------------------
   // Create the MPI datatypes used to pass portions of arrays:
   // datatypes to pass the f and feq arrays.
   //--------------------------------------------------------------------------
   MPI_Aint lb, sizeofdouble;
   MPI_Type_get_extent(MPI_DOUBLE,&lb,&sizeofdouble);
   
   MPI_Type_vector(subNbz-2,numvel,numvel,MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNby-2,1,numvel*subNbz*sizeofdouble,oneslice,&passxf);
   MPI_Type_commit(&passxf);
  
   MPI_Type_create_hvector(subNbx,1,numvel*subNbz*subNby*sizeofdouble,oneslice,&passyf);
   MPI_Type_commit(&passyf);
   
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNby,numvel,numvel*subNbz,MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNbx,1,numvel*subNbz*subNby*sizeofdouble,oneslice,&passzf);
   MPI_Type_commit(&passzf);
 
   // datatypes to pass the u array, and the Ff array.
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNbz+3,3,3,MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNby+3,1,3*(subNbz+3)*sizeofdouble,oneslice,&passxu);
   MPI_Type_commit(&passxu);
   
   MPI_Type_create_hvector(subNbx+3,1,3*(subNbz+3)*(subNby+3)*sizeofdouble,oneslice,&passyu);
   MPI_Type_commit(&passyu);
   
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNby+3,3,3*(subNbz+3),MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNbx+3,1,3*(subNbz+3)*(subNby+3)*sizeofdouble,oneslice,&passzu);
   MPI_Type_commit(&passzu);
 
   // datatypes to pass the density array.
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNbz+3,1,1,MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNby+3,1,1*(subNbz+3)*sizeofdouble,oneslice,&passxrho);
   MPI_Type_commit(&passxrho);
   
   MPI_Type_create_hvector(subNbx+3,1,1*(subNbz+3)*(subNby+3)*sizeofdouble,oneslice,&passyrho);
   MPI_Type_commit(&passyrho);
   
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNby+3,1,1*(subNbz+3),MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNbx+3,1,1*(subNbz+3)*(subNby+3)*sizeofdouble,oneslice,&passzrho);
   MPI_Type_commit(&passzrho);
 
   // datatypes to receive a portion of the Ff array.
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNbz+3,3,3,MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNby+3,1,3*(subNbz+3)*sizeofdouble,oneslice,&passxtemp);
   MPI_Type_commit(&passxtemp);
   
   MPI_Type_create_hvector(subNbx+3,1,3*(subNbz+3)*5*sizeofdouble,oneslice,&passytemp);
   MPI_Type_commit(&passytemp);
   
   MPI_Type_free(&oneslice);
   MPI_Type_vector(subNby+3,3,3*5,MPI_DOUBLE,&oneslice);
   MPI_Type_commit(&oneslice);
   MPI_Type_create_hvector(subNbx+3,1,3*5*(subNby+3)*sizeofdouble,oneslice,&passztemp);
   MPI_Type_commit(&passztemp);
 
   MPI_Type_free(&oneslice);
 
   //--------------------------------------------------------------------------
   // Allocate the necessary arrays.
   //--------------------------------------------------------------------------
   memory->create(Ng_lb,numvel,"FixLbFluid:Ng_lb");
   memory->create(w_lb,numvel,"FixLbFluid:w_lb");
   memory->create(mg_lb,numvel,numvel,"FixLbFluid:mg_lb");
   memory->create(e,numvel,3,"FixLbFluid:e");
   memory->create(feq,subNbx,subNby,subNbz,numvel,"FixLbFluid:feq");
   if(typeLB == 2){
     memory->create(feqold,subNbx,subNby,subNbz,numvel,"FixLbFluid:feqold");
     memory->create(feqn,subNbx,subNby,subNbz,numvel,"FixLbFluid:feqn");
     memory->create(feqoldn,subNbx,subNby,subNbz,numvel,"FixLbFluid:feqoldn");
   }
   memory->create(f_lb,subNbx,subNby,subNbz,numvel,"FixLbFluid:f_lb");
   memory->create(fnew,subNbx,subNby,subNbz,numvel,"FixLbFluid:fnew");
   memory->create(density_lb,subNbx+3,subNby+3,subNbz+3,"FixLbFluid:density_lb");
   memory->create(u_lb,subNbx+3,subNby+3,subNbz+3,3,"FixLbFluid:u_lb");
   if(printfluid > 0){
     memory->create(buf,subNbx,subNby,subNbzmax,4,"FixLbFluid:buf");
     if(me==0)
       memory->create(altogether,Nbx,Nby,Nbz,4,"FixLbFluid:altogether");
   }
   memory->create(Ff,subNbx+3,subNby+3,subNbz+3,3,"FixLbFluid:Ff");
   memory->create(Fftempx,5,subNby+3,subNbz+3,3,"FixLbFluid:Fftempx");
   memory->create(Fftempy,subNbx+3,5,subNbz+3,3,"FixLbFluid:Fftempy");
   memory->create(Fftempz,subNbx+3,subNby+3,5,3,"FixLbFluid:Fftempz");
 
   if(noisestress==1){
     random = new RanMars(lmp,seed + comm->me);
   }
 
   //--------------------------------------------------------------------------
   // Rescale the variables to Lattice Boltzmann dimensionless units.
   //--------------------------------------------------------------------------
   rescale();
   
   //--------------------------------------------------------------------------
   // Initialize the arrays.
   //--------------------------------------------------------------------------
   (*this.*initializeLB)();
   initialize_feq();
 
 }
 
 //==========================================================================
 //   release everything acquired by the constructor: the per-atom callback
 //   and array, the grid arrays, the noise generator, the per-type coupling
 //   array, and the committed MPI derived datatypes.
 //==========================================================================
 FixLbFluid::~FixLbFluid()
 {
   // Unregister the per-atom callback (registered with atom->add_callback(0)
   // in the constructor) and free the array it was keeping in sync.
   atom->delete_callback(id,0);
   memory->destroy(hydroF);
 
   // Arrays sized by the velocity set (numvel) and by the local subgrid.
   memory->destroy(Ng_lb);
   memory->destroy(w_lb);
   memory->destroy(mg_lb);
   memory->destroy(e);
   memory->destroy(feq);
   // feqold, feqn and feqoldn are only created when typeLB == 2.
   if(typeLB == 2){
     memory->destroy(feqold);
     memory->destroy(feqn);
     memory->destroy(feqoldn);
   }
   memory->destroy(f_lb);
   memory->destroy(fnew);
   memory->destroy(density_lb);
   memory->destroy(u_lb);
   // The output buffers exist only when printfluid > 0, and the gathered
   // array (altogether) is only created on the process with me == 0.
   if(printfluid>0){
     if(me==0)
       memory->destroy(altogether);
     memory->destroy(buf);
   }
   memory->destroy(Ff);
   memory->destroy(Fftempx);
   memory->destroy(Fftempy);
   memory->destroy(Fftempz);
 
   // The random number generator is only constructed when noise is enabled.
   if(noisestress==1){
     delete random;
   }
 
   // Exactly one of the two per-type arrays is released here, selected by
   // setGamma (NodeArea is the one allocated in the constructor when
   // setGamma == 0; Gamma is presumably allocated while parsing the
   // arguments -- confirm against the start of the constructor).
   if(setGamma == 1){
     delete [] Gamma;
   }else{
     delete [] NodeArea;
   }
 
   // The constructor commits twelve derived datatypes and only frees its
   // scratch type (oneslice).  Free the committed ones here so that
   // creating and deleting this fix repeatedly does not leak MPI handles.
   MPI_Type_free(&passxf);
   MPI_Type_free(&passyf);
   MPI_Type_free(&passzf);
   MPI_Type_free(&passxu);
   MPI_Type_free(&passyu);
   MPI_Type_free(&passzu);
   MPI_Type_free(&passxrho);
   MPI_Type_free(&passyrho);
   MPI_Type_free(&passzrho);
   MPI_Type_free(&passxtemp);
   MPI_Type_free(&passytemp);
   MPI_Type_free(&passztemp);
 }
 
 int FixLbFluid::setmask()
 {
   // This fix takes part in three stages of the timestep; report all of
   // them as a single bitmask.
   return INITIAL_INTEGRATE | POST_FORCE | END_OF_STEP;
 }
 
 void FixLbFluid::init(void)
 {
   //--------------------------------------------------------------------------
   // The lattice-Boltzmann timestep was fixed in the constructor; warn if
   // the MD timestep (and hence nevery*dt) is different for this run.
   //--------------------------------------------------------------------------
   const double dt_lb_current = nevery*(update->dt);
   if(fabs(dt_lb_current - dt_lb) > 1.0e-12)
     error->warning(FLERR,"Timestep has changed between runs with the same lb/fluid.  Unphysical results may occur");
 
   //--------------------------------------------------------------------------
   // Recompute the global grid dimensions from the current box and insist
   // that they still match the ones the arrays were allocated for.
   //--------------------------------------------------------------------------
   const int nx_current = (int)(domain->xprd/dx_lb + 0.5);
   const int ny_current = (int)(domain->yprd/dx_lb + 0.5);
   int nz_current = (int)(domain->zprd/dx_lb + 0.5);
   // Walls in the z-direction contribute one extra grid point.
   if(domain->periodicity[2]==0)
     nz_current += 1;
 
   const bool grid_unchanged =
     (nx_current == Nbx) && (ny_current == Nby) && (nz_current == Nbz);
   if(!grid_unchanged)
     error->all(FLERR,"the simulation domain can not change shape between runs with the same lb/fluid");
 
   //--------------------------------------------------------------------------
   // Boundary-type compatibility:
   //    x and y must be periodic (boundary value 0);
   //    z may be anything except shrink-wrapped (boundary values 2 and 3).
   //--------------------------------------------------------------------------
   if(domain->boundary[0][0] != 0)
     error->all(FLERR,"the x-direction must be periodic");
   if(domain->boundary[1][0] != 0)
     error->all(FLERR,"the y-direction must be periodic");
   const int zbound = domain->boundary[2][0];
   if(zbound == 2 || zbound == 3)
     error->all(FLERR,"the z-direction can not have shrink-wrap boundary conditions");
 
   //--------------------------------------------------------------------------
   // Scan the defined fixes for the ones that apply the fluid force to the
   // particles, and remember the group bitmask of each.
   //--------------------------------------------------------------------------
   groupbit_viscouslb = 0;
   groupbit_pc = 0;
   groupbit_rigid_pc_sphere = 0;
   for(int ifix = 0; ifix < modify->nfix; ++ifix){
     const char *fixstyle = modify->fix[ifix]->style;
     if(strcmp(fixstyle,"lb/viscous") == 0){
       fixviscouslb = 1;
       groupbit_viscouslb = group->bitmask[modify->fix[ifix]->igroup];
     }else if(strcmp(fixstyle,"lb/pc") == 0){
       groupbit_pc = group->bitmask[modify->fix[ifix]->igroup];
     }else if(strcmp(fixstyle,"lb/rigid/pc/sphere") == 0){
       groupbit_rigid_pc_sphere = group->bitmask[modify->fix[ifix]->igroup];
     }
   }
 
   // Tell the user (once, from the process with comm->me == 0) if none of
   // those fixes was found.
   if(groupbit_viscouslb == 0 && groupbit_pc == 0 && groupbit_rigid_pc_sphere == 0 && comm->me==0)
     error->message(FLERR,"Not adding the fluid force to any of the MD particles.  To add this force use one of the lb/viscous, lb/pc, or lb/rigid/pc/sphere fixes");
 
   // An atom handled by lb/viscous must not also be integrated by lb/pc or
   // lb/rigid/pc/sphere.
   if(fixviscouslb == 1){
     const int *atommask = atom->mask;
     const int natoms_local = atom->nlocal;
     for(int iatom = 0; iatom < natoms_local; ++iatom){
       const bool fluid_and_viscous =
         (atommask[iatom] & groupbit) && (atommask[iatom] & groupbit_viscouslb);
       if(!fluid_and_viscous) continue;
 
       if(atommask[iatom] & groupbit_pc)
         error->one(FLERR,"should not use the lb/viscous command when integrating with the lb/pc fix");
       if(atommask[iatom] & groupbit_rigid_pc_sphere)
         error->one(FLERR,"should not use the lb/viscous command when integrating with the lb/rigid/pc/sphere fix");
     }
   }
 }
 
 void FixLbFluid::setup(int vflag)
 {
   //--------------------------------------------------------------------------
   // Only a restart run (step > 0) needs the force on the fluid to be
   // recalculated during setup; otherwise there is nothing to do.
   //--------------------------------------------------------------------------
   if(step <= 0) return;
 
   calc_fluidforce();
 }
 
 void FixLbFluid::initial_integrate(int vflag)
 {
   // Function-local flag so that the screen headers are printed at most once.
   static int header_pending = 1;
 
   // The fluid is only advanced on timesteps that are a multiple of nevery
   // (by default nevery only affects how often end_of_step is called).
   const bool fluid_step = (update->ntimestep % nevery == 0);
 
   if(fluid_step){
     //--------------------------------------------------------------------------
     // Print the header(s) labelling any output printed to the screen.
     //--------------------------------------------------------------------------
     if(header_pending == 1){
       if(me == 0 && force_diagnostic > 0){
         printf("-------------------------------------------------------------------------------\n");
         printf("     F_x          F_y          F_z          T_x          T_y          T_z\n");
         printf("-------------------------------------------------------------------------------\n");
       }
       if(me == 0 && printfluid > 0){
         printf("---------------------------------------------------------------------\n");
         printf("     density            u_x              u_y              u_z \n");
         printf("---------------------------------------------------------------------\n");
       }
       header_pending = 0;
     }
 
     //--------------------------------------------------------------------------
     // Equilibrium distribution on the local subgrid, using whichever
     // implementation was selected for the velocity set.
     //--------------------------------------------------------------------------
     (this->*equilibriumdist)(1,subNbx-1,1,subNby-1,1,subNbz-1);
 
     //--------------------------------------------------------------------------
     // Advance the distribution function, then make the freshly computed
     // array the current one.
     //--------------------------------------------------------------------------
     (this->*update_full)();
     std::swap(f_lb,fnew);
 
     //--------------------------------------------------------------------------
     // Moments of the (new) distribution function.
     //--------------------------------------------------------------------------
     parametercalc_full();
 
     //--------------------------------------------------------------------------
     // For typeLB == 2 the equilibrium distributions are kept for use by
     // the update routine on the next timestep.
     //--------------------------------------------------------------------------
     if(typeLB == 2){
       std::swap(feqold,feq);
       std::swap(feqoldn,feqn);
     }
   }
 
   //--------------------------------------------------------------------------
   // Fluid output is independent of nevery: it happens on every positive
   // timestep that is a multiple of printfluid.
   //--------------------------------------------------------------------------
   if(printfluid > 0){
     if(update->ntimestep > 0 && (update->ntimestep % printfluid == 0))
       streamout();
   }
 }
 void FixLbFluid::post_force(int vflag)
 {
   // Nothing to do unless this timestep is a multiple of nevery (by default
   // nevery only affects how often end_of_step is called).
   if(update->ntimestep % nevery != 0) return;
 
   // The force on the fluid is only calculated here when fixviscouslb is
   // set; otherwise end_of_step takes care of it.
   if(fixviscouslb != 1) return;
 
   calc_fluidforce();
 }
 
 void FixLbFluid::end_of_step()
 {
   // This routine is only called every nevery timesteps.  When fixviscouslb
   // is set the force has already been calculated in post_force.
   if(fixviscouslb==0)
     calc_fluidforce();
 
   // Write a restart file on every timestep that is a multiple of
   // printrestart (when restart output was requested at all).
   const bool restart_due =
     (printrestart>0) && ((update->ntimestep)%printrestart == 0);
   if(restart_due)
     write_restartfile();
 }
 
 //==========================================================================
 //   (re)size the per-atom hydroF array to hold nmax atoms, with three
 //   components per atom
 //==========================================================================
 void FixLbFluid::grow_arrays(int nmax)
 {
   const int ncomponents = 3;
   memory->grow(hydroF,nmax,ncomponents,"FixLbFluid:hydroF");
 }
 
 //==========================================================================
 //   copy the hydroF entry of local atom i onto local atom j
 //   (delflag is not used)
 //==========================================================================
 void FixLbFluid::copy_arrays(int i, int j, int delflag)
 {
   for(int dim=0; dim<3; dim++)
     hydroF[j][dim] = hydroF[i][dim];
 }
 
 //==========================================================================
 //   pack the hydroF entry of local atom i into buf for exchange with
 //   another proc; returns the number of values written (3)
 //==========================================================================
 int FixLbFluid::pack_exchange(int i, double *buf)
 {
   const int nvalues = 3;
   const double *source = hydroF[i];
 
   for(int dim=0; dim<nvalues; dim++)
     buf[dim] = source[dim];
 
   return nvalues;
 }
 
 //==========================================================================
 //   unpack three values received from another proc into the hydroF entry
 //   at index nlocal; returns the number of values read (3)
 //==========================================================================
 int FixLbFluid::unpack_exchange(int nlocal, double *buf)
 {
   const int nvalues = 3;
   double *target = hydroF[nlocal];
 
   for(int dim=0; dim<nvalues; dim++)
     target[dim] = buf[dim];
 
   return nvalues;
 }
 
 //==========================================================================
 //   calculate the force from the local atoms acting on the fluid.
 //
 //   Steps, in order:
 //     1. zero Ff, the Fftempx/y/z receive buffers and hydroF;
 //     2. if force diagnostics are due on this timestep, compute the centre
 //        of mass of group igroupforce (the reference point for the torque);
 //     3. run the selected interpolation stencil for every local atom that
 //        belongs to this fix's group;
 //     4. exchange five slabs of Ff with the neighbouring processors, one
 //        dimension at a time (x, then y, then z), adding what is received
 //        into the local Ff;
 //     5. if diagnostics are due, sum the force and torque over all
 //        processors and print them from the process with me == 0.
 //==========================================================================
 void FixLbFluid::calc_fluidforce(void)
 {
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
   double **x = atom->x;
   int i,j,k,m;
   // Only the first 10 entries of requests/statuses are used below
   // (5 sends + 5 receives per dimension).
   MPI_Request requests[20];
   MPI_Status statuses[20];
   double forceloc[3],force[3];     // per-process and global force sums
   double torqueloc[3],torque[3];   // per-process and global torque sums
   
   //--------------------------------------------------------------------------
   // Zero out arrays
   // (the 4-d arrays are cleared through a pointer to their first element,
   //  which assumes their storage is one contiguous allocation)
   //--------------------------------------------------------------------------
   std::fill(&Ff[0][0][0][0],&Ff[0][0][0][0] + (subNbx+3)*(subNby+3)*(subNbz+3)*3,0.0);
   std::fill(&Fftempx[0][0][0][0],&Fftempx[0][0][0][0] + 5*(subNby+3)*(subNbz+3)*3,0.0);
   std::fill(&Fftempy[0][0][0][0],&Fftempy[0][0][0][0] + (subNbx+3)*5*(subNbz+3)*3,0.0);
   std::fill(&Fftempz[0][0][0][0],&Fftempz[0][0][0][0] + (subNbx+3)*(subNby+3)*5*3,0.0);
 
   forceloc[0] = forceloc[1] = forceloc[2] = 0.0;
   torqueloc[0] = torqueloc[1] = torqueloc[2] = 0.0;
 
   // hydroF is cleared for all atom->nmax entries, not only the nlocal ones.
   for(i=0; i<atom->nmax; i++)
     for(j=0; j<3; j++)
       hydroF[i][j] = 0.0;
   
   
   double unwrap[3];       // unwrapped (image-corrected) atom position
   double dx,dy,dz;        // atom position relative to the centre of mass
   double massone;
   imageint *image = atom->image;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   int *type = atom->type;
   // sum/xcm hold the mass-weighted position in [0..2] and the total mass
   // in [3]; sum is the per-process part, xcm the result over all processes.
   double sum[4],xcm[4];
 
   // Diagnostics are due on positive timesteps that are a multiple of
   // force_diagnostic; the same condition is tested again further down.
   if(force_diagnostic > 0 && update->ntimestep > 0 && (update->ntimestep % force_diagnostic == 0)){
     //Calculate the center of mass of the particle group
     //(needed to calculate the torque).
     sum[0] = sum[1] = sum[2] = sum[3] = 0.0;
     for(i=0; i<nlocal; i++){
       if(mask[i] & group->bitmask[igroupforce]){
 
 	domain->unmap(x[i],image[i],unwrap);
 
 	// per-atom mass if available, otherwise the per-type mass
 	if(rmass) massone = rmass[i];
 	else massone = mass[type[i]];
 	
 	sum[0] += unwrap[0]*massone;
 	sum[1] += unwrap[1]*massone;
 	sum[2] += unwrap[2]*massone;
 	sum[3] += massone;
       }
     }
     MPI_Allreduce(&sum[0],&xcm[0],4,MPI_DOUBLE,MPI_SUM,world);
     // NOTE(review): xcm[3] is the total mass of the group; if the group
     // contains no atoms this is a division by zero.
     xcm[0] = xcm[0]/xcm[3];
     xcm[1] = xcm[1]/xcm[3];
     xcm[2] = xcm[2]/xcm[3];
   }
 
   //--------------------------------------------------------------------------
   //Calculate the contribution to the force on the fluid.
   //--------------------------------------------------------------------------
   for(i=0; i<nlocal; i++){
     if(mask[i] & groupbit){
       // The stencil routines are presumably what fill Ff and hydroF[i]
       // for this atom (hydroF[i] is read immediately below) -- confirm
       // against their definitions.
       if(trilinear_stencil==1) {
 	trilinear_interpolation(i);
       }else{
 	peskin_interpolation(i);
       }
       
       if(force_diagnostic > 0 && update->ntimestep > 0 && (update->ntimestep % force_diagnostic == 0)){
 	if(mask[i] & group->bitmask[igroupforce]){
 	  
 	  domain->unmap(x[i],image[i],unwrap);
 	  dx = unwrap[0] - xcm[0];
 	  dy = unwrap[1] - xcm[1];
 	  dz = unwrap[2] - xcm[2];
 	  
 	  // accumulate the force, and the torque r x F about the
 	  // centre of mass
 	  forceloc[0] += hydroF[i][0];
 	  forceloc[1] += hydroF[i][1];
 	  forceloc[2] += hydroF[i][2];
 	  torqueloc[0] += dy*hydroF[i][2] - dz*hydroF[i][1];
 	  torqueloc[1] += dz*hydroF[i][0] - dx*hydroF[i][2];
 	  torqueloc[2] += dx*hydroF[i][1] - dy*hydroF[i][0];
 	}	    
       } 
     }
   }
 
   //--------------------------------------------------------------------------
   //Communicate the force contributions which lie outside the local processor
   //sub domain.
   //
   //The same pattern is used for each dimension: two slabs are sent to
   //procneigh[dim][0] and three to procneigh[dim][1], and the five matching
   //receives (paired by message tag 10..50) land in slabs 0..4 of the
   //corresponding Fftemp array.  MPI_Waitall completes all ten requests
   //before the received slabs are added to Ff, so the y exchange already
   //carries the x contributions, and the z exchange carries both.
   //--------------------------------------------------------------------------
   // ---- x direction ----
   for(i=0; i<10; i++)
     requests[i]=MPI_REQUEST_NULL;
   MPI_Isend(&Ff[0][0][0][0],1,passxu,comm->procneigh[0][0],10,world,&requests[0]);
   MPI_Isend(&Ff[subNbx+2][0][0][0],1,passxu,comm->procneigh[0][0],20,world,&requests[1]);
   MPI_Isend(&Ff[subNbx-1][0][0][0],1,passxu,comm->procneigh[0][1],30,world,&requests[2]);
   MPI_Isend(&Ff[subNbx][0][0][0],1,passxu,comm->procneigh[0][1],40,world,&requests[3]);
   MPI_Isend(&Ff[subNbx+1][0][0][0],1,passxu,comm->procneigh[0][1],50,world,&requests[4]);
   MPI_Irecv(&Fftempx[0][0][0][0],1,passxtemp,comm->procneigh[0][1],10,world,&requests[5]);
   MPI_Irecv(&Fftempx[1][0][0][0],1,passxtemp,comm->procneigh[0][1],20,world,&requests[6]);
   MPI_Irecv(&Fftempx[2][0][0][0],1,passxtemp,comm->procneigh[0][0],30,world,&requests[7]);
   MPI_Irecv(&Fftempx[3][0][0][0],1,passxtemp,comm->procneigh[0][0],40,world,&requests[8]);
   MPI_Irecv(&Fftempx[4][0][0][0],1,passxtemp,comm->procneigh[0][0],50,world,&requests[9]);
   MPI_Waitall(10,requests,statuses);
   
   // Add the received x-slabs onto the local grid points they correspond to.
   for(j=0; j<subNby+3; j++){
     for(k=0; k<subNbz+3; k++){
       for(m=0; m<3; m++){
 	Ff[subNbx-2][j][k][m] += Fftempx[0][j][k][m];
 	Ff[subNbx-3][j][k][m] += Fftempx[1][j][k][m];
 	Ff[1][j][k][m] += Fftempx[2][j][k][m];
 	Ff[2][j][k][m] += Fftempx[3][j][k][m];
 	Ff[3][j][k][m] += Fftempx[4][j][k][m];
       }
     }
   }
 
   // ---- y direction ----
   for(i=0; i<10; i++)
     requests[i]=MPI_REQUEST_NULL;
   MPI_Isend(&Ff[0][0][0][0],1,passyu,comm->procneigh[1][0],10,world,&requests[0]);
   MPI_Isend(&Ff[0][subNby+2][0][0],1,passyu,comm->procneigh[1][0],20,world,&requests[1]);
   MPI_Isend(&Ff[0][subNby-1][0][0],1,passyu,comm->procneigh[1][1],30,world,&requests[2]);
   MPI_Isend(&Ff[0][subNby][0][0],1,passyu,comm->procneigh[1][1],40,world,&requests[3]);
   MPI_Isend(&Ff[0][subNby+1][0][0],1,passyu,comm->procneigh[1][1],50,world,&requests[4]);
   MPI_Irecv(&Fftempy[0][0][0][0],1,passytemp,comm->procneigh[1][1],10,world,&requests[5]);
   MPI_Irecv(&Fftempy[0][1][0][0],1,passytemp,comm->procneigh[1][1],20,world,&requests[6]);
   MPI_Irecv(&Fftempy[0][2][0][0],1,passytemp,comm->procneigh[1][0],30,world,&requests[7]);
   MPI_Irecv(&Fftempy[0][3][0][0],1,passytemp,comm->procneigh[1][0],40,world,&requests[8]);
   MPI_Irecv(&Fftempy[0][4][0][0],1,passytemp,comm->procneigh[1][0],50,world,&requests[9]);
   MPI_Waitall(10,requests,statuses);
 
   // Add the received y-slabs onto the local grid points they correspond to.
   for(i=0; i<subNbx+3; i++){
     for(k=0; k<subNbz+3; k++){
       for(m=0; m<3; m++){
 	Ff[i][subNby-2][k][m] += Fftempy[i][0][k][m];
 	Ff[i][subNby-3][k][m] += Fftempy[i][1][k][m];
 	Ff[i][1][k][m] += Fftempy[i][2][k][m];
 	Ff[i][2][k][m] += Fftempy[i][3][k][m];
 	Ff[i][3][k][m] += Fftempy[i][4][k][m];
       }
     }
   }
 
   // ---- z direction ----
   for(i=0; i<10; i++)
     requests[i]=MPI_REQUEST_NULL;
   MPI_Isend(&Ff[0][0][0][0],1,passzu,comm->procneigh[2][0],10,world,&requests[0]);
   MPI_Isend(&Ff[0][0][subNbz+2][0],1,passzu,comm->procneigh[2][0],20,world,&requests[1]);
   MPI_Isend(&Ff[0][0][subNbz-1][0],1,passzu,comm->procneigh[2][1],30,world,&requests[2]);
   MPI_Isend(&Ff[0][0][subNbz][0],1,passzu,comm->procneigh[2][1],40,world,&requests[3]);
   MPI_Isend(&Ff[0][0][subNbz+1][0],1,passzu,comm->procneigh[2][1],50,world,&requests[4]);
   MPI_Irecv(&Fftempz[0][0][0][0],1,passztemp,comm->procneigh[2][1],10,world,&requests[5]);
   MPI_Irecv(&Fftempz[0][0][1][0],1,passztemp,comm->procneigh[2][1],20,world,&requests[6]);
   MPI_Irecv(&Fftempz[0][0][2][0],1,passztemp,comm->procneigh[2][0],30,world,&requests[7]);
   MPI_Irecv(&Fftempz[0][0][3][0],1,passztemp,comm->procneigh[2][0],40,world,&requests[8]);
   MPI_Irecv(&Fftempz[0][0][4][0],1,passztemp,comm->procneigh[2][0],50,world,&requests[9]);
   MPI_Waitall(10,requests,statuses);  
 
   // Add the received z-slabs onto the local grid points they correspond to.
   for(i=0; i<subNbx+3; i++){
     for(j=0; j<subNby+3; j++){
       for(m=0; m<3; m++){
 	Ff[i][j][subNbz-2][m] += Fftempz[i][j][0][m];
 	Ff[i][j][subNbz-3][m] += Fftempz[i][j][1][m];
 	Ff[i][j][1][m] += Fftempz[i][j][2][m];
 	Ff[i][j][2][m] += Fftempz[i][j][3][m];
 	Ff[i][j][3][m] += Fftempz[i][j][4][m];
       }
     }
   }
 
   // Diagnostic output: total force and torque on group igroupforce, summed
   // over all processes and printed as one line by the process with me == 0.
   if(force_diagnostic > 0 && update->ntimestep > 0 && (update->ntimestep % force_diagnostic == 0)){
     force[0] = force[1] = force[2] = 0.0;
     torque[0] = torque[1] = torque[2] =0.0;
     
     MPI_Allreduce(&forceloc[0],&force[0],3,MPI_DOUBLE,MPI_SUM,world); 
     MPI_Allreduce(&torqueloc[0],&torque[0],3,MPI_DOUBLE,MPI_SUM,world);
     
     if(me==0){
       printf("%E %E %E %E %E %E\n",force[0],force[1],force[2],
  	     torque[0],torque[1],torque[2]);
 
     }
   }
   
 }
 //==========================================================================
 // Peskin four-point stencil weight along one axis.
 //   d : signed separation between a lattice node and the atom, in
 //       lattice units.
 // Returns 0 once d*d >= 4; otherwise the outer (d*d > 1) or inner
 // branch of the stencil formula.
 //==========================================================================
 static double peskin_weight(double d)
 {
   const double rsq = d*d;
   if(rsq>=4) return 0.0;
 
   const double r = sqrt(rsq);
   if(rsq>1)
     return (5.0-2.0*r-sqrt(-7.0+12.0*r-4.0*rsq))/8.;
   return (3.0-2.0*r+sqrt(1.0+4.0*r-4.0*rsq))/8.;
 }
 
 //==========================================================================
 // uses the Peskin stencil to perform the velocity, density and
 // force interpolations for atom i.
 //
 // Over the 4x4x4 block of nodes surrounding the atom this routine:
 //   - interpolates u_lb (and density_lb when setGamma==0) to the atom,
 //   - picks the coupling constant gammavalue (computed from the
 //     interpolated density and the atom mass when setGamma==0, otherwise
 //     read from Gamma[type]),
 //   - adds the resulting force, with the same weights, to Ff,
 //   - stores the force on the atom in hydroF[i].
 // Calls error->one() if a stencil node lies more than one lattice point
 // outside the local sub-domain.
 //==========================================================================
 void FixLbFluid::peskin_interpolation(int i)
 {
   double **x = atom->x;
   double **v = atom->v;
   int *type = atom->type;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   double massone;
   int ix,iy,iz;
   int ixp,iyp,izp;
   double dx1,dy1,dz1;
   int isten,ii,jj,kk;
   double wx[4],wy[4],wz[4];   // per-axis weights for offsets -1..2
   double drag[3];             // gammavalue*(v_atom - u_fluid), LB units
   double FfP[64];
   int k;
   double unode[3];
   double mnode;
   double gammavalue;
 
   //--------------------------------------------------------------------------
   //Calculate nearest leftmost grid point.
   //Since array indices from 1 to subNb-2 correspond to the
   // local subprocessor domain (not indices from 0), use the 
   // ceiling value.
   //--------------------------------------------------------------------------
   ix = (int)ceil((x[i][0]-domain->sublo[0])/dx_lb);
   iy = (int)ceil((x[i][1]-domain->sublo[1])/dx_lb);
   iz = (int)ceil((x[i][2]-domain->sublo[2])/dx_lb);
 
   //--------------------------------------------------------------------------
   //Calculate distances to the nearest points.
   //--------------------------------------------------------------------------
   dx1 = x[i][0] - (domain->sublo[0] + (ix-1)*dx_lb);
   dy1 = x[i][1] - (domain->sublo[1] + (iy-1)*dx_lb);
   dz1 = x[i][2] - (domain->sublo[2] + (iz-1)*dx_lb);
 
   // Need to convert these to lattice units:
   dx1 = dx1/dx_lb;
   dy1 = dy1/dx_lb;
   dz1 = dz1/dx_lb;
 
   //--------------------------------------------------------------------------
   // The weight along an axis depends only on the offset along that axis,
   // so evaluate the four values per axis once instead of inside the
   // nested stencil loops.
   //--------------------------------------------------------------------------
   for(ii=-1; ii<3; ii++){
     wx[ii+1] = peskin_weight(-dx1+ii);
     wy[ii+1] = peskin_weight(-dy1+ii);
     wz[ii+1] = peskin_weight(-dz1+ii);
   }
 
   unode[0]=0.0; unode[1]=0.0; unode[2]=0.0;
   mnode = 0.0;
   isten=0;
 
   //--------------------------------------------------------------------------
   // Calculate the interpolation weights, and interpolated values of
   // the fluid velocity, and density.
   //--------------------------------------------------------------------------
   for(ii=-1; ii<3; ii++){
     for(jj=-1; jj<3; jj++){
       for(kk=-1; kk<3; kk++){
         ixp = ix+ii;
         iyp = iy+jj;
         izp = iz+kk;
 
         //The atom is allowed to be within one lattice grid point outside the
         //local processor sub-domain.  
         if(ixp < -1 || ixp > (subNbx+1) || iyp < -1 || iyp > (subNby+1) || izp < -1 || izp > (subNbz+1))
           error->one(FLERR,"Atom outside local processor simulation domain.  Either unstable fluid pararmeters, or \
 require more frequent neighborlist rebuilds");
 
         if(domain->periodicity[2] == 0 && comm->myloc[2] == 0 && izp < 1)
           error->warning(FLERR,"Atom too close to lower z wall.  Unphysical results may occur");
         if(domain->periodicity[2] == 0 && comm->myloc[2] == (comm->procgrid[2]-1) && (izp > (subNbz-2) ))
           error->warning(FLERR,"Atom too close to upper z wall.  Unphysical results may occur");
 
         // Index -1 is stored in the last slot (subNb+2) of each array.
         if(ixp==-1) ixp=subNbx+2;
         if(iyp==-1) iyp=subNby+2;
         if(izp==-1) izp=subNbz+2;
 
         FfP[isten] = wx[ii+1]*wy[jj+1]*wz[kk+1];
         // interpolated velocity based on delta function.
         for(k=0; k<3; k++){
           unode[k] += u_lb[ixp][iyp][izp][k]*FfP[isten];
         }
         if(setGamma==0)
           mnode += density_lb[ixp][iyp][izp]*FfP[isten];
 
         isten++;
       }
     }
   }
   if(setGamma==0){
     mnode *= NodeArea[type[i]];
 
     if(rmass) massone = rmass[i];
     else massone = mass[type[i]];
     massone = massone/dm_lb; 
 
     gammavalue = 2.0*(mnode*massone)*dtoverdtcollision/(mnode+massone);
   }    
   else{
     gammavalue = Gamma[type[i]];
   }
 
   // Compute the force on the fluid.  Need to convert the velocity from
   // LAMMPS units to LB units.  This factor is the same for every stencil
   // node, so evaluate it once per component.
   for(k=0; k<3; k++)
     drag[k] = gammavalue*((v[i][k]*dt_lb/dx_lb)-unode[k]);
 
   isten=0;
   for(ii=-1; ii<3; ii++)
     for(jj=-1; jj<3; jj++)
       for(kk=-1; kk<3; kk++){
         ixp = ix+ii;
         iyp = iy+jj;
         izp = iz+kk;
 
         if(ixp==-1) ixp=subNbx+2;
         if(iyp==-1) iyp=subNby+2;
         if(izp==-1) izp=subNbz+2;
 
         for(k=0; k<3; k++){
           Ff[ixp][iyp][izp][k] += drag[k]*FfP[isten];
         }
 
         isten++;
       }
 
   // Opposite-signed force on the atom, converted from LB units with
   // dm_lb*dx_lb/dt_lb^2.
   for(k=0; k<3; k++)
     hydroF[i][k] = -1.0*gammavalue*((v[i][k]*dt_lb/dx_lb)-unode[k])*dm_lb*dx_lb/dt_lb/dt_lb;
 }
 
 //==========================================================================
 // Trilinear-stencil version of the atom/fluid coupling for atom i.
 //
 // The eight nodes at the corners of the lattice cell containing the atom
 // are used to interpolate the fluid velocity (and the density when
 // setGamma==0), to spread the coupling force onto Ff, and to set the
 // force on the atom in hydroF[i].
 //==========================================================================
 void FixLbFluid::trilinear_interpolation(int i)
 {
   double **pos = atom->x;
   double **vel = atom->v;
   int *type = atom->type;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   int k,n,a,b,c;
 
   //--------------------------------------------------------------------------
   // Lower corner of the enclosing cell.  Array indices 1..subNb-2 cover
   // the local sub-domain, hence the ceiling rather than the floor.
   //--------------------------------------------------------------------------
   const int ix = (int)ceil((pos[i][0]-domain->sublo[0])/dx_lb);
   const int iy = (int)ceil((pos[i][1]-domain->sublo[1])/dx_lb);
   const int iz = (int)ceil((pos[i][2]-domain->sublo[2])/dx_lb);
 
   // Upper corner of the cell.
   const int ixp = ix+1;
   const int iyp = iy+1;
   const int izp = iz+1;
 
   //--------------------------------------------------------------------------
   // Offset of the atom from the lower corner, in lattice units.
   //--------------------------------------------------------------------------
   const double dx1 = (pos[i][0] - (domain->sublo[0] + (ix-1)*dx_lb))/dx_lb;
   const double dy1 = (pos[i][1] - (domain->sublo[1] + (iy-1)*dx_lb))/dx_lb;
   const double dz1 = (pos[i][2] - (domain->sublo[2] + (iz-1)*dx_lb))/dx_lb;
 
   //--------------------------------------------------------------------------
   // Corner table: entry n = 4*a + 2*b + c is the node (ix+a,iy+b,iz+c)
   // with weight wx[a]*wy[b]*wz[c].
   //--------------------------------------------------------------------------
   const double wx[2] = {1.-dx1, dx1};
   const double wy[2] = {1.-dy1, dy1};
   const double wz[2] = {1.-dz1, dz1};
   int cx[8],cy[8],cz[8];
   double FfP[8];
 
   n = 0;
   for(a=0; a<2; a++)
     for(b=0; b<2; b++)
       for(c=0; c<2; c++){
         cx[n] = ix+a;
         cy[n] = iy+b;
         cz[n] = iz+c;
         FfP[n] = wx[a]*wy[b]*wz[c];
         n++;
       }
 
   //The atom is allowed to be within one lattice grid point outside the
   //local processor sub-domain.  
   const bool outside = ix < 0 || ixp > (subNbx+1) ||
                        iy < 0 || iyp > (subNby+1) ||
                        iz < 0 || izp > (subNbz+1);
   if(outside)
     error->one(FLERR,"Atom outside local processor simulation domain.  Either unstable fluid pararmeters, or \
 require more frequent neighborlist rebuilds");
 
   // Wall proximity warnings only apply when z is non-periodic.
   if(domain->periodicity[2] == 0){
     if(comm->myloc[2] == 0 && (iz < 1 || izp < 1))
       error->warning(FLERR,"Atom too close to lower z wall.  Unphysical results may occur");
     if(comm->myloc[2] == (comm->procgrid[2]-1) && (izp > (subNbz-2) || iz > (subNbz-2)))
       error->warning(FLERR,"Atom too close to upper z wall.  Unphysical results may occur");  
   }
 
   // Fluid velocity interpolated to the atom (corners summed in table order).
   double unode[3];
   for(k=0; k<3; k++){
     double usum = u_lb[cx[0]][cy[0]][cz[0]][k]*FfP[0];
     for(n=1; n<8; n++)
       usum += u_lb[cx[n]][cy[n]][cz[n]][k]*FfP[n];
     unode[k] = usum;
   }
 
   // Coupling constant: derived from the interpolated density and the atom
   // mass, or taken from the per-type table.
   double gammavalue;
   if(setGamma==0){
     double mnode = density_lb[cx[0]][cy[0]][cz[0]]*FfP[0];
     for(n=1; n<8; n++)
       mnode += density_lb[cx[n]][cy[n]][cz[n]]*FfP[n];
 
     mnode *= NodeArea[type[i]];
 
     double massone;
     if(rmass) massone = rmass[i];
     else massone = mass[type[i]];
     massone = massone/dm_lb; 
 
     gammavalue = 2.0*(mnode*massone)*dtoverdtcollision/(mnode+massone);
   }else{
     gammavalue = Gamma[type[i]];
   }
 
   // Spread the force on the fluid over the eight corners.  The atom
   // velocity is converted from LAMMPS units to LB units first.
   for(k=0; k<3; k++){
     const double drag = gammavalue*((vel[i][k]*dt_lb/dx_lb)-unode[k]);
     for(n=0; n<8; n++)
       Ff[cx[n]][cy[n]][cz[n]][k] += drag*FfP[n];
   }
 
   // Opposite-signed force on the atom, converted back from LB units.
   for(k=0; k<3; k++)
     hydroF[i][k] = -1.0*gammavalue*((vel[i][k]*dt_lb/dx_lb)-unode[k])*dm_lb*dx_lb/dt_lb/dt_lb;
 
 }
 
 //==========================================================================
 // read in a fluid restart file.  This is only used to restart the
 // fluid portion of a LAMMPS simulation.  
 //==========================================================================
 void FixLbFluid::read_restartfile(void)
 {
   MPI_Status status;
   MPI_Datatype realtype;
   MPI_Datatype filetype;
 
 
   int realsizes[4] = {subNbx,subNby,subNbz,numvel};
   int realstarts[4] = {1,1,1,0};
   int gsizes[4] = {Nbx,Nby,Nbz,numvel};
   int lsizes[4] = {subNbx-2,subNby-2,subNbz-2,numvel};
   int starts[4] = {comm->myloc[0]*(subNbx-2),comm->myloc[1]*(subNby-2),comm->myloc[2]*(subNbz-2),0};
   if(domain->periodicity[2]==0 && comm->myloc[2]==comm->procgrid[2]-1){
     starts[2] = comm->myloc[2]*(subNbz-3);
   }
 
   MPI_Type_create_subarray(4,realsizes,lsizes,realstarts,MPI_ORDER_C,MPI_DOUBLE,&realtype);
   MPI_Type_commit(&realtype);
 
   MPI_Type_create_subarray(4,gsizes,lsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&filetype);
   MPI_Type_commit(&filetype);
 
   MPI_File_set_view(pFileRead,0,MPI_DOUBLE,filetype,(char *) "native",
                     MPI_INFO_NULL);
   MPI_File_seek(pFileRead,0,MPI_SEEK_SET);
   MPI_File_read_all(pFileRead,&f_lb[0][0][0][0],1,realtype,&status);
   if(typeLB == 2){
     MPI_File_read_all(pFileRead,&feqold[0][0][0][0],1,realtype,&status);
     MPI_File_read_all(pFileRead,&feqoldn[0][0][0][0],1,realtype,&status);  
   }
 
   MPI_Type_free(&realtype);
   MPI_Type_free(&filetype);
   MPI_File_close(&pFileRead);
 
 }
 
 //==========================================================================
 // write a fluid restart file.
 //
 // The file is named FluidRestart_<timestep>.dat and is opened on the
 // `world` communicator.  Each processor writes the interior of its local
 // f_lb array (and of feqold and feqoldn when typeLB == 2) into its slab
 // of the global (Nbx,Nby,Nbz,numvel) layout.
 //==========================================================================
 void FixLbFluid::write_restartfile(void)
 {
 
   MPI_File fh;
   MPI_Status status;
   MPI_Datatype realtype;
   MPI_Datatype filetype;
 
   // "FluidRestart_" (13) + ".dat" (4) + terminator leaves too little room
   // in a 32-byte buffer once the timestep needs more than 14 characters,
   // which a 64-bit value can.  Use a buffer with ample room and a bounded
   // formatter so the name can never overrun it.
   char hfile[64];
   snprintf(hfile,sizeof(hfile),"FluidRestart_" BIGINT_FORMAT ".dat",update->ntimestep);
 
   MPI_File_open(world,hfile,MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL,&fh);
 
   // realtype: interior (indices 1..subNb-2) of the local array.
   // filetype: this processor's slab inside the global array.
   int realsizes[4] = {subNbx,subNby,subNbz,numvel};
   int realstarts[4] = {1,1,1,0};
   int gsizes[4] = {Nbx,Nby,Nbz,numvel};
   int lsizes[4] = {subNbx-2,subNby-2,subNbz-2,numvel};
   int starts[4] = {comm->myloc[0]*(subNbx-2),comm->myloc[1]*(subNby-2),comm->myloc[2]*(subNbz-2),0};
   // The topmost processor of a non-periodic z direction uses subNbz-3 as
   // the stride for its z offset.
   if(domain->periodicity[2]==0 && comm->myloc[2]==comm->procgrid[2]-1){
     starts[2] = comm->myloc[2]*(subNbz-3);
   }
 
   MPI_Type_create_subarray(4,realsizes,lsizes,realstarts,MPI_ORDER_C,MPI_DOUBLE,&realtype);
   MPI_Type_commit(&realtype);
 
   MPI_Type_create_subarray(4,gsizes,lsizes,starts,MPI_ORDER_C,MPI_DOUBLE,&filetype);
   MPI_Type_commit(&filetype);
 
   MPI_File_set_view(fh,0,MPI_DOUBLE,filetype,(char *) "native",MPI_INFO_NULL);
   // The arrays are written back to back; read_restartfile() reads them in
   // the same order.
   MPI_File_write_all(fh,&f_lb[0][0][0][0],1,realtype,&status);
   if(typeLB == 2){
     MPI_File_write_all(fh,&feqold[0][0][0][0],1,realtype,&status);
     MPI_File_write_all(fh,&feqoldn[0][0][0][0],1,realtype,&status);  
   }
 
   MPI_Type_free(&realtype);
   MPI_Type_free(&filetype);
   MPI_File_close(&fh);
 
 }
 
 //==========================================================================
 // Convert the simulation parameters to lattice units, i.e. units in
 // which dx_lb=dt_lb=dm_lb=1.  All inputs are assumed to have been given
 // in terms of distance, time and mass units.  Also derives the
 // integrator-dependent constants (tau, namp, noisefactor, K_0, ...) and
 // checks the scaled a_0 against the stability limits.
 //==========================================================================
 void FixLbFluid::rescale(void)
 {
   const double third = 0.333333333333333;
   const int ntypes = atom->ntypes;
 
   // Wall velocities.
   vwtp = vwtp*dt_lb/dx_lb;
   vwbt = vwbt*dt_lb/dx_lb;
 
   // Body force components.
   bodyforcex = bodyforcex*dt_lb*dt_lb/dx_lb;
   bodyforcey = bodyforcey*dt_lb*dt_lb/dx_lb;
   bodyforcez = bodyforcez*dt_lb*dt_lb/dx_lb;
 
   // Relaxation time; the typeLB==1 integrator adds one half.
   tau=(3.0*viscosity/densityinit_real)*dt_lb*dt_lb/dx_lb/dx_lb;
   tau /= dt_lb;
   if(typeLB==1)
     tau += 0.5;
 
   // Per-type coupling data: node areas when gamma is computed, otherwise
   // the user-supplied gamma values.
   for(int itype=0; itype<=ntypes; ++itype){
     if(setGamma == 0)
       NodeArea[itype] = NodeArea[itype]/dx_lb/dx_lb;
     else
       Gamma[itype] = Gamma[itype]*dt_lb/dm_lb;
   }
 
   densityinit = densityinit_real*dx_lb*dx_lb*dx_lb/dm_lb;
 
   a_0 = a_0_real*dt_lb*dt_lb/(dx_lb*dx_lb);
 
   // Warn if using the D3Q19 model with noise, and a0 is too small.
   if(numvel==19 && noisestress==1 && a_0 < 0.2){
     error->warning(FLERR,"Fix lb/fluid WARNING: Chosen value for a0 may be too small. \
 Check temperature reproduction.\n");
   }
 
   // With noise enabled the scaled a_0 must stay below 5/9.
   if(noisestress==1 && a_0>0.5555555){
     error->all(FLERR,"Fix lb/fluid ERROR: the Lattice Boltzmann dx and dt need \
 to be chosen such that the scaled a_0 < 5/9\n");
   }
 
   // Courant Condition:
   if(a_0 >= 1.0){
     error->all(FLERR,"Fix lb/fluid ERROR: the lattice Boltzmann dx and dt do not \
 satisfy the Courant condition.\n");
   }
 
   kB = (force->boltz/force->mvv2e)*dt_lb*dt_lb/dx_lb/dx_lb/dm_lb;
 
   // Integrator-specific constants.
   if(typeLB==1){
     expminusdtovertau = 0.0;
     Dcoeff = 0.0;
     namp = 2.0*kB*T*(tau-0.5)/3.0;
     noisefactor = 1.0;
     if(a_0 <= third)
       K_0 = 5.17*(third - a_0);
     else
       K_0 = 2.57*(a_0 - third);
     dtoverdtcollision = dt_lb*6.0*viscosity/densityinit_real/dx_lb/dx_lb;
   }else if(typeLB==2){
     expminusdtovertau=exp(-1.0/tau);
     Dcoeff=(1.0-(1.0-expminusdtovertau)*tau);
     namp = 2.0*kB*T/3.;
     noisefactor=sqrt((1.0-expminusdtovertau*expminusdtovertau)/
                      (2.0))/(1.0-expminusdtovertau);
     K_0 = 4.5*(1.0/3.0-a_0);
     dtoverdtcollision = dt_lb*3.0*viscosity/densityinit_real/dx_lb/dx_lb;
   }
 
 }
 
 //==========================================================================
 // D3Q15 set-up: fills the lattice velocity vectors e, the weights w_lb,
 // the normalisations Ng_lb and the 15x15 table mg_lb, then initialises
 // the fluid to zero velocity and density densityinit, with f_lb set to
 // density/15 for every velocity direction.
 //==========================================================================
 void FixLbFluid::initializeLB15(void)
 {
   int i,j,k,m,p;
 
   //velocity vectors.
   static const int evec[15][3] = {
     { 0, 0, 0},
     { 1, 0, 0}, { 0, 1, 0}, {-1, 0, 0}, { 0,-1, 0}, { 0, 0, 1}, { 0, 0,-1},
     { 1, 1, 1}, {-1, 1, 1}, {-1,-1, 1}, { 1,-1, 1},
     { 1, 1,-1}, {-1, 1,-1}, {-1,-1,-1}, { 1,-1,-1}
   };
 
   static const double ngvals[15] = {
     1., 3., 3., 3., 9./2., 9./2., 9./2., 9., 9., 9.,
     27./2., 27./2., 27./2., 9., 1.
   };
 
   // Entries of the last row of the mg table.
   const double s2 = sqrt(2.);
   const double ms = -1./sqrt(2.);
 
   // Row r, column c holds mg_lb[r][c].
   const double mgvals[15][15] = {
     {1.,1.,1.,1.,1., 1.,1.,1.,1.,1., 1.,1.,1.,1.,1.},
     {0,1.,0,-1.,0, 0,0,1.,-1.,-1., 1.,1.,-1.,-1.,1.},
     {0,0,1.,0,-1., 0,0,1.,1.,-1., -1.,1.,1.,-1.,-1.},
     {0,0,0,0,0, 1.,-1.,1.,1.,1., 1.,-1.,-1.,-1.,-1.},
     {-1./3.,2./3.,-1./3.,2./3.,-1./3., -1./3.,-1./3.,2./3.,2./3.,2./3., 2./3.,2./3.,2./3.,2./3.,2./3.},
     {-1./3.,-1./3.,2./3.,-1./3.,2./3., -1./3.,-1./3.,2./3.,2./3.,2./3., 2./3.,2./3.,2./3.,2./3.,2./3.},
     {-1./3.,-1./3.,-1./3.,-1./3.,-1./3., 2./3.,2./3.,2./3.,2./3.,2./3., 2./3.,2./3.,2./3.,2./3.,2./3.},
     {0,0,0,0,0, 0,0,1,-1,1, -1,1,-1,1,-1},
     {0,0,0,0,0, 0,0,1,1,-1, -1,-1,-1,1,1},
     {0,0,0,0,0, 0,0,1,-1,-1, 1,-1,1,1,-1},
     {0,0,-1./3.,0,1./3., 0,0,2./3.,2./3.,-2./3., -2./3.,2./3.,2./3.,-2./3.,-2./3.},
     {0,0,0,0,0, -1./3.,1./3.,2./3.,2./3.,2./3., 2./3.,-2./3.,-2./3.,-2./3.,-2./3.},
     {0,-1./3.,0,1./3.,0, 0,0,2./3.,-2./3.,-2./3., 2./3.,2./3.,-2./3.,-2./3.,2./3.},
     {0,0,0,0,0, 0,0,1,-1,1, -1,-1,1,-1,1},
     {s2,ms,ms,ms,ms, ms,ms,s2,s2,s2, s2,s2,s2,s2,s2}
   };
 
   for(m=0; m<15; m++){
     for(p=0; p<3; p++)
       e[m][p] = evec[m][p];
 
     //weights: rest direction, six axis directions, eight diagonals.
     if(m == 0)     w_lb[m] = 2./9.;
     else if(m < 7) w_lb[m] = 1./9.;
     else           w_lb[m] = 1./72.;
 
     Ng_lb[m] = ngvals[m];
 
     for(p=0; p<15; p++)
       mg_lb[m][p] = mgvals[m][p];
   }
 
   // Fluid at rest with uniform density on the extended (subNb+3) grid.
   for(i=0; i<subNbx+3; i++){
     for(j=0; j<subNby+3; j++){
       for(k=0; k<subNbz+3; k++){
         for(p=0; p<3; p++)
           u_lb[i][j][k][p]=0.0;
         density_lb[i][j][k] = densityinit;
       }
     }
   }
 
   // Density shared equally among the 15 directions.
   for(i=0; i<subNbx; i++){
     for(j=0; j<subNby; j++){
       for(k=0; k<subNbz; k++){
         for(m=0; m<15; m++)
           f_lb[i][j][k][m] = density_lb[i][j][k]/15.0;
       }
     }
   }
 
 }
 
 //==========================================================================
 // D3Q19 set-up: fills the lattice velocity vectors e, the weights w_lb,
 // the normalisations Ng_lb and the 19x19 table mg_lb, then initialises
 // the fluid to zero velocity and density densityinit, with f_lb set to
 // density/19 for every velocity direction.
 //==========================================================================
 void FixLbFluid::initializeLB19(void)
 {
   int i,j,k,m,p;
 
   //velocity vectors.
   static const int evec[19][3] = {
     { 0, 0, 0},
     { 1, 0, 0}, { 0, 1, 0}, {-1, 0, 0}, { 0,-1, 0}, { 0, 0, 1}, { 0, 0,-1},
     { 1, 1, 0}, { 1,-1, 0}, {-1, 1, 0}, {-1,-1, 0},
     { 1, 0, 1}, { 1, 0,-1}, {-1, 0, 1}, {-1, 0,-1},
     { 0, 1, 1}, { 0, 1,-1}, { 0,-1, 1}, { 0,-1,-1}
   };
 
   static const double ngvals[19] = {
     1., 3., 3., 3., 9./2., 9./2., 9./2., 9., 9., 9.,
     27./2., 27./2., 27./2., 18., 18., 18., 162./7., 126./5., 30.
   };
 
   // Row r, column c holds mg_lb[r][c].
   static const double mgvals[19][19] = {
     { 1., 1., 1., 1., 1.,   1., 1., 1., 1., 1.,   1., 1., 1., 1., 1.,   1., 1., 1., 1.},
     { 0., 1., 0.,-1., 0.,   0., 0., 1., 1.,-1.,  -1., 1., 1.,-1.,-1.,   0., 0., 0., 0.},
     { 0., 0., 1., 0.,-1.,   0., 0., 1.,-1., 1.,  -1., 0., 0., 0., 0.,   1., 1.,-1.,-1.},
     { 0., 0., 0., 0., 0.,   1.,-1., 0., 0., 0.,   0., 1.,-1., 1.,-1.,   1.,-1., 1.,-1.},
     {-1./3., 2./3.,-1./3., 2./3.,-1./3.,  -1./3.,-1./3., 2./3., 2./3., 2./3.,
       2./3., 2./3., 2./3., 2./3., 2./3.,  -1./3.,-1./3.,-1./3.,-1./3.},
     {-1./3.,-1./3., 2./3.,-1./3., 2./3.,  -1./3.,-1./3., 2./3., 2./3., 2./3.,
       2./3.,-1./3.,-1./3.,-1./3.,-1./3.,   2./3., 2./3., 2./3., 2./3.},
     {-1./3.,-1./3.,-1./3.,-1./3.,-1./3.,   2./3., 2./3.,-1./3.,-1./3.,-1./3.,
      -1./3., 2./3., 2./3., 2./3., 2./3.,   2./3., 2./3., 2./3., 2./3.},
     { 0., 0., 0., 0., 0.,   0., 0., 1.,-1.,-1.,   1., 0., 0., 0., 0.,   0., 0., 0., 0.},
     { 0., 0., 0., 0., 0.,   0., 0., 0., 0., 0.,   0., 1.,-1.,-1., 1.,   0., 0., 0., 0.},
     { 0., 0., 0., 0., 0.,   0., 0., 0., 0., 0.,   0., 0., 0., 0., 0.,   1.,-1.,-1., 1.},
     { 0.,-1./3., 0., 1./3., 0.,   0., 0., 2./3., 2./3.,-2./3.,
      -2./3.,-1./3.,-1./3., 1./3., 1./3.,   0., 0., 0., 0.},
     { 0., 0.,-1./3., 0., 1./3.,   0., 0., 2./3.,-2./3., 2./3.,
      -2./3., 0., 0., 0., 0.,  -1./3.,-1./3., 1./3., 1./3.},
     { 0., 0., 0., 0., 0.,  -1./3., 1./3., 0., 0., 0.,
       0., 2./3.,-2./3., 2./3.,-2./3.,  -1./3., 1./3.,-1./3., 1./3.},
     { 0.,-0.5, 0., 0.5, 0.,   0., 0., 0., 0., 0.,   0., 0.5, 0.5,-0.5,-0.5,   0., 0., 0., 0.},
     { 0., 0., 0., 0., 0.,  -0.5, 0.5, 0., 0., 0.,   0., 0., 0., 0., 0.,   0.5,-0.5, 0.5,-0.5},
     { 0., 0.,-0.5, 0., 0.5,   0., 0., 0., 0., 0.,   0., 0., 0., 0., 0.,   0.5, 0.5,-0.5,-0.5},
     { 1./18.,-5./18.,-5./18.,-5./18.,-5./18.,   2./9., 2./9., 7./18., 7./18., 7./18.,
       7./18.,-1./9.,-1./9.,-1./9.,-1./9.,  -1./9.,-1./9.,-1./9.,-1./9.},
     { 1./14.,-5./14., 1./7.,-5./14., 1./7.,  -3./14.,-3./14., 0., 0., 0.,
       0., 5./14., 5./14., 5./14., 5./14.,  -1./7.,-1./7.,-1./7.,-1./7.},
     { 1./10., 0.,-3./10., 0.,-3./10.,  -3./10.,-3./10., 0., 0., 0.,
       0., 0., 0., 0., 0.,   3./10., 3./10., 3./10., 3./10.}
   };
 
   for(m=0; m<19; m++){
     for(p=0; p<3; p++)
       e[m][p] = evec[m][p];
 
     //weights: rest direction, six axis directions, twelve diagonals.
     if(m == 0)     w_lb[m] = 1./3.;
     else if(m < 7) w_lb[m] = 1./18.;
     else           w_lb[m] = 1./36.;
 
     Ng_lb[m] = ngvals[m];
 
     for(p=0; p<19; p++)
       mg_lb[m][p] = mgvals[m][p];
   }
 
   // Fluid at rest with uniform density on the extended (subNb+3) grid.
   for(i=0; i<subNbx+3; i++){
     for(j=0; j<subNby+3; j++){
       for(k=0; k<subNbz+3; k++){
         for(p=0; p<3; p++)
           u_lb[i][j][k][p]=0.0;
         density_lb[i][j][k] = densityinit;
       }
     }
   }
 
   // Density shared equally among the 19 directions.
   for(i=0; i<subNbx; i++){
     for(j=0; j<subNby; j++){
       for(k=0; k<subNbz; k++){
         for(m=0; m<19; m++)
           f_lb[i][j][k][m] = density_lb[i][j][k]/19.0;
       }
     }
   }
 
 }
 
 //==========================================================================
 // Initialize the equilibrium distribution functions 
 // (this just uses the initial fluid parameters, and assumes no forces).
 //==========================================================================
 void FixLbFluid::initialize_feq(void)
 {
   int i,j,k,p;
   MPI_Request requests[8];
   MPI_Status statuses[8];
   int numrequests;
 
   // If using the standary LB integrator, do not need to send feqn.
   if(typeLB == 1){
     numrequests = 4;
   }else{
     numrequests = 8;
   }
 
   std::fill(&Ff[0][0][0][0],&Ff[0][0][0][0] + (subNbx+3)*(subNby+3)*(subNbz+3)*3,0.0);
   std::fill(&Fftempx[0][0][0][0],&Fftempx[0][0][0][0] + 5*(subNby+3)*(subNbz+3)*3,0.0);
   std::fill(&Fftempy[0][0][0][0],&Fftempy[0][0][0][0] + (subNbx+3)*5*(subNbz+3)*3,0.0);
   std::fill(&Fftempz[0][0][0][0],&Fftempz[0][0][0][0] + (subNbx+3)*(subNby+3)*5*3,0.0);  
 
   if(readrestart == 0){
     step=0;
 
     parametercalc_full();
     (*this.*equilibriumdist)(1,subNbx-1,1,subNby-1,1,subNbz-1);  
 
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[1][1][1][0],1,passxf,comm->procneigh[0][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][1][1][0],1,passxf,comm->procneigh[0][0],25,world,&requests[1]);
     MPI_Isend(&feq[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],25,world,&requests[2]);
     MPI_Irecv(&feq[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[5]);
       MPI_Isend(&feqn[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[7]);
     }  
     MPI_Waitall(numrequests,requests,statuses);
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[0][1][1][0],1,passyf,comm->procneigh[1][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][0][1][0],1,passyf,comm->procneigh[1][0],25,world,&requests[1]);   
     MPI_Isend(&feq[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],25,world,&requests[2]);
     MPI_Irecv(&feq[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[5]);   
       MPI_Isend(&feqn[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[7]);
     }
     MPI_Waitall(numrequests,requests,statuses);
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&requests[1]);
     MPI_Isend(&feq[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&requests[2]);
     MPI_Irecv(&feq[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[5]);
       MPI_Isend(&feqn[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[7]);
     } 
     MPI_Waitall(numrequests,requests,statuses);
     
     //Save feqold.
     if(typeLB == 2){
       for(i=0; i<subNbx; i++)
 	for(j=0; j<subNby; j++)
 	  for(k=0; k<subNbz; k++)
 	    for(p=0; p<numvel; p++){
 	      feqold[i][j][k][p] = feq[i][j][k][p];
 	      feqoldn[i][j][k][p] = feqn[i][j][k][p];
 	    }
     }
   }else{
     step = 1;
     
     read_restartfile();
     
     if(typeLB == 2){
       for(i=0; i<8; i++)
 	requests[i]=MPI_REQUEST_NULL;
       MPI_Isend(&feqold[1][1][1][0],1,passxf,comm->procneigh[0][0],15,world,&requests[0]);
       MPI_Irecv(&feqold[0][1][1][0],1,passxf,comm->procneigh[0][0],25,world,&requests[1]);
       MPI_Isend(&feqold[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],25,world,&requests[2]);
       MPI_Irecv(&feqold[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],15,world,&requests[3]);
       MPI_Isend(&feqoldn[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[4]);
       MPI_Irecv(&feqoldn[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[5]);
       MPI_Isend(&feqoldn[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[6]);
       MPI_Irecv(&feqoldn[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[7]);  
       MPI_Waitall(8,requests,statuses);
       
       for(i=0; i<8; i++)
 	requests[i]=MPI_REQUEST_NULL;
       MPI_Isend(&feqold[0][1][1][0],1,passyf,comm->procneigh[1][0],15,world,&requests[0]);
       MPI_Irecv(&feqold[0][0][1][0],1,passyf,comm->procneigh[1][0],25,world,&requests[1]);   
       MPI_Isend(&feqold[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],25,world,&requests[2]);
       MPI_Irecv(&feqold[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],15,world,&requests[3]);
       MPI_Isend(&feqoldn[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[4]);
       MPI_Irecv(&feqoldn[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[5]);   
       MPI_Isend(&feqoldn[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[6]);
       MPI_Irecv(&feqoldn[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[7]);
       MPI_Waitall(8,requests,statuses);
       
       for(i=0; i<8; i++)
 	requests[i]=MPI_REQUEST_NULL;
       MPI_Isend(&feqold[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&requests[0]);
       MPI_Irecv(&feqold[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&requests[1]);
       MPI_Isend(&feqold[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&requests[2]);
       MPI_Irecv(&feqold[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&requests[3]);   
       MPI_Isend(&feqoldn[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[4]);
       MPI_Irecv(&feqoldn[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[5]);
       MPI_Isend(&feqoldn[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[6]);
       MPI_Irecv(&feqoldn[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[7]); 
       MPI_Waitall(8,requests,statuses);
     }
     parametercalc_full();
   }
 }
 
 //==========================================================================
 // Compute the lattice Boltzmann equilibrium distribution functions for
 // the D3Q15 model.
 //==========================================================================
 void FixLbFluid::equilibriumdist15(int xstart, int xend, int ystart, int yend, int zstart, int zend) {
   // Fills feq[i][j][k][0..14] for every lattice site with
   // xstart <= i < xend, ystart <= j < yend, zstart <= k < zend.
   // When typeLB == 2 the noise-free result is also copied into feqn.
   // density_lb is read at i+-1, j+-1 and k+-1 (and up to three sites away
   // in z next to a fixed wall), so those neighbours must already be valid.
 
   double rho;
   int i, j, k, l, iup, idwn, jup, jdwn, kup, kdwn;
   double Fx_w, Fy_w, Fz_w;
 
   // NOTE(review): total_density is accumulated below but never read in
   // this function.
   double total_density(0.0);
   double drhox, drhoy, drhoz, drhoxx, drhoyy, drhozz;
   double Pxx, Pyy, Pzz, Pxy, Pxz, Pyz;
   double grs, p0;
   double dPdrho;
 
   double S[2][3],std;
   int jj;
  
   // etacov holds the 15 moment coefficients that are projected onto the
   // velocity directions; ghostnoise is the per-direction noise term.
   double etacov[15],ghostnoise;
 
 
   for (i=xstart; i<xend; i++) {
     iup=i+1;
     idwn=i-1;
     for (j=ystart; j<yend; j++) {
       jup=j+1;
       jdwn=j-1;
       for (k=zstart; k<zend; k++) {
 	kup=k+1;
 	kdwn=k-1;
 
 	rho=density_lb[i][j][k];
 	total_density += rho;
 
 	// Derivatives.
 	// Central differences: first derivative (up-down)/2 and second
 	// derivative (up - 2*centre + down) of the density in each direction.
 	drhox = (density_lb[iup][j][k] - density_lb[idwn][j][k])/2.0;
 	drhoxx = (density_lb[iup][j][k] - 2.0*density_lb[i][j][k] + 
 		  density_lb[idwn][j][k]);
 
 	drhoy = (density_lb[i][jup][k] - density_lb[i][jdwn][k])/2.0;
 	drhoyy = (density_lb[i][jup][k] - 2.0*density_lb[i][j][k] + 
 		  density_lb[i][jdwn][k]);
 
 	drhoz = (density_lb[i][j][kup] - density_lb[i][j][kdwn])/2.0;
 	drhozz = (density_lb[i][j][kup] - 2.0*density_lb[i][j][k] + 
 		  density_lb[i][j][kdwn]);
 
 	// Need one-sided derivatives for the boundary of the domain, if fixed boundary
 	// conditions are used.
 	// The lower wall case applies on processors with myloc[2]==0 at k==1
 	// (uses k..k+3); the upper wall case applies on the last processor
 	// in z at k==subNbz-2 (uses k-3..k).
 	if(domain->periodicity[2]==0){
 	  if(comm->myloc[2]==0 && k==1){
 	    drhoz = (-3.0*density_lb[i][j][k] + 4.0*density_lb[i][j][k+1] - 
 		     density_lb[i][j][k+2])/2.0;
 	    drhozz = (-density_lb[i][j][k+3] + 4.0*density_lb[i][j][k+2] - 
 		      5.0*density_lb[i][j][k+1] + 2.0*rho);
 	  }
 	  if(comm->myloc[2]==comm->procgrid[2]-1 && k==subNbz-2){
 	    drhoz = -(-3.0*density_lb[i][j][k] + 4.0*density_lb[i][j][k-1] - 
 		      density_lb[i][j][k-2])/2.0;
 	    drhozz = (-density_lb[i][j][k-3] + 4.0*density_lb[i][j][k-2] - 
 		      5.0*density_lb[i][j][k-1] + 2.0*rho);
 	  }
 	}
 
 	// Squared magnitude of the density gradient.
 	grs = drhox*drhox + drhoy*drhoy + drhoz*drhoz;	
 
 	p0 = rho*a_0-kappa_lb*rho*(drhoxx + drhoyy + drhozz);
 //                   kappa_lb is the square gradient coeff in the pressure tensor
 
 	dPdrho = a_0; //assuming here that kappa_lb = 0.
 
 
 	// Pressure tensor components.  The two integrators differ only in
 	// the prefactor of the velocity/density-gradient term:
 	// (tau-0.5) for typeLB==1 and tau for typeLB==2.
 	// NOTE(review): for any other typeLB value the P.. variables are
 	// left unassigned before being read below.
 	if(typeLB==1){
 	  Pxx = p0 + kappa_lb*(drhox*drhox - 0.5*grs)+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (3.0*u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pyy = p0 + kappa_lb*(drhoy*drhoy - 0.5*grs)+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+3.0*u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pzz = p0 + kappa_lb*(drhoz*drhoz - 0.5*grs)+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+3.0*u_lb[i][j][k][2]*drhoz);
 	  Pxy = kappa_lb*drhox*drhoy+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoy+u_lb[i][j][k][1]*drhox);
 	  Pxz = kappa_lb*drhox*drhoz+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoz+u_lb[i][j][k][2]*drhox);
 	  Pyz = kappa_lb*drhoy*drhoz+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][1]*drhoz+u_lb[i][j][k][2]*drhoy);
 	}else if(typeLB==2){
 	  Pxx = p0 + kappa_lb*(drhox*drhox - 0.5*grs)+tau*(1.0/3.0-dPdrho)*
 	    (3.0*u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pyy = p0 + kappa_lb*(drhoy*drhoy - 0.5*grs)+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+3.0*u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pzz = p0 + kappa_lb*(drhoz*drhoz - 0.5*grs)+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+3.0*u_lb[i][j][k][2]*drhoz);
 	  Pxy = kappa_lb*drhox*drhoy+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoy+u_lb[i][j][k][1]*drhox);
 	  Pxz = kappa_lb*drhox*drhoz+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoz+u_lb[i][j][k][2]*drhox);
 	  Pyz = kappa_lb*drhoy*drhoz+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][1]*drhoz+u_lb[i][j][k][2]*drhoy);
 	}	  
 
 	// Local copy of the Ff entry at this site.
  	Fx_w = Ff[i][j][k][0];
  	Fy_w = Ff[i][j][k][1];
  	Fz_w = Ff[i][j][k][2];
 
 	// Moment coefficients.  [0] density; [1..3] rho*u plus tau times
 	// (Ff + rho*bodyforce); [4..6] xx, yy, zz; [7] xy; [8] yz; [9] xz;
 	// [10..12] zero; [13] rho*ux*uy*uz; [14] K_0*(rho - trace of P).
 	etacov[0] = rho;
 	etacov[1] = rho*u_lb[i][j][k][0] + Fx_w*tau + rho*bodyforcex*tau;
 	etacov[2] = rho*u_lb[i][j][k][1] + Fy_w*tau + rho*bodyforcey*tau;
 	etacov[3] = rho*u_lb[i][j][k][2] + Fz_w*tau + rho*bodyforcez*tau;
 
 	etacov[4] = Pxx + rho*u_lb[i][j][k][0]*u_lb[i][j][k][0] -rho/3. + 
 	  tau*(2.0*u_lb[i][j][k][0]*(Fx_w+rho*bodyforcex));
 	etacov[5] = Pyy + rho*u_lb[i][j][k][1]*u_lb[i][j][k][1] -rho/3. + 
 	  tau*(2.0*u_lb[i][j][k][1]*(Fy_w+rho*bodyforcey));
 	etacov[6] = Pzz + rho*u_lb[i][j][k][2]*u_lb[i][j][k][2] -rho/3. + 
 	  tau*(2.0*u_lb[i][j][k][2]*(Fz_w+rho*bodyforcez));
 	etacov[7] = Pxy + rho*u_lb[i][j][k][0]*u_lb[i][j][k][1] + 
 	  tau*(u_lb[i][j][k][0]*(Fy_w+rho*bodyforcey) + (Fx_w+rho*bodyforcex)*u_lb[i][j][k][1]);
 	etacov[8] = Pyz + rho*u_lb[i][j][k][1]*u_lb[i][j][k][2] + 
 	  tau*(u_lb[i][j][k][1]*(Fz_w+rho*bodyforcez) + (Fy_w+rho*bodyforcey)*u_lb[i][j][k][2]);
 	etacov[9] = Pxz + rho*u_lb[i][j][k][0]*u_lb[i][j][k][2] + 
 	  tau*(u_lb[i][j][k][0]*(Fz_w+rho*bodyforcez) + (Fx_w+rho*bodyforcex)*u_lb[i][j][k][2]);
 	etacov[10] = 0.0; 
 	etacov[11] = 0.0; 
 	etacov[12] = 0.0;
 	etacov[13] = rho*u_lb[i][j][k][0]*u_lb[i][j][k][1]*u_lb[i][j][k][2];
 	const double TrP = Pxx+Pyy+Pzz;
 	etacov[14] = K_0*(rho-TrP);
        
 	// Project the moments onto each of the 15 directions:
 	// feq[l] = sum over ii of w_lb[l]*mg_lb[ii][l]*etacov[ii]*Ng_lb[ii].
 	for (l=0; l<15; l++) {
 
 	  feq[i][j][k][l] = 0.0;
  	  for (int ii=0; ii<15; ii++) 
  	    feq[i][j][k][l] += w_lb[l]*mg_lb[ii][l]*etacov[ii]*Ng_lb[ii];
 
 	  // feqn receives the value before any noise is added below.
 	  if(typeLB == 2){
 	    feqn[i][j][k][l] = feq[i][j][k][l];
 	  }
 	}
 
 	// Optional noise contribution.  etacov[4..14] are overwritten with
 	// random values and their projection, scaled by noisefactor, is
 	// added to feq only.  The random draws happen in this order:
 	// S[0][0..2], S[1][0..2], then etacov[10..14].
 	if(noisestress==1){
 	  std = sqrt(namp*rho);
 
 	  for(jj=0; jj<3; jj++)
 	    S[0][jj] = std*random->gaussian();
 	  for(jj=0; jj<3; jj++)
 	    S[1][jj] = std*random->gaussian(); 
 
 	  // The three diagonal entries are built from the three S[0][*]
 	  // draws with a_0-dependent weights.
 	  etacov[4] = (S[0][0]*sqrt(3.0-3.0*a_0));
 	  etacov[5] = ((1.0-3.0*a_0)*S[0][0]/sqrt(3.0-3.0*a_0)+
 		       sqrt((8.0-12.0*a_0)/(3.0-3.0*a_0))*S[0][1]);
 	  etacov[6] = ((1.0-3.0*a_0)*S[0][0]/sqrt(3.0-3.0*a_0)+
 		       (2.0-6.0*a_0)*S[0][1]/sqrt((8.0-12.0*a_0)*(3.0-3.0*a_0))+
 		       sqrt((5.0-9.0*a_0)/(2.0-3.0*a_0))*S[0][2]);
 	  etacov[7] = S[1][0];
 	  etacov[8] = S[1][1];
 	  etacov[9] = S[1][2];
 
 	  for (l=10; l<15; l++) {
 	    etacov[l] = sqrt(9.0*namp*rho/Ng_lb[l])*random->gaussian();
 	  }
 	  etacov[14] += -K_0*(etacov[4]+etacov[5]+etacov[6]);  //correction from noise to TrP
 
 	  // Only moments 4..14 enter the noise term; 0..3 are left out.
 	  for (l=0; l<15; l++) {
 	    ghostnoise = w_lb[l]*
 	      (mg_lb[4][l]*etacov[4]*Ng_lb[4] + mg_lb[5][l]*etacov[5]*Ng_lb[5] + 
 	       mg_lb[6][l]*etacov[6]*Ng_lb[6] + mg_lb[7][l]*etacov[7]*Ng_lb[7] + 
 	       mg_lb[8][l]*etacov[8]*Ng_lb[8] + mg_lb[9][l]*etacov[9]*Ng_lb[9] + 
 	       mg_lb[10][l]*etacov[10]*Ng_lb[10] + mg_lb[11][l]*etacov[11]*Ng_lb[11]
 	       + mg_lb[12][l]*etacov[12]*Ng_lb[12] + mg_lb[13][l]*etacov[13]*Ng_lb[13]
 	       + mg_lb[14][l]*etacov[14]*Ng_lb[14]);
 	    feq[i][j][k][l] += ghostnoise*noisefactor;
 	  }
 	}	
       }
     }
   }
 }
 
 //==========================================================================
 // Compute the lattice Boltzmann equilibrium distribution functions for
 // the D3Q19 model.
 //==========================================================================
 void FixLbFluid::equilibriumdist19(int xstart, int xend, int ystart, int yend, int zstart, int zend) {
   // Fills feq[i][j][k][0..18] for every lattice site with
   // xstart <= i < xend, ystart <= j < yend, zstart <= k < zend.
   // When typeLB == 2 the noise-free result is also copied into feqn.
   // density_lb is read at i+-1, j+-1 and k+-1 (and up to three sites away
   // in z next to a fixed wall), so those neighbours must already be valid.
 
   double rho;
   int i, j, k, l, iup, idwn, jup, jdwn, kup, kdwn;
   double Fx_w, Fy_w, Fz_w;
 
   // NOTE(review): total_density is accumulated below but never read in
   // this function.
   double total_density(0.0);
   double drhox, drhoy, drhoz, drhoxx, drhoyy, drhozz;
   double Pxx, Pyy, Pzz, Pxy, Pxz, Pyz;
   double grs, p0;
   double dPdrho;
 
   double S[2][3],std;
   int jj;
  
   // etacov holds the 19 moment coefficients that are projected onto the
   // velocity directions; ghostnoise is the per-direction noise term.
   double etacov[19],ghostnoise;
 
   for (i=xstart; i<xend; i++) {
     iup=i+1;
     idwn=i-1;
     for (j=ystart; j<yend; j++) {
       jup=j+1;
       jdwn=j-1;
       for (k=zstart; k<zend; k++) {
 	kup=k+1;
 	kdwn=k-1;
 
 	rho=density_lb[i][j][k];
 	total_density += rho;
 
 	// Derivatives.
 	// Central differences: first derivative (up-down)/2 and second
 	// derivative (up - 2*centre + down) of the density in each direction.
 	drhox = (density_lb[iup][j][k] - density_lb[idwn][j][k])/2.0;
 	drhoxx = (density_lb[iup][j][k] - 2.0*density_lb[i][j][k] + 
 		  density_lb[idwn][j][k]);
 
 	drhoy = (density_lb[i][jup][k] - density_lb[i][jdwn][k])/2.0;
 	drhoyy = (density_lb[i][jup][k] - 2.0*density_lb[i][j][k] + 
 		  density_lb[i][jdwn][k]);
 
 	drhoz = (density_lb[i][j][kup] - density_lb[i][j][kdwn])/2.0;
 	drhozz = (density_lb[i][j][kup] - 2.0*density_lb[i][j][k] + 
 		  density_lb[i][j][kdwn]);
 
 	// Need one-sided derivatives for the boundary of the domain, if fixed boundary
 	// conditions are used.
 	// The lower wall case applies on processors with myloc[2]==0 at k==1
 	// (uses k..k+3); the upper wall case applies on the last processor
 	// in z at k==subNbz-2 (uses k-3..k).
 	if(domain->periodicity[2]==0){
 	  if(comm->myloc[2]==0 && k==1){
 	    drhoz = (-3.0*density_lb[i][j][k] + 4.0*density_lb[i][j][k+1] - 
 		     density_lb[i][j][k+2])/2.0;
 	    drhozz = (-density_lb[i][j][k+3] + 4.0*density_lb[i][j][k+2] - 
 		      5.0*density_lb[i][j][k+1] + 2.0*rho);
 	  }
 	  if(comm->myloc[2]==comm->procgrid[2]-1 && k==subNbz-2){
 	    drhoz = -(-3.0*density_lb[i][j][k] + 4.0*density_lb[i][j][k-1] - 
 		      density_lb[i][j][k-2])/2.0;
 	    drhozz = (-density_lb[i][j][k-3] + 4.0*density_lb[i][j][k-2] - 
 		      5.0*density_lb[i][j][k-1] + 2.0*rho);
 	  }
 	}
 
 	// Squared magnitude of the density gradient.
 	grs = drhox*drhox + drhoy*drhoy + drhoz*drhoz;	
 
 	p0 = rho*a_0-kappa_lb*rho*(drhoxx + drhoyy + drhozz);
 //                   kappa_lb is the square gradient coeff in the pressure tensor
 
 	dPdrho = a_0; //assuming here that kappa_lb = 0.
 
 
 	// Pressure tensor components.  The two integrators differ only in
 	// the prefactor of the velocity/density-gradient term:
 	// (tau-0.5) for typeLB==1 and tau for typeLB==2.
 	// NOTE(review): for any other typeLB value the P.. variables are
 	// left unassigned before being read below.
 	if(typeLB==1){
 	  Pxx = p0 + kappa_lb*(drhox*drhox - 0.5*grs)+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (3.0*u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pyy = p0 + kappa_lb*(drhoy*drhoy - 0.5*grs)+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+3.0*u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pzz = p0 + kappa_lb*(drhoz*drhoz - 0.5*grs)+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+3.0*u_lb[i][j][k][2]*drhoz);
 	  Pxy = kappa_lb*drhox*drhoy+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoy+u_lb[i][j][k][1]*drhox);
 	  Pxz = kappa_lb*drhox*drhoz+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoz+u_lb[i][j][k][2]*drhox);
 	  Pyz = kappa_lb*drhoy*drhoz+(tau-0.5)*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][1]*drhoz+u_lb[i][j][k][2]*drhoy);
 	}else if(typeLB==2){
 	  Pxx = p0 + kappa_lb*(drhox*drhox - 0.5*grs)+tau*(1.0/3.0-dPdrho)*
 	    (3.0*u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pyy = p0 + kappa_lb*(drhoy*drhoy - 0.5*grs)+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+3.0*u_lb[i][j][k][1]*drhoy+u_lb[i][j][k][2]*drhoz);
 	  Pzz = p0 + kappa_lb*(drhoz*drhoz - 0.5*grs)+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhox+u_lb[i][j][k][1]*drhoy+3.0*u_lb[i][j][k][2]*drhoz);
 	  Pxy = kappa_lb*drhox*drhoy+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoy+u_lb[i][j][k][1]*drhox);
 	  Pxz = kappa_lb*drhox*drhoz+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][0]*drhoz+u_lb[i][j][k][2]*drhox);
 	  Pyz = kappa_lb*drhoy*drhoz+tau*(1.0/3.0-dPdrho)*
 	    (u_lb[i][j][k][1]*drhoz+u_lb[i][j][k][2]*drhoy);
 	}	  
 
 	// Local copy of the Ff entry at this site.
  	Fx_w = Ff[i][j][k][0];
  	Fy_w = Ff[i][j][k][1];
  	Fz_w = Ff[i][j][k][2];
 
 	// Moment coefficients.  [0] density; [1..3] rho*u plus tau times
 	// (Ff + rho*bodyforce); [4..6] xx, yy, zz; [7] xy; [8] xz; [9] yz;
 	// [10..18] zero.
 	etacov[0] = rho;
 	etacov[1] = rho*u_lb[i][j][k][0] + Fx_w*tau + rho*bodyforcex*tau;
 	etacov[2] = rho*u_lb[i][j][k][1] + Fy_w*tau + rho*bodyforcey*tau;
 	etacov[3] = rho*u_lb[i][j][k][2] + Fz_w*tau + rho*bodyforcez*tau;
 
 	etacov[4] = Pxx + rho*u_lb[i][j][k][0]*u_lb[i][j][k][0] -rho/3. + 
 	  tau*(2.0*u_lb[i][j][k][0]*(Fx_w+rho*bodyforcex));
 	etacov[5] = Pyy + rho*u_lb[i][j][k][1]*u_lb[i][j][k][1] -rho/3. + 
 	  tau*(2.0*u_lb[i][j][k][1]*(Fy_w+rho*bodyforcey));
 	etacov[6] = Pzz + rho*u_lb[i][j][k][2]*u_lb[i][j][k][2] -rho/3. + 
 	  tau*(2.0*u_lb[i][j][k][2]*(Fz_w+rho*bodyforcez));
 	etacov[7] = Pxy + rho*u_lb[i][j][k][0]*u_lb[i][j][k][1] + 
 	  tau*(u_lb[i][j][k][0]*(Fy_w+rho*bodyforcey) + (Fx_w+rho*bodyforcex)*u_lb[i][j][k][1]);
 	etacov[8] = Pxz + rho*u_lb[i][j][k][0]*u_lb[i][j][k][2] + 
 	  tau*(u_lb[i][j][k][0]*(Fz_w+rho*bodyforcez) + (Fx_w+rho*bodyforcex)*u_lb[i][j][k][2]);
 	etacov[9] = Pyz + rho*u_lb[i][j][k][1]*u_lb[i][j][k][2] + 
 	  tau*(u_lb[i][j][k][1]*(Fz_w+rho*bodyforcez) + (Fy_w+rho*bodyforcey)*u_lb[i][j][k][2]);
 	etacov[10] = 0.0; 
 	etacov[11] = 0.0; 
 	etacov[12] = 0.0;
 	etacov[13] = 0.0;
 	etacov[14] = 0.0;
 	etacov[15] = 0.0;
 	etacov[16] = 0.0;
 	etacov[17] = 0.0;
 	etacov[18] = 0.0;
 	
 	// Project the moments onto each of the 19 directions:
 	// feq[l] = sum over ii of w_lb[l]*mg_lb[ii][l]*etacov[ii]*Ng_lb[ii].
 	for (l=0; l<19; l++) {
 
 	  feq[i][j][k][l] = 0.0;
  	  for (int ii=0; ii<19; ii++) 
  	    feq[i][j][k][l] += w_lb[l]*mg_lb[ii][l]*etacov[ii]*Ng_lb[ii];
 
 	  // feqn receives the value before any noise is added below.
 	  if(typeLB == 2){
 	    feqn[i][j][k][l] = feq[i][j][k][l];
 	  }
 	}
 
 	// Optional noise contribution.  etacov[4..18] are overwritten with
 	// random values and their projection, scaled by noisefactor, is
 	// added to feq only.  The random draws happen in this order:
 	// S[0][0..2], S[1][0..2], then etacov[10..18].
 	if(noisestress==1){
 	  std = sqrt(namp*rho);
 
 	  for(jj=0; jj<3; jj++)
 	    S[0][jj] = std*random->gaussian();
 	  for(jj=0; jj<3; jj++)
 	    S[1][jj] = std*random->gaussian(); 
 
 	  // The three diagonal entries are built from the three S[0][*]
 	  // draws with a_0-dependent weights.
 	  etacov[4] = (S[0][0]*sqrt(3.0-3.0*a_0));
 	  etacov[5] = ((1.0-3.0*a_0)*S[0][0]/sqrt(3.0-3.0*a_0)+
 		       sqrt((8.0-12.0*a_0)/(3.0-3.0*a_0))*S[0][1]);
 	  etacov[6] = ((1.0-3.0*a_0)*S[0][0]/sqrt(3.0-3.0*a_0)+
 		       (2.0-6.0*a_0)*S[0][1]/sqrt((8.0-12.0*a_0)*(3.0-3.0*a_0))+
 		       sqrt((5.0-9.0*a_0)/(2.0-3.0*a_0))*S[0][2]);
 	  etacov[7] = S[1][0];
 	  etacov[8] = S[1][1];
 	  etacov[9] = S[1][2];
 
 	  for (l=10; l<19; l++) {
 	    etacov[l] = sqrt(9.0*namp*rho/Ng_lb[l])*random->gaussian();
 	  }
 	  
 	  // Only moments 4..18 enter the noise term; 0..3 are left out.
 	  for (l=0; l<19; l++) {
 	    ghostnoise = w_lb[l]*
 	      (mg_lb[4][l]*etacov[4]*Ng_lb[4] + mg_lb[5][l]*etacov[5]*Ng_lb[5] + 
 	       mg_lb[6][l]*etacov[6]*Ng_lb[6] + mg_lb[7][l]*etacov[7]*Ng_lb[7] + 
 	       mg_lb[8][l]*etacov[8]*Ng_lb[8] + mg_lb[9][l]*etacov[9]*Ng_lb[9] + 
 	       mg_lb[10][l]*etacov[10]*Ng_lb[10] + mg_lb[11][l]*etacov[11]*Ng_lb[11]
 	       + mg_lb[12][l]*etacov[12]*Ng_lb[12] + mg_lb[13][l]*etacov[13]*Ng_lb[13]
 	       + mg_lb[14][l]*etacov[14]*Ng_lb[14] + mg_lb[15][l]*etacov[15]*Ng_lb[15]
 	       + mg_lb[16][l]*etacov[16]*Ng_lb[16] + mg_lb[17][l]*etacov[17]*Ng_lb[17]
 	       + mg_lb[18][l]*etacov[18]*Ng_lb[18]);
 	    feq[i][j][k][l] += ghostnoise*noisefactor;
 	  }
 	}
 	
       }
     }
   }
 }
 
 //==========================================================================
 // Calculate the fluid density and velocity over the entire simulation
 // domain.
 //==========================================================================
 void FixLbFluid::parametercalc_full(void)
 {
   MPI_Request requests[4];
   MPI_Status statuses[4];
   MPI_Request requests2[12];
   MPI_Status statuses2[12];
   int numrequests;
   int i;
 
   //--------------------------------------------------------------------------
   // send the boundaries of f_lb, as they will be needed later by the update
   // routine, and use these to calculate the density and velocity on the 
   // boundary.
   //--------------------------------------------------------------------------
   for(i=0; i<4; i++)
     requests[i]=MPI_REQUEST_NULL;
   MPI_Isend(&f_lb[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[0]);
   MPI_Irecv(&f_lb[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[1]);
   MPI_Isend(&f_lb[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[2]);
   MPI_Irecv(&f_lb[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[3]);
   parametercalc_part(1,subNbx-1,1,subNby-1,1,subNbz-1);
   MPI_Waitall(4,requests,statuses);
 
   for(i=0; i<4; i++)
     requests[i]=MPI_REQUEST_NULL;
   MPI_Isend(&f_lb[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[0]);
   MPI_Irecv(&f_lb[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[1]);   
   MPI_Isend(&f_lb[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[2]);
   MPI_Irecv(&f_lb[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[3]);
   parametercalc_part(0,1,1,subNby-1,1,subNbz-1);
   parametercalc_part(subNbx-1,subNbx,1,subNby-1,1,subNbz-1);
   MPI_Waitall(4,requests,statuses);
 
   for(i=0; i<4; i++)
     requests[i]=MPI_REQUEST_NULL;
   MPI_Isend(&f_lb[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[0]);
   MPI_Irecv(&f_lb[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[1]);
   MPI_Isend(&f_lb[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[2]);
   MPI_Irecv(&f_lb[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[3]);
   parametercalc_part(0,subNbx,0,1,1,subNbz-1);
   parametercalc_part(0,subNbx,subNby-1,subNby,1,subNbz-1);
   MPI_Waitall(4,requests,statuses);
   
   parametercalc_part(0,subNbx,0,subNby,0,1);
   parametercalc_part(0,subNbx,0,subNby,subNbz-1,subNbz);
 
   //--------------------------------------------------------------------------
   // Send the remaining portions of the u array (and density array if Gamma 
   // is set the default way).
   //--------------------------------------------------------------------------
   if(setGamma == 0) numrequests = 12;
   else numrequests = 6;
 
   for(i=0; i<numrequests; i++)
     requests2[i]=MPI_REQUEST_NULL;
   MPI_Isend(&u_lb[2][0][0][0],1,passxu,comm->procneigh[0][0],10,world,&requests2[0]);
   MPI_Isend(&u_lb[3][0][0][0],1,passxu,comm->procneigh[0][0],20,world,&requests2[1]);
   MPI_Isend(&u_lb[subNbx-3][0][0][0],1,passxu,comm->procneigh[0][1],30,world,&requests2[2]);
   MPI_Irecv(&u_lb[subNbx][0][0][0],1,passxu,comm->procneigh[0][1],10,world,&requests2[3]);
   MPI_Irecv(&u_lb[subNbx+1][0][0][0],1,passxu,comm->procneigh[0][1],20,world,&requests2[4]);
   MPI_Irecv(&u_lb[subNbx+2][0][0][0],1,passxu,comm->procneigh[0][0],30,world,&requests2[5]);
   if(setGamma==0){
     MPI_Isend(&density_lb[2][0][0],1,passxrho,comm->procneigh[0][0],40,world,&requests2[6]);
     MPI_Isend(&density_lb[3][0][0],1,passxrho,comm->procneigh[0][0],50,world,&requests2[7]);
     MPI_Isend(&density_lb[subNbx-3][0][0],1,passxrho,comm->procneigh[0][1],60,world,&requests2[8]);
     MPI_Irecv(&density_lb[subNbx][0][0],1,passxrho,comm->procneigh[0][1],40,world,&requests2[9]);
     MPI_Irecv(&density_lb[subNbx+1][0][0],1,passxrho,comm->procneigh[0][1],50,world,&requests2[10]);
     MPI_Irecv(&density_lb[subNbx+2][0][0],1,passxrho,comm->procneigh[0][0],60,world,&requests2[11]);
   }
   MPI_Waitall(numrequests,requests2,statuses2);
 
   for(i=0; i<numrequests; i++)
     requests2[i]=MPI_REQUEST_NULL;
   MPI_Isend(&u_lb[0][2][0][0],1,passyu,comm->procneigh[1][0],10,world,&requests2[0]);
   MPI_Isend(&u_lb[0][3][0][0],1,passyu,comm->procneigh[1][0],20,world,&requests2[1]);
   MPI_Isend(&u_lb[0][subNby-3][0][0],1,passyu,comm->procneigh[1][1],30,world,&requests2[2]);
   MPI_Irecv(&u_lb[0][subNby][0][0],1,passyu,comm->procneigh[1][1],10,world,&requests2[3]);
   MPI_Irecv(&u_lb[0][subNby+1][0][0],1,passyu,comm->procneigh[1][1],20,world,&requests2[4]);
   MPI_Irecv(&u_lb[0][subNby+2][0][0],1,passyu,comm->procneigh[1][0],30,world,&requests2[5]);
   if(setGamma==0){
     MPI_Isend(&density_lb[0][2][0],1,passyrho,comm->procneigh[1][0],40,world,&requests2[6]);
     MPI_Isend(&density_lb[0][3][0],1,passyrho,comm->procneigh[1][0],50,world,&requests2[7]);
     MPI_Isend(&density_lb[0][subNby-3][0],1,passyrho,comm->procneigh[1][1],60,world,&requests2[8]);
     MPI_Irecv(&density_lb[0][subNby][0],1,passyrho,comm->procneigh[1][1],40,world,&requests2[9]);
     MPI_Irecv(&density_lb[0][subNby+1][0],1,passyrho,comm->procneigh[1][1],50,world,&requests2[10]);
     MPI_Irecv(&density_lb[0][subNby+2][0],1,passyrho,comm->procneigh[1][0],60,world,&requests2[11]);
   }
   MPI_Waitall(numrequests,requests2,statuses2);
 
   for(i=0; i<12; i++)
     requests2[i]=MPI_REQUEST_NULL;
   int requestcount=0;
   if(domain->periodicity[2]!=0 || comm->myloc[2] != 0){
     MPI_Isend(&u_lb[0][0][2][0],1,passzu,comm->procneigh[2][0],10,world,&requests2[requestcount]);
     MPI_Isend(&u_lb[0][0][3][0],1,passzu,comm->procneigh[2][0],20,world,&requests2[requestcount+1]);
     MPI_Irecv(&u_lb[0][0][subNbz+2][0],1,passzu,comm->procneigh[2][0],30,world,&requests2[requestcount+2]);
     requestcount=requestcount+3;
     if(setGamma==0){
       MPI_Isend(&density_lb[0][0][2],1,passzrho,comm->procneigh[2][0],40,world,&requests2[requestcount]);
       MPI_Isend(&density_lb[0][0][3],1,passzrho,comm->procneigh[2][0],50,world,&requests2[requestcount+1]);
       MPI_Irecv(&density_lb[0][0][subNbz+2],1,passzrho,comm->procneigh[2][0],60,world,&requests2[requestcount+2]);
       requestcount=requestcount+3;
     }
   }
   if(domain->periodicity[2]!=0 || comm->myloc[2] != (comm->procgrid[2]-1)){
     MPI_Isend(&u_lb[0][0][subNbz-3][0],1,passzu,comm->procneigh[2][1],30,world,&requests2[requestcount]);
     MPI_Irecv(&u_lb[0][0][subNbz][0],1,passzu,comm->procneigh[2][1],10,world,&requests2[requestcount+1]);
     MPI_Irecv(&u_lb[0][0][subNbz+1][0],1,passzu,comm->procneigh[2][1],20,world,&requests2[requestcount+2]);
     requestcount=requestcount+3;
     if(setGamma==0){
       MPI_Isend(&density_lb[0][0][subNbz-3],1,passzrho,comm->procneigh[2][1],60,world,&requests2[requestcount]);
       MPI_Irecv(&density_lb[0][0][subNbz],1,passzrho,comm->procneigh[2][1],40,world,&requests2[requestcount+1]);
       MPI_Irecv(&density_lb[0][0][subNbz+1],1,passzrho,comm->procneigh[2][1],50,world,&requests2[requestcount+2]);
       requestcount=requestcount+3;
     }
   }    
   MPI_Waitall(requestcount,requests2,statuses2); 
 
 }
 
 //==========================================================================
 // Calculate the fluid density and velocity over a simulation volume
 // specified by xstart,xend; ystart,yend; zstart,zend.
 //==========================================================================
 void FixLbFluid::parametercalc_part(int xstart, int xend, int ystart, int yend, int zstart, int zend)
 {
   // For every site in [xstart,xend) x [ystart,yend) x [zstart,zend):
   // density_lb is the sum of f_lb over the numvel directions, and u_lb is
   // the e-weighted sum of f_lb divided by that density.
 
   // For the on-lattice wall scheme (non-periodic z) the z velocity is set
   // to zero at k==1 on the bottom processors and at k==subNbz-2 on the top
   // processors.
   const bool fixedz = (domain->periodicity[2]==0);
   const bool atBottom = fixedz && (comm->myloc[2]==0);
   const bool atTop = fixedz && (comm->myloc[2]==comm->procgrid[2]-1);
 
   for(int ix=xstart; ix<xend; ix++){
     for(int iy=ystart; iy<yend; iy++){
       for(int iz=zstart; iz<zend; iz++){
 
 	// Accumulate the zeroth and first moments of f_lb.
 	double rhosum = 0.0;
 	double px = 0.0;
 	double py = 0.0;
 	double pz = 0.0;
 	for(int q=0; q<numvel; q++){
 	  rhosum += f_lb[ix][iy][iz][q];
 
 	  px += f_lb[ix][iy][iz][q]*e[q][0];
 	  py += f_lb[ix][iy][iz][q]*e[q][1];
 	  pz += f_lb[ix][iy][iz][q]*e[q][2];
 	}
 
 	if(atBottom && iz==1) pz = 0.0;
 	if(atTop && iz==subNbz-2) pz = 0.0;
 
 	density_lb[ix][iy][iz] = rhosum;
 	u_lb[ix][iy][iz][0] = px/rhosum;
 	u_lb[ix][iy][iz][1] = py/rhosum;
 	u_lb[ix][iy][iz][2] = pz/rhosum;
       }
     }
   }
 
 }
 
 //==========================================================================
 // Update the distribution function over a simulation volume specified 
 // by xstart,xend; ystart,yend; zstart,zend.
 //==========================================================================
 void FixLbFluid::update_periodic(int xstart, int xend, int ystart, int yend, int zstart, int zend)
 {
   int i,j,k,m;
   int imod,jmod,kmod,imodm,jmodm,kmodm;
 
   for(i=xstart; i<xend; i++)
     for(j=ystart; j<yend; j++)
       for(k=zstart; k<zend; k++){
 	
 	if(typeLB==1){
 	  for(m=0; m<numvel; m++){
 	    imod = i-e[m][0];
 	    jmod = j-e[m][1];
 	    kmod = k-e[m][2];
 
 	    fnew[i][j][k][m] = f_lb[imod][jmod][kmod][m] + (feq[imod][jmod][kmod][m]-f_lb[imod][jmod][kmod][m])/tau;
 	  }	    
 	}else if(typeLB==2){
 	  for(m=0; m<numvel; m++){
 	    imod = i-e[m][0];
 	    jmod = j-e[m][1];
 	    kmod = k-e[m][2];
 	    
 	    fnew[i][j][k][m] = feq[imod][jmod][kmod][m] + (f_lb[imod][jmod][kmod][m] - feq[imod][jmod][kmod][m])*expminusdtovertau;
 	  }
 	  
 	  fnew[i][j][k][0]+=Dcoeff*(feq[i][j][k][0]-feqold[i][j][k][0]);
 	  for(m=1; m<numvel; m++){
 	    imod = i-e[m][0];
 	    jmod = j-e[m][1];
 	    kmod = k-e[m][2];
 	    imodm = i+e[m][0];
 	    jmodm = j+e[m][1];
 	    kmodm = k+e[m][2];
 	    
 	     fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) + (0.5-Dcoeff*(tau+0.5))*
 	       (feqn[imodm][jmodm][kmodm][m] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 
 	  }
 	}		   
       }  
 }
 
 //==========================================================================
 //   Print the fluid properties to the screen.
 //==========================================================================
 void FixLbFluid::streamout(void)
 {
   // Gathers density_lb and u_lb from every processor onto processor 0,
   // which prints one line per global lattice site (density, then the three
   // velocity components, each multiplied by a unit conversion factor).
   int ix,iy,iz,c;
   int ilo,jlo,klo;
   int ihi,jhi,khi;
   int rank;
   int size,sizeloc;
   MPI_Request request_send,request_recv;
   MPI_Status status;
 
   //--------------------------------------------------------------------------
   // **Uncomment in order to test conservation of mass and momentum.
   //--------------------------------------------------------------------------
   // massloc=0.0;
   // momentumloc[0]=momentumloc[1]=momentumloc[2]=0.0;
   // for(i=1; i<subNbx-1; i++){
   //   for(j=1; j<subNby-1; j++){
   //     for(k=1; k<subNbz-1; k++){
   // 	massloc += density_lb[i][j][k];
   // 	momentumloc[0] += density_lb[i][j][k]*u_lb[i][j][k][0];
   // 	momentumloc[1] += density_lb[i][j][k]*u_lb[i][j][k][1];
   // 	momentumloc[2] += density_lb[i][j][k]*u_lb[i][j][k][2];
   //     }
   //   }
   // }
 
   // MPI_Allreduce(&massloc,&mass,1,MPI_DOUBLE,MPI_SUM,world);
   // MPI_Allreduce(&momentumloc[0],&momentum[0],3,MPI_DOUBLE,MPI_SUM,world);
 
   // if(comm->me==0){
   //   printf("%16.12f %16.12f %16.12f %16.12f\n",mass*dm_lb,momentum[0]*dm_lb*dx_lb/dt_lb,momentum[1]*dm_lb*dx_lb/dt_lb,momentum[2]*dm_lb*dx_lb/dt_lb);
   //  }
 
   // Every message carries the largest local array size found on any
   // processor.
   sizeloc=(subNbx*subNby*subNbz*4);
   MPI_Allreduce(&sizeloc,&size,1,MPI_INT,MPI_MAX,world);
 
   //--------------------------------------------------------------------------
   // All processors other than 0: pack the local fields and send them.
   //--------------------------------------------------------------------------
   if(me != 0){
     // Global index range covered by this processor.  The last processor
     // in z uses a different stride when z is not periodic.
     const bool topfixed = (domain->periodicity[2]==0) &&
       (comm->myloc[2]==comm->procgrid[2]-1);
 
     ilo = comm->myloc[0]*(subNbx-2);
     jlo = comm->myloc[1]*(subNby-2);
     klo = comm->myloc[2]*(topfixed ? (subNbz-3) : (subNbz-2));
     ihi = ilo+subNbx-2;
     jhi = jlo+subNby-2;
     khi = klo+subNbz-2;
 
     for(ix=0; ix<subNbx; ix++){
       for(iy=0; iy<subNby; iy++){
 	for(iz=0; iz<subNbz; iz++){
 	  buf[ix][iy][iz][0] = density_lb[ix][iy][iz];
 	  buf[ix][iy][iz][1] = u_lb[ix][iy][iz][0];
 	  buf[ix][iy][iz][2] = u_lb[ix][iy][iz][1];
 	  buf[ix][iy][iz][3] = u_lb[ix][iy][iz][2];
 	}
       }
     }
 
     // The first two entries along z of the (0,0) column are overwritten
     // with the index range so that the receiver can place the data.
     buf[0][0][0][0] = ilo;
     buf[0][0][0][1] = jlo;
     buf[0][0][0][2] = klo;
     buf[0][0][1][0] = ihi;
     buf[0][0][1][1] = jhi;
     buf[0][0][1][2] = khi;
 
     MPI_Isend(&buf[0][0][0][0],size,MPI_DOUBLE,0,0,world,&request_send);
     MPI_Wait(&request_send,&status);
     return;
   }
 
   //--------------------------------------------------------------------------
   // Processor 0: store the local interior sites, then collect the data of
   // the other processors one at a time.
   //--------------------------------------------------------------------------
   for(ix=1; ix<subNbx-1; ix++){
     for(iy=1; iy<subNby-1; iy++){
       for(iz=1; iz<subNbz-1; iz++){
 	altogether[ix-1][iy-1][iz-1][0] = density_lb[ix][iy][iz];
 	altogether[ix-1][iy-1][iz-1][1] = u_lb[ix][iy][iz][0];
 	altogether[ix-1][iy-1][iz-1][2] = u_lb[ix][iy][iz][1];
 	altogether[ix-1][iy-1][iz-1][3] = u_lb[ix][iy][iz][2];
       }
     }
   }
 
   for(rank=1; rank < comm->nprocs; rank++){
     MPI_Irecv(&buf[0][0][0][0],size,MPI_DOUBLE,rank,0,world,&request_recv);
     MPI_Wait(&request_recv,&status);
 
     // Index range written by the sender into the head of the buffer.
     ilo = static_cast<int> (buf[0][0][0][0]);
     jlo = static_cast<int> (buf[0][0][0][1]);
     klo = static_cast<int> (buf[0][0][0][2]);
     ihi = static_cast<int> (buf[0][0][1][0]);
     jhi = static_cast<int> (buf[0][0][1][1]);
     khi = static_cast<int> (buf[0][0][1][2]);
 
     // The +1 offsets skip the first layer of the sender's local grid.
     for(ix=ilo; ix<ihi; ix++){
       for(iy=jlo; iy<jhi; iy++){
 	for(iz=klo; iz<khi; iz++){
 	  for(c=0; c<4; c++){
 	    altogether[ix][iy][iz][c] = buf[ix-ilo+1][iy-jlo+1][iz-klo+1][c];
 	  }
 	}
       }
     }
   }
 
   //i = Nbx/2;
   //j = Nby/2;
   for(ix=0; ix<Nbx; ix++){
     for(iy=0; iy<Nby; iy++){
       for(iz=0; iz<Nbz; iz++){
 	printf("%16.12f %16.12f %16.12f %16.12f\n",altogether[ix][iy][iz][0]*dm_lb/dx_lb/dx_lb/dx_lb,altogether[ix][iy][iz][1]*dx_lb/dt_lb,altogether[ix][iy][iz][2]*dx_lb/dt_lb,altogether[ix][iy][iz][3]*dx_lb/dt_lb);
       }
     }
   }
 
 }
 
 //==========================================================================
 // Update the distribution functions over the entire simulation domain for
 // the D3Q15 model.
 //==========================================================================
 void FixLbFluid::update_full15(void)
 {
   
   //--------------------------------------------------------------------------
   // Outline of this routine (15-velocity model, loops run over m = 0..14):
   //   1. Exchange the ghost layers of feq (and of feqn when typeLB == 2)
   //      with the x, then y, then z neighbouring processors using
   //      non-blocking sends/receives.  While each exchange is outstanding,
   //      update_periodic() is called on a sub-range of the local grid.
   //      NOTE(review): the six arguments of update_periodic() look like
   //      (xstart,xend,ystart,yend,zstart,zend) index ranges -- confirm
   //      against its definition.
   //   2. Update the two outermost z layers (k=1 and k=subNbz-2).
   //   3. If z is non-periodic, the processors at the bottom/top of the
   //      processor grid exchange the z ghost layers of fnew and then
   //      overwrite selected populations of fnew on the k=1 / k=subNbz-2
   //      layers using the wall terms vwbt / vwtp.
   // Message tags: feq uses 15/25, feqn uses 10/20.  A send to the lower
   // neighbour (procneigh[d][0]) with tag 15 (10) is matched by the receive
   // from the upper neighbour (procneigh[d][1]) with the same tag, and vice
   // versa for tag 25 (20).
   //--------------------------------------------------------------------------
    MPI_Request req_send15,req_recv15;   // fnew z-exchange requests, tag 15
    MPI_Request req_send25,req_recv25;   // fnew z-exchange requests, tag 25
    MPI_Request requests[8];             // [0..3] feq exchange, [4..7] feqn exchange
    MPI_Status statuses[8];
    int numrequests;
    double tmp1;                         // sum of ghost-layer populations used at the walls
    MPI_Status status;
    double rb;                           // sum of populations at a wall node (see below)
    int i,j,k,m;
    int imod,jmod,kmod;                  // site (i,j,k) - e[m]
    int imodm,jmodm,kmodm;               // site (i,j,k) + e[m]
 
    //--------------------------------------------------------------------------
    // If using the standard LB integrator, do not need to send info about feqn.
    //--------------------------------------------------------------------------
    if(typeLB == 1){
      numrequests = 4;
    }else{
      numrequests = 8;
    }
  
    //--------------------------------------------------------------------------
    // Fixed z boundary conditions.
    //--------------------------------------------------------------------------
    if(domain->periodicity[2]==0){  
  
      // x-direction exchange of feq (and feqn).  Requests are reset to
      // MPI_REQUEST_NULL first so that MPI_Waitall can be called with
      // numrequests entries even if fewer were actually posted.
      for(i=0; i<numrequests; i++)
        requests[i]=MPI_REQUEST_NULL;
      MPI_Isend(&feq[1][1][1][0],1,passxf,comm->procneigh[0][0],15,world,&requests[0]);
      MPI_Irecv(&feq[0][1][1][0],1,passxf,comm->procneigh[0][0],25,world,&requests[1]);
      MPI_Isend(&feq[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],25,world,&requests[2]);
      MPI_Irecv(&feq[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],15,world,&requests[3]);
      if(typeLB == 2){
        MPI_Isend(&feqn[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[4]);
        MPI_Irecv(&feqn[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[5]);
        MPI_Isend(&feqn[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[6]);
        MPI_Irecv(&feqn[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[7]);
      }
      // Work on the range [2,subNbx-2) x [2,subNby-2) x [2,subNbz-2) while the
      // x exchange is in flight.
      update_periodic(2,subNbx-2,2,subNby-2,2,subNbz-2);
      MPI_Waitall(numrequests,requests,statuses);
 
 
      // y-direction exchange.  The buffers start at x index 0, i.e. they are
      // addressed from the x ghost layer that was just received.
      for(i=0; i<numrequests; i++)
        requests[i]=MPI_REQUEST_NULL;
      MPI_Isend(&feq[0][1][1][0],1,passyf,comm->procneigh[1][0],15,world,&requests[0]);
      MPI_Irecv(&feq[0][0][1][0],1,passyf,comm->procneigh[1][0],25,world,&requests[1]);   
      MPI_Isend(&feq[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],25,world,&requests[2]);
      MPI_Irecv(&feq[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],15,world,&requests[3]);
      if(typeLB == 2){
        MPI_Isend(&feqn[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[4]);
        MPI_Irecv(&feqn[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[5]);   
        MPI_Isend(&feqn[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[6]);
        MPI_Irecv(&feqn[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[7]);
      }
      // Meanwhile handle the two x slabs i=1 and i=subNbx-2.
      update_periodic(1,2,2,subNby-2,2,subNbz-2);
      update_periodic(subNbx-2,subNbx-1,2,subNby-2,2,subNbz-2);
      MPI_Waitall(numrequests,requests,statuses);
      
      // z-direction exchange.  The buffers start at x and y index 0.
      for(i=0; i<numrequests; i++)
        requests[i]=MPI_REQUEST_NULL;
      MPI_Isend(&feq[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&requests[0]);
      MPI_Irecv(&feq[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&requests[1]);
      MPI_Isend(&feq[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&requests[2]);
      MPI_Irecv(&feq[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&requests[3]);
      if(typeLB == 2){
        MPI_Isend(&feqn[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[4]);
        MPI_Irecv(&feqn[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[5]);
        MPI_Isend(&feqn[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[6]);
        MPI_Irecv(&feqn[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[7]);
      } 
      // Meanwhile handle the two y slabs j=1 and j=subNby-2.
      update_periodic(1,subNbx-1,1,2,2,subNbz-2);
      update_periodic(1,subNbx-1,subNby-2,subNby-1,2,subNbz-2);
      MPI_Waitall(numrequests,requests,statuses);
 
      // Remaining z slabs k=1 and k=subNbz-2.  With typeLB==1 they go through
      // update_periodic() like everything else.  With typeLB==2 the processors
      // at the bottom (myloc[2]==0) and top (myloc[2]==procgrid[2]-1) of the
      // processor grid use the explicit formulas below instead.
      if(typeLB==1){
        update_periodic(1,subNbx-1,1,subNby-1,1,2);
        update_periodic(1,subNbx-1,1,subNby-1,subNbz-2,subNbz-1);
      }else if(typeLB==2){
        if(comm->myloc[2]==0){
      	 for(i=1; i<subNbx-1; i++){
      	   for(j=1;j<subNby-1;j++){
      	     k=1;
      	     // First pass: fnew is built from feq and f_lb at the site
      	     // (i,j,k)-e[m], with the (f_lb-feq) part scaled by
      	     // expminusdtovertau.
      	     for(m=0; m<15; m++){
      	       imod = i-e[m][0];
      	       jmod = j-e[m][1];
      	       kmod = k-e[m][2];
 	     
      	       fnew[i][j][k][m] = feq[imod][jmod][kmod][m] + (f_lb[imod][jmod][kmod][m]-feq[imod][jmod][kmod][m])*expminusdtovertau;
      	     }
 
      	     // Second pass: add the Dcoeff and (0.5-Dcoeff*(tau+0.5)) terms.
      	     // Populations 5,7,8,9,10 and 6,11,12,13,14 are special-cased and
      	     // paired as (5,6),(7,11),(8,12),(9,13),(10,14); all other m use
      	     // the generic expression in the final else branch.
      	     // NOTE(review): the pairing presumably links each direction with
      	     // its partner across the bottom wall -- confirm against the
      	     // definition of e[][].
      	     for(m=0; m<15; m++){
      	       imod = i-e[m][0];
      	       jmod = j-e[m][1];
      	       kmod = k-e[m][2];
 	       imodm = i+e[m][0];
 	       jmodm = j+e[m][1];
 	       kmodm = k+e[m][2];
 	       
      	       // m in {5,7,8,9,10}: uses the partner population's feq, feqoldn
      	       // and feqn at the site (i,j,k)-e[m].
      	       if(m==5)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][6] - feqold[imod][jmod][kmod][m]) + 
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][6] - feqn[imod][jmod][kmod][6]);
      	       else if(m==7)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][11] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][11] - feqn[imod][jmod][kmod][11]);
      	       else if(m==8)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][12] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][12] - feqn[imod][jmod][kmod][12]);
      	       else if(m==9)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][13] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][13] - feqn[imod][jmod][kmod][13]);
      	       else if(m==10)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][14] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][14] - feqn[imod][jmod][kmod][14]);
 	       // m in {6,11,12,13,14}: same shape as the generic branch, except
 	       // that feqn at (i,j,k)+e[m] is replaced by the partner
 	       // population's feqn at (i,j,k).
 	       else if(m==6)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m]-feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][5] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	       else if(m==11)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m]-feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][7] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	
 	       else if(m==12)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m]-feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][8] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	 
 	       else if(m==13)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m]-feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][9] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	 
 	       else if(m==14)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m]-feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][10] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	
      	       // Generic case (m = 0..4).
      	       else
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) + 
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqn[imodm][jmodm][kmodm][m] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 
      	     }
      	   }
      	 }      
        }else{
      	 update_periodic(1,subNbx-1,1,subNby-1,1,2);
        }
        if(comm->myloc[2]==comm->procgrid[2]-1){
      	 for(i=1;i<subNbx-1;i++){
      	   for(j=1;j<subNby-1;j++){
      	     k=subNbz-2;
      	     // First pass: identical to the one used for the k=1 layer.
      	     for(m=0; m<15; m++){
      	       imod = i-e[m][0];
      	       jmod = j-e[m][1];
      	       kmod = k-e[m][2];
 
      	       fnew[i][j][k][m] = feq[imod][jmod][kmod][m] + (f_lb[imod][jmod][kmod][m]-feq[imod][jmod][kmod][m])*expminusdtovertau;
      	     }	    
      	     // Second pass: mirror image of the k=1 treatment.  Here the
      	     // populations 6,11,12,13,14 take the partner-based expression
      	     // evaluated at (i,j,k)-e[m], and 5,7,8,9,10 take the expression
      	     // that uses the partner's feqn at (i,j,k).
      	     for(m=0; m<15; m++){
      	       imod = i-e[m][0];
      	       jmod = j-e[m][1];
      	       kmod = k-e[m][2];
 	       imodm = i+e[m][0];
 	       jmodm = j+e[m][1];
 	       kmodm = k+e[m][2];
 	       
      	       if(m==6)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][5] - feqold[imod][jmod][kmod][m]) + 
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][5] - feqn[imod][jmod][kmod][5]);
      	       else if(m==11)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][7] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][7] - feqn[imod][jmod][kmod][7]);
      	       else if(m==12)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][8] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][8] - feqn[imod][jmod][kmod][8]);
      	       else if(m==13)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][9] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][9] - feqn[imod][jmod][kmod][9]);
      	       else if(m==14)
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][10] - feqold[imod][jmod][kmod][m]) +
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][10] - feqn[imod][jmod][kmod][10]);
 	       else if(m==5)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][6] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	       else if(m==7)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][11] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	   
 	       else if(m==8)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][12] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	
  	       else if(m==9)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][13] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	   
 	       else if(m==10)
 		 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		   (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][14] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	
      	       else
      	       	 fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) + 
      	       	   (0.5-Dcoeff*(tau+0.5))*(feqn[imodm][jmodm][kmodm][m] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	       
      	     }
      	   }
      	 }
        }
        else{
      	 update_periodic(1,subNbx-1,1,subNby-1,subNbz-2,subNbz-1);
        }
      }
      
      // Exchange the z ghost layers of fnew.  Only processors at the bottom
      // and/or top of the processor grid in z post these messages; the
      // requests are pre-set to MPI_REQUEST_NULL for everyone.
      req_send15=MPI_REQUEST_NULL;
      req_recv25=MPI_REQUEST_NULL;
      req_send25=MPI_REQUEST_NULL;
      req_recv15=MPI_REQUEST_NULL;
      
      if(comm->myloc[2]==0){
        MPI_Isend(&fnew[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&req_send15);
        MPI_Irecv(&fnew[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&req_recv25);
      }
      
      if(comm->myloc[2]==comm->procgrid[2]-1){
        MPI_Isend(&fnew[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&req_send25);
        MPI_Irecv(&fnew[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&req_recv15);
      }
      // Bottom of the processor grid: once the ghost layer k=0 has arrived,
      // rebuild populations 5 and 7..10 on the k=1 layer.
      if(comm->myloc[2]==0){
        MPI_Wait(&req_send15,&status);
        MPI_Wait(&req_recv25,&status);
        
        for(i=1;i<subNbx-1;i++){
 	 for(j=1;j<subNby-1;j++){
 	   k=1;
 	   // Population 5 is taken from population 6 of the ghost layer k-1,
 	   // and tmp1 accumulates ghost-layer populations 11..14.  For
 	   // typeLB != 1, feqn contributions from layer k+1 are added.
 	   if(typeLB == 1){
 	     fnew[i][j][k][5]=fnew[i][j][k-1][6];
 	     tmp1=fnew[i][j][k-1][11]+fnew[i][j][k-1][12]+fnew[i][j][k-1][13]+fnew[i][j][k-1][14]; 
 	   }
 	   else{
 	     fnew[i][j][k][5]=fnew[i][j][k-1][6] + (0.5-Dcoeff*(tau+0.5))*feqn[i][j][k+1][5];
 	     tmp1=fnew[i][j][k-1][11]+fnew[i][j][k-1][12]+fnew[i][j][k-1][13]+fnew[i][j][k-1][14] + 
 	       (0.5-Dcoeff*(tau+0.5))*(feqn[i-1][j-1][k+1][7] + feqn[i+1][j-1][k+1][8] + 
 				       feqn[i+1][j+1][k+1][9] + feqn[i-1][j+1][k+1][10]);
 	   }
 
 	   // Populations 7..10 are reconstructed from populations 1..4 and
 	   // 11..14 at this node together with tmp1.
 	   fnew[i][j][k][7]=-0.25*(fnew[i][j][k][1]+fnew[i][j][k][2]-fnew[i][j][k][3]-
 	   			   fnew[i][j][k][4]+2.0*fnew[i][j][k][11]-2.0*fnew[i][j][k][13]-tmp1);
 	   fnew[i][j][k][8]=0.25*(fnew[i][j][k][1]-fnew[i][j][k][2]-fnew[i][j][k][3]+
 	   			  fnew[i][j][k][4]+2.0*fnew[i][j][k][14]-2.0*fnew[i][j][k][12]+tmp1);
 	   fnew[i][j][k][9]=0.25*(fnew[i][j][k][1]+fnew[i][j][k][2]-fnew[i][j][k][3]-
 	   			  fnew[i][j][k][4]+2.0*fnew[i][j][k][11]-2.0*fnew[i][j][k][13]+tmp1);
 	   fnew[i][j][k][10]=-0.25*(fnew[i][j][k][1]-fnew[i][j][k][2]-fnew[i][j][k][3]+
 	   			    fnew[i][j][k][4]+2.0*fnew[i][j][k][14]-2.0*fnew[i][j][k][12]-tmp1);
 
 
 
 	   // rb sums populations 0..6 and 11..14 at this node plus tmp1, which
 	   // stands in for the sum of populations 7..10.
 	   rb=fnew[i][j][k][0]+fnew[i][j][k][1]+fnew[i][j][k][2]+fnew[i][j][k][3]+fnew[i][j][k][4]+
 	     fnew[i][j][k][5]+fnew[i][j][k][6]+tmp1+fnew[i][j][k][11]+fnew[i][j][k][12]+
 	     fnew[i][j][k][13]+fnew[i][j][k][14];
 	   
 	   // Shift populations 7,8 by +0.25*rb*vwbt and 9,10 by -0.25*rb*vwbt.
 	   // NOTE(review): vwbt is presumably the velocity of the bottom
 	   // wall -- confirm where it is set.
 	   fnew[i][j][k][7] += 0.25*rb*vwbt;
 	   fnew[i][j][k][8] += 0.25*rb*vwbt;
 	   fnew[i][j][k][9] += -0.25*rb*vwbt;
 	   fnew[i][j][k][10] += -0.25*rb*vwbt;
 	 }
        }
 
      }
      // Top of the processor grid: once the ghost layer k=subNbz-1 has
      // arrived, rebuild populations 6 and 11..14 on the k=subNbz-2 layer.
      if(comm->myloc[2]==comm->procgrid[2]-1){
        MPI_Wait(&req_send25,&status);
        MPI_Wait(&req_recv15,&status);
        
        for(i=1;i<subNbx-1;i++){
 	 for(j=1;j<subNby-1;j++){
 	   k=subNbz-2;
 	   
 	   // Population 6 is taken from population 5 of the ghost layer k+1,
 	   // and tmp1 accumulates ghost-layer populations 7..10.  For
 	   // typeLB != 1, feqn contributions from layer k-1 are added.
 	   if(typeLB == 1){
 	     fnew[i][j][k][6]=fnew[i][j][k+1][5];
 	     tmp1=fnew[i][j][k+1][7]+fnew[i][j][k+1][8]+fnew[i][j][k+1][9]+fnew[i][j][k+1][10];
 	   }
 	   else{
 	     fnew[i][j][k][6]=fnew[i][j][k+1][5] + (0.5-Dcoeff*(tau+0.5))*feqn[i][j][k-1][6];
 	     tmp1=fnew[i][j][k+1][7]+fnew[i][j][k+1][8]+fnew[i][j][k+1][9]+fnew[i][j][k+1][10] + 
 	       (0.5-Dcoeff*(tau+0.5))*(feqn[i-1][j-1][k-1][11] + feqn[i+1][j-1][k-1][12] + 
 				       feqn[i+1][j+1][k-1][13] + feqn[i-1][j+1][k-1][14]);
 	   }
   
 	   // Populations 11..14 are reconstructed from populations 1..4 and
 	   // 7..10 at this node together with tmp1.
 	   fnew[i][j][k][11]=-0.25*(fnew[i][j][k][1]+fnew[i][j][k][2]-fnew[i][j][k][3]-
 	   			    fnew[i][j][k][4]+2.0*fnew[i][j][k][7]-2.0*fnew[i][j][k][9]-tmp1);
 	   fnew[i][j][k][12]=0.25*(fnew[i][j][k][1]-fnew[i][j][k][2]-fnew[i][j][k][3]+
 	   			   fnew[i][j][k][4]-2.0*fnew[i][j][k][8]+2.0*fnew[i][j][k][10]+tmp1);
 	   fnew[i][j][k][13]=0.25*(fnew[i][j][k][1]+fnew[i][j][k][2]-fnew[i][j][k][3]-
 	   			   fnew[i][j][k][4]+2.0*fnew[i][j][k][7]-2.0*fnew[i][j][k][9]+tmp1);
 	   fnew[i][j][k][14]=-0.25*(fnew[i][j][k][1]-fnew[i][j][k][2]-fnew[i][j][k][3]+
 	   			    fnew[i][j][k][4]-2.0*fnew[i][j][k][8]+2.0*fnew[i][j][k][10]-tmp1);
 
 	   
 	   // rb sums populations 0..10 at this node plus tmp1, which stands in
 	   // for the sum of populations 11..14.
 	   rb=fnew[i][j][k][0]+fnew[i][j][k][1]+fnew[i][j][k][2]+fnew[i][j][k][3]+fnew[i][j][k][4]+
 	     fnew[i][j][k][5]+fnew[i][j][k][6]+fnew[i][j][k][7]+fnew[i][j][k][8]+fnew[i][j][k][9]+
 	     fnew[i][j][k][10]+tmp1;
 	   
 	   // Shift populations 11,12 by +0.25*rb*vwtp and 13,14 by -0.25*rb*vwtp.
 	   // NOTE(review): vwtp is presumably the velocity of the top wall --
 	   // confirm where it is set.
 	   fnew[i][j][k][11] += 0.25*rb*vwtp;
 	   fnew[i][j][k][12] += 0.25*rb*vwtp;
 	   fnew[i][j][k][13] += -0.25*rb*vwtp;
 	   fnew[i][j][k][14] += -0.25*rb*vwtp;
 	 }
        }	
      }
      
      //--------------------------------------------------------------------------
      // Periodic z boundary conditions.
      //--------------------------------------------------------------------------
    }else {
 
      // Same three-stage (x, y, z) exchange as in the fixed-z branch, each
      // overlapped with update_periodic() on the corresponding sub-range.
      for(i=0; i<numrequests; i++)
        requests[i]=MPI_REQUEST_NULL;
      MPI_Isend(&feq[1][1][1][0],1,passxf,comm->procneigh[0][0],15,world,&requests[0]);
      MPI_Irecv(&feq[0][1][1][0],1,passxf,comm->procneigh[0][0],25,world,&requests[1]);
      MPI_Isend(&feq[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],25,world,&requests[2]);
      MPI_Irecv(&feq[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],15,world,&requests[3]);
      if(typeLB == 2){
        MPI_Isend(&feqn[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[4]);
        MPI_Irecv(&feqn[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[5]);
        MPI_Isend(&feqn[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[6]);
        MPI_Irecv(&feqn[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[7]);
      }
      update_periodic(2,subNbx-2,2,subNby-2,2,subNbz-2);
      MPI_Waitall(numrequests,requests,statuses);
 
      for(i=0; i<numrequests; i++)
        requests[i]=MPI_REQUEST_NULL;
      MPI_Isend(&feq[0][1][1][0],1,passyf,comm->procneigh[1][0],15,world,&requests[0]);
      MPI_Irecv(&feq[0][0][1][0],1,passyf,comm->procneigh[1][0],25,world,&requests[1]);   
      MPI_Isend(&feq[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],25,world,&requests[2]);
      MPI_Irecv(&feq[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],15,world,&requests[3]);
      if(typeLB == 2){
        MPI_Isend(&feqn[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[4]);
        MPI_Irecv(&feqn[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[5]);   
        MPI_Isend(&feqn[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[6]);
        MPI_Irecv(&feqn[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[7]);
      }
      update_periodic(1,2,2,subNby-2,2,subNbz-2);
      update_periodic(subNbx-2,subNbx-1,2,subNby-2,2,subNbz-2);
      MPI_Waitall(numrequests,requests,statuses);
 
      for(i=0; i<numrequests; i++)
        requests[i]=MPI_REQUEST_NULL;
      MPI_Isend(&feq[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&requests[0]);
      MPI_Irecv(&feq[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&requests[1]);
      MPI_Isend(&feq[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&requests[2]);
      MPI_Irecv(&feq[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&requests[3]);
      if(typeLB == 2){
        MPI_Isend(&feqn[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[4]);
        MPI_Irecv(&feqn[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[5]);
        MPI_Isend(&feqn[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[6]);
        MPI_Irecv(&feqn[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[7]);
      }  
      update_periodic(1,subNbx-1,1,2,2,subNbz-2);
      update_periodic(1,subNbx-1,subNby-2,subNby-1,2,subNbz-2);
      MPI_Waitall(numrequests,requests,statuses);
      
      // With periodic z there is no wall treatment: the two outer z slabs are
      // handled by update_periodic() for every typeLB.
      update_periodic(1,subNbx-1,1,subNby-1,1,2);
      update_periodic(1,subNbx-1,1,subNby-1,subNbz-2,subNbz-1);
    }
  
 }
 
 //==========================================================================
 // Update the distribution functions over the entire simulation domain for
 // the D3Q19 model.
 //==========================================================================
 void FixLbFluid::update_full19(void)
 {
   
   MPI_Request req_send15,req_recv15;
   MPI_Request req_send25,req_recv25;
   MPI_Request requests[8];
   MPI_Status statuses[8];
   int numrequests;
   double tmp1,tmp2,tmp3;
   MPI_Status status;
   double rb;
   int i,j,k,m;
   int imod,jmod,kmod;
   int imodm,jmodm,kmodm;
   
   //--------------------------------------------------------------------------
   // If using the standard LB integrator, do not need to send info about feqn.
   //--------------------------------------------------------------------------
   if(typeLB == 1){
     numrequests = 4;
   }else{
     numrequests = 8;
   }
   
   //--------------------------------------------------------------------------
   // Fixed z boundary conditions.
   //--------------------------------------------------------------------------
   if(domain->periodicity[2]==0){  
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[1][1][1][0],1,passxf,comm->procneigh[0][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][1][1][0],1,passxf,comm->procneigh[0][0],25,world,&requests[1]);
     MPI_Isend(&feq[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],25,world,&requests[2]);
     MPI_Irecv(&feq[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[5]);
       MPI_Isend(&feqn[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[7]);
     }
     update_periodic(2,subNbx-2,2,subNby-2,2,subNbz-2);
     MPI_Waitall(numrequests,requests,statuses);
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[0][1][1][0],1,passyf,comm->procneigh[1][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][0][1][0],1,passyf,comm->procneigh[1][0],25,world,&requests[1]);   
     MPI_Isend(&feq[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],25,world,&requests[2]);
     MPI_Irecv(&feq[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[5]);   
       MPI_Isend(&feqn[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[7]);
     }
     update_periodic(1,2,2,subNby-2,2,subNbz-2);
     update_periodic(subNbx-2,subNbx-1,2,subNby-2,2,subNbz-2);
     MPI_Waitall(numrequests,requests,statuses);
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&requests[1]);
     MPI_Isend(&feq[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&requests[2]);
     MPI_Irecv(&feq[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[5]);
       MPI_Isend(&feqn[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[7]);
     } 
     update_periodic(1,subNbx-1,1,2,2,subNbz-2);
     update_periodic(1,subNbx-1,subNby-2,subNby-1,2,subNbz-2);
     MPI_Waitall(numrequests,requests,statuses);
     
     if(typeLB==1){
       update_periodic(1,subNbx-1,1,subNby-1,1,2);
       update_periodic(1,subNbx-1,1,subNby-1,subNbz-2,subNbz-1);
     }else if(typeLB==2){
       if(comm->myloc[2]==0){
 	for(i=1; i<subNbx-1; i++){
 	  for(j=1;j<subNby-1;j++){
 	    k=1;
 	    for(m=0; m<19; m++){
 	      imod = i-e[m][0];
 	      jmod = j-e[m][1];
 	      kmod = k-e[m][2];
 	      
 	      fnew[i][j][k][m] = feq[imod][jmod][kmod][m] + (f_lb[imod][jmod][kmod][m]-feq[imod][jmod][kmod][m])*expminusdtovertau;
 	    }
 	    
 	    for(m=0; m<19; m++){
 	      imod = i-e[m][0];
 	      jmod = j-e[m][1];
 	      kmod = k-e[m][2];
 	      imodm = i+e[m][0];
 	      jmodm = j+e[m][1];
 	      kmodm = k+e[m][2];
 	      
 	      if(m==5)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][6] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][6] - feqn[imod][jmod][kmod][6]);
 	      else if(m==11)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][12] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][12] - feqn[imod][jmod][kmod][12]);
 	      else if(m==13)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][14] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][14] - feqn[imod][jmod][kmod][14]);
 	      else if(m==15)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][16] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][16] - feqn[imod][jmod][kmod][16]);
 	      else if(m==17)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][18] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][18] - feqn[imod][jmod][kmod][18]);
 	      else if(m==6)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][5] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	      else if(m==12)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][11] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	      else if(m==14)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][13] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	      else if(m==16)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][15] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	      else if(m==18)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][17] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	      else
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[imodm][jmodm][kmodm][m] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	    }
 	  }
 	}      
       }else{
 	update_periodic(1,subNbx-1,1,subNby-1,1,2);
       }
       if(comm->myloc[2]==comm->procgrid[2]-1){
 	for(i=1;i<subNbx-1;i++){
 	  for(j=1;j<subNby-1;j++){
 	    k=subNbz-2;
 	    for(m=0; m<19; m++){
 	      imod = i-e[m][0];
 	      jmod = j-e[m][1];
 	      kmod = k-e[m][2];
 	      
 	      fnew[i][j][k][m] = feq[imod][jmod][kmod][m] + (f_lb[imod][jmod][kmod][m]-feq[imod][jmod][kmod][m])*expminusdtovertau;
 	    }	    
 	    for(m=0; m<19; m++){
 	      imod = i-e[m][0];
 	      jmod = j-e[m][1];
 	      kmod = k-e[m][2];
 	      imodm = i+e[m][0];
 	      jmodm = j+e[m][1];
 	      kmodm = k+e[m][2];
 	      
 	      if(m==6)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][5] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][5] - feqn[imod][jmod][kmod][5]);
 	      else if(m==12)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][11] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][11] - feqn[imod][jmod][kmod][11]);
 	      else if(m==14)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][13] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][13] - feqn[imod][jmod][kmod][13]);
 	      else if(m==16)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][15] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][15] - feqn[imod][jmod][kmod][15]);
 	      else if(m==18)
 		fnew[i][j][k][m] += Dcoeff*(feq[imod][jmod][kmod][17] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqoldn[imod][jmod][kmod][m] - feqoldn[imod][jmod][kmod][17] - feqn[imod][jmod][kmod][17]);
 	      else if(m==5)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][6] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	  
 	      else if(m==11)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][12] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	  
 	      else if(m==13)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][14] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	
 	      else if(m==15)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][16] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);
 	      else if(m==17)
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) +
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[i][j][k][18] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	  	  
 	      else
 		fnew[i][j][k][m] += Dcoeff*(feq[i][j][k][m] - feqold[imod][jmod][kmod][m]) + 
 		  (0.5-Dcoeff*(tau+0.5))*(feqn[imodm][jmodm][kmodm][m] - feqoldn[i][j][k][m] - feqn[i][j][k][m] + feqoldn[imod][jmod][kmod][m]);	       
 	    }
 	  }
 	}
       }
       else{
 	update_periodic(1,subNbx-1,1,subNby-1,subNbz-2,subNbz-1);
       }
     }
     
     req_send15=MPI_REQUEST_NULL;
     req_recv25=MPI_REQUEST_NULL;
     req_send25=MPI_REQUEST_NULL;
     req_recv15=MPI_REQUEST_NULL;
     
     if(comm->myloc[2]==0){
       MPI_Isend(&fnew[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&req_send15);
       MPI_Irecv(&fnew[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&req_recv25);
     }
     
     if(comm->myloc[2]==comm->procgrid[2]-1){
       MPI_Isend(&fnew[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&req_send25);
       MPI_Irecv(&fnew[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&req_recv15);
     }
     if(comm->myloc[2]==0){
       MPI_Wait(&req_send15,&status);
       MPI_Wait(&req_recv25,&status);
       
       for(i=1;i<subNbx-1;i++){
 	for(j=1;j<subNby-1;j++){
 	  k=1;
 	  
 	  if(typeLB == 1){
 	    fnew[i][j][k][5]=fnew[i][j][k-1][6];
 	    tmp1=fnew[i][j][k-1][12]+fnew[i][j][k-1][14]+fnew[i][j][k-1][16]+fnew[i][j][k-1][18];
 	  }
 	  else{
 	    fnew[i][j][k][5]=fnew[i][j][k-1][6] + (0.5-Dcoeff*(tau+0.5))*feqn[i][j][k+1][5];
 	    tmp1=fnew[i][j][k-1][12]+fnew[i][j][k-1][14]+fnew[i][j][k-1][16]+fnew[i][j][k-1][18] +
 	      (0.5-Dcoeff*(tau+0.5))*(feqn[i-1][j][k+1][11] + feqn[i+1][j][k+1][13] + 
 				      feqn[i][j-1][k+1][15] + feqn[i][j+1][k+1][17]);	     
 	  }
 	  
 	  tmp2=fnew[i][j][k][3]+fnew[i][j][k][9]+fnew[i][j][k][10]+fnew[i][j][k][14]-
 	    fnew[i][j][k][1]-fnew[i][j][k][7]-fnew[i][j][k][8]-fnew[i][j][k][12];
 	  
 	  rb=fnew[i][j][k][0]+fnew[i][j][k][1]+fnew[i][j][k][2]+fnew[i][j][k][3]+fnew[i][j][k][4]+
 	    fnew[i][j][k][5]+fnew[i][j][k][6]+fnew[i][j][k][7]+fnew[i][j][k][8]+fnew[i][j][k][9]+
 	    fnew[i][j][k][10]+fnew[i][j][k][12]+fnew[i][j][k][14]+fnew[i][j][k][16]+fnew[i][j][k][18]+tmp1;
 	  
 	  tmp3=rb*vwbt-fnew[i][j][k][2]+fnew[i][j][k][4]-fnew[i][j][k][7]+fnew[i][j][k][8]-fnew[i][j][k][9]+
 	    fnew[i][j][k][10]-fnew[i][j][k][16]+fnew[i][j][k][18];
 	  
 	  fnew[i][j][k][11] = 0.25*(tmp1+2.0*tmp2);
 	  fnew[i][j][k][13] = 0.25*(tmp1-2.0*tmp2);
 	  fnew[i][j][k][15] = 0.25*(tmp1+2.0*tmp3);
 	  fnew[i][j][k][17] = 0.25*(tmp1-2.0*tmp3);
 	}
       }
       
     }
     if(comm->myloc[2]==comm->procgrid[2]-1){
       MPI_Wait(&req_send25,&status);
       MPI_Wait(&req_recv15,&status);
       
       for(i=1;i<subNbx-1;i++){
 	for(j=1;j<subNby-1;j++){
 	  k=subNbz-2;
 	  
 	  if(typeLB == 1){
 	    fnew[i][j][k][6]=fnew[i][j][k+1][5];
 	    tmp1=fnew[i][j][k+1][11]+fnew[i][j][k+1][13]+fnew[i][j][k+1][15]+fnew[i][j][k+1][17];
 	  }
 	  else{
 	    fnew[i][j][k][6]=fnew[i][j][k+1][5] + (0.5-Dcoeff*(tau+0.5))*feqn[i][j][k-1][6];
 	    tmp1=fnew[i][j][k+1][11]+fnew[i][j][k+1][13]+fnew[i][j][k+1][15]+fnew[i][j][k+1][17] +
 	      (0.5-Dcoeff*(tau+0.5))*(feqn[i-1][j][k-1][12] + feqn[i+1][j][k-1][14] + 
 				      feqn[i][j-1][k-1][16] + feqn[i][j+1][k-1][18]);
 	  }
 	  
 	  tmp2=fnew[i][j][k][3]+fnew[i][j][k][9]+fnew[i][j][k][10]+fnew[i][j][k][13]-fnew[i][j][k][1]-
 	    fnew[i][j][k][7]-fnew[i][j][k][8]-fnew[i][j][k][11];
 	  
 	  rb=fnew[i][j][k][0]+fnew[i][j][k][1]+fnew[i][j][k][2]+fnew[i][j][k][3]+fnew[i][j][k][4]+
 	    fnew[i][j][k][5]+fnew[i][j][k][6]+fnew[i][j][k][7]+fnew[i][j][k][8]+fnew[i][j][k][9]+
 	    fnew[i][j][k][10]+fnew[i][j][k][11]+fnew[i][j][k][13]+fnew[i][j][k][15]+fnew[i][j][k][17]+tmp1;
 	  
 	  tmp3=rb*vwtp-fnew[i][j][k][2]+fnew[i][j][k][4]-fnew[i][j][k][7]+fnew[i][j][k][8]-fnew[i][j][k][9]+
 	    fnew[i][j][k][10]-fnew[i][j][k][15]+fnew[i][j][k][17];
 	  
 	  fnew[i][j][k][12] = 0.25*(tmp1+2.0*tmp2);
 	  fnew[i][j][k][14] = 0.25*(tmp1-2.0*tmp2);
 	  fnew[i][j][k][16] = 0.25*(tmp1+2.0*tmp3);
 	  fnew[i][j][k][18] = 0.25*(tmp1-2.0*tmp3);
 	}
       }	
     }
     
     //--------------------------------------------------------------------------
     // Periodic z boundary conditions.
     //--------------------------------------------------------------------------
   }else {
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[1][1][1][0],1,passxf,comm->procneigh[0][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][1][1][0],1,passxf,comm->procneigh[0][0],25,world,&requests[1]);
     MPI_Isend(&feq[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],25,world,&requests[2]);
     MPI_Irecv(&feq[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[1][1][1][0],1,passxf,comm->procneigh[0][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][1][1][0],1,passxf,comm->procneigh[0][0],20,world,&requests[5]);
       MPI_Isend(&feqn[subNbx-2][1][1][0],1,passxf,comm->procneigh[0][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[subNbx-1][1][1][0],1,passxf,comm->procneigh[0][1],10,world,&requests[7]);
     }
     update_periodic(2,subNbx-2,2,subNby-2,2,subNbz-2);
     MPI_Waitall(numrequests,requests,statuses);
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[0][1][1][0],1,passyf,comm->procneigh[1][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][0][1][0],1,passyf,comm->procneigh[1][0],25,world,&requests[1]);   
     MPI_Isend(&feq[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],25,world,&requests[2]);
     MPI_Irecv(&feq[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[0][1][1][0],1,passyf,comm->procneigh[1][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][0][1][0],1,passyf,comm->procneigh[1][0],20,world,&requests[5]);   
       MPI_Isend(&feqn[0][subNby-2][1][0],1,passyf,comm->procneigh[1][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[0][subNby-1][1][0],1,passyf,comm->procneigh[1][1],10,world,&requests[7]);
     }
     update_periodic(1,2,2,subNby-2,2,subNbz-2);
     update_periodic(subNbx-2,subNbx-1,2,subNby-2,2,subNbz-2);
     MPI_Waitall(numrequests,requests,statuses);
     
     for(i=0; i<numrequests; i++)
       requests[i]=MPI_REQUEST_NULL;
     MPI_Isend(&feq[0][0][1][0],1,passzf,comm->procneigh[2][0],15,world,&requests[0]);
     MPI_Irecv(&feq[0][0][0][0],1,passzf,comm->procneigh[2][0],25,world,&requests[1]);
     MPI_Isend(&feq[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],25,world,&requests[2]);
     MPI_Irecv(&feq[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],15,world,&requests[3]);
     if(typeLB == 2){
       MPI_Isend(&feqn[0][0][1][0],1,passzf,comm->procneigh[2][0],10,world,&requests[4]);
       MPI_Irecv(&feqn[0][0][0][0],1,passzf,comm->procneigh[2][0],20,world,&requests[5]);
       MPI_Isend(&feqn[0][0][subNbz-2][0],1,passzf,comm->procneigh[2][1],20,world,&requests[6]);
       MPI_Irecv(&feqn[0][0][subNbz-1][0],1,passzf,comm->procneigh[2][1],10,world,&requests[7]);
     }  
     update_periodic(1,subNbx-1,1,2,2,subNbz-2);
     update_periodic(1,subNbx-1,subNby-2,subNby-1,2,subNbz-2);
     MPI_Waitall(numrequests,requests,statuses);
     
     update_periodic(1,subNbx-1,1,subNby-1,1,2);
     update_periodic(1,subNbx-1,1,subNby-1,subNbz-2,subNbz-1);
   }
   
 }
 
 
diff --git a/src/USER-MISC/pair_cdeam.cpp b/src/USER-MISC/pair_cdeam.cpp
index cbb8e23f7..dbfbac823 100644
--- a/src/USER-MISC/pair_cdeam.cpp
+++ b/src/USER-MISC/pair_cdeam.cpp
@@ -1,644 +1,643 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Alexander Stukowski
                         Technical University of Darmstadt,
                         Germany Department of Materials Science
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_cdeam.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 // This is for debugging purposes. The ASSERT() macro is used in the code to check
 // if everything runs as expected. Change this to #if 0 if you don't need the checking.
 #if 0
         #define ASSERT(cond) ((!(cond)) ? my_failure(error,__FILE__,__LINE__) : my_noop())
 
         inline void my_noop() {}
         inline void my_failure(Error* error, const char* file, int line) {
                 char str[1024];
                 sprintf(str,"Assertion failure: File %s, line %i", file, line);
                 error->one(FLERR,str);
         }
 #else
         #define ASSERT(cond)
 #endif
 
 #define MAXLINE 1024        // This sets the maximum line length in EAM input files.
 
 PairCDEAM::PairCDEAM(LAMMPS *lmp, int _cdeamVersion) : PairEAM(lmp), PairEAMAlloy(lmp), cdeamVersion(_cdeamVersion)
 {
         single_enable = 0;
         restartinfo = 0;
 
         rhoB = NULL;
         D_values = NULL;
         hcoeff = NULL;
 
         // Set communication buffer sizes needed by this pair style.
         if(cdeamVersion == 1) {
                 comm_forward = 4;
                 comm_reverse = 3;
         }
         else if(cdeamVersion == 2) {
                 comm_forward = 3;
                 comm_reverse = 2;
         }
         else {
                 error->all(FLERR,"Invalid CD-EAM potential version.");
         }
 }
 
 PairCDEAM::~PairCDEAM()
 {
         memory->destroy(rhoB);
         memory->destroy(D_values);
         if(hcoeff) delete[] hcoeff;
 }
 
 void PairCDEAM::compute(int eflag, int vflag)
 {
         int i,j,ii,jj,inum,jnum,itype,jtype;
         double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
         double rsq,rhoip,rhojp,recip,phi;
         int *ilist,*jlist,*numneigh,**firstneigh;
 
         evdwl = 0.0;
         if (eflag || vflag) ev_setup(eflag,vflag);
         else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
 
         // Grow per-atom arrays if necessary
         if(atom->nmax > nmax) {
                 memory->destroy(rho);
                 memory->destroy(fp);
                 memory->destroy(rhoB);
                 memory->destroy(D_values);
                 nmax = atom->nmax;
                 memory->create(rho,nmax,"pair:rho");
                 memory->create(rhoB,nmax,"pair:rhoB");
                 memory->create(fp,nmax,"pair:fp");
                 memory->create(D_values,nmax,"pair:D_values");
         }
 
         double **x = atom->x;
         double **f = atom->f;
         int *type = atom->type;
         int nlocal = atom->nlocal;
         int newton_pair = force->newton_pair;
 
         inum = list->inum;
         ilist = list->ilist;
         numneigh = list->numneigh;
         firstneigh = list->firstneigh;
 
         // Zero out per-atom arrays.
         int m = nlocal + atom->nghost;
         for(i = 0; i < m; i++) {
                 rho[i] = 0.0;
                 rhoB[i] = 0.0;
                 D_values[i] = 0.0;
         }
 
         // Stage I
 
         // Compute rho and rhoB at each local atom site.
         // Additionally calculate the D_i values here if we are using the one-site formulation.
         // For the two-site formulation we have to calculate the D values in an extra loop (Stage II).
         for(ii = 0; ii < inum; ii++) {
                 i = ilist[ii];
                 xtmp = x[i][0];
                 ytmp = x[i][1];
                 ztmp = x[i][2];
                 itype = type[i];
                 jlist = firstneigh[i];
                 jnum = numneigh[i];
 
                 for(jj = 0; jj < jnum; jj++) {
                         j = jlist[jj];
                         j &= NEIGHMASK;
 
                         delx = xtmp - x[j][0];
                         dely = ytmp - x[j][1];
                         delz = ztmp - x[j][2];
                         rsq = delx*delx + dely*dely + delz*delz;
 
                         if(rsq < cutforcesq) {
                                 jtype = type[j];
                                 double r = sqrt(rsq);
                                 const EAMTableIndex index = radiusToTableIndex(r);
                                 double localrho = RhoOfR(index, jtype, itype);
                                 rho[i] += localrho;
                                 if(jtype == speciesB) rhoB[i] += localrho;
                                 if(newton_pair || j < nlocal) {
                                         localrho = RhoOfR(index, itype, jtype);
                                         rho[j] += localrho;
                                         if(itype == speciesB) rhoB[j] += localrho;
                                 }
 
                                 if(cdeamVersion == 1 && itype != jtype) {
                                         // Note: if the i-j interaction is not concentration dependent (because either
                                         // i or j are not species A or B) then its contribution to D_i and D_j should
                                         // be ignored.
                                         // This if-clause is only required for a ternary.
                                         if((itype == speciesA && jtype == speciesB) || (jtype == speciesA && itype == speciesB)) {
                                                 double Phi_AB = PhiOfR(index, itype, jtype, 1.0 / r);
                                                 D_values[i] += Phi_AB;
                                                 if(newton_pair || j < nlocal)
                                                         D_values[j] += Phi_AB;
                                         }
                                 }
                         }
                 }
         }
 
         // Communicate and sum densities.
         if(newton_pair) {
                 communicationStage = 1;
                 comm->reverse_comm_pair(this);
         }
 
         // fp = derivative of embedding energy at each atom
         // phi = embedding energy at each atom
         for(ii = 0; ii < inum; ii++) {
                 i = ilist[ii];
                 EAMTableIndex index = rhoToTableIndex(rho[i]);
                 fp[i] = FPrimeOfRho(index, type[i]);
                 if(eflag) {
                         phi = FofRho(index, type[i]);
                         if (eflag_global) eng_vdwl += phi;
                         if (eflag_atom) eatom[i] += phi;
                 }
         }
 
         // Communicate derivative of embedding function and densities
         // and D_values (this for one-site formulation only).
         communicationStage = 2;
         comm->forward_comm_pair(this);
 
         // The electron densities may not drop to zero because then the concentration would no longer be defined.
         // But the concentration is not needed anyway if there is no interaction with another atom, which is the case
         // if the electron density is exactly zero. That's why the following lines have been commented out.
         //
         //for(i = 0; i < nlocal + atom->nghost; i++) {
         //        if(rho[i] == 0 && (type[i] == speciesA || type[i] == speciesB))
         //                error->one(FLERR,"CD-EAM potential routine: Detected atom with zero electron density.");
         //}
 
         // Stage II
         // This is only required for the original two-site formulation of the CD-EAM potential.
 
         if(cdeamVersion == 2) {
                 // Compute intermediate value D_i for each atom.
                 for(ii = 0; ii < inum; ii++) {
                         i = ilist[ii];
                         xtmp = x[i][0];
                         ytmp = x[i][1];
                         ztmp = x[i][2];
                         itype = type[i];
                         jlist = firstneigh[i];
                         jnum = numneigh[i];
 
                         // This code line is required for ternary alloys.
                         if(itype != speciesA && itype != speciesB) continue;
 
                         double x_i = rhoB[i] / rho[i];        // Concentration at atom i.
 
                         for(jj = 0; jj < jnum; jj++) {
                                 j = jlist[jj];
                                 j &= NEIGHMASK;
                                 jtype = type[j];
                                 if(itype == jtype) continue;
 
                                 // This code line is required for ternary alloys.
                                 if(jtype != speciesA && jtype != speciesB) continue;
 
                                 delx = xtmp - x[j][0];
                                 dely = ytmp - x[j][1];
                                 delz = ztmp - x[j][2];
                                 rsq = delx*delx + dely*dely + delz*delz;
 
                                 if(rsq < cutforcesq) {
                                         double r = sqrt(rsq);
                                         const EAMTableIndex index = radiusToTableIndex(r);
 
                                         // The concentration independent part of the cross pair potential.
                                         double Phi_AB = PhiOfR(index, itype, jtype, 1.0 / r);
 
                                         // Average concentration of two sites
                                         double x_ij = 0.5 * (x_i + rhoB[j]/rho[j]);
 
                                         // Calculate derivative of h(x_ij) polynomial function.
                                         double h_prime = evalHprime(x_ij);
 
                                         D_values[i] += h_prime * Phi_AB / (2.0 * rho[i] * rho[i]);
                                         if(newton_pair || j < nlocal)
                                                 D_values[j] += h_prime * Phi_AB / (2.0 * rho[j] * rho[j]);
                                 }
                         }
                 }
 
                 // Communicate and sum D values.
                 if(newton_pair) {
                         communicationStage = 3;
                         comm->reverse_comm_pair(this);
                 }
                 communicationStage = 4;
                 comm->forward_comm_pair(this);
         }
 
         // Stage III
 
         // Compute force acting on each atom.
         for(ii = 0; ii < inum; ii++) {
                 i = ilist[ii];
                 xtmp = x[i][0];
                 ytmp = x[i][1];
                 ztmp = x[i][2];
                 itype = type[i];
 
                 jlist = firstneigh[i];
                 jnum = numneigh[i];
 
                 // Concentration at site i
                 double x_i = -1.0;                // The value -1 indicates: no concentration dependence for all interactions of atom i.
                                                                 // It will be replaced by the concentration at site i if atom i is either A or B.
 
                 double D_i, h_prime_i;
 
                 // This if-clause is only required for ternary alloys.
                 if((itype == speciesA || itype == speciesB) && rho[i] != 0.0) {
 
                         // Compute local concentration at site i.
                         x_i = rhoB[i]/rho[i];
                         ASSERT(x_i >= 0 && x_i<=1.0);
 
                         if(cdeamVersion == 1) {
                                 // Calculate derivative of h(x_i) polynomial function.
                                 h_prime_i = evalHprime(x_i);
                                 D_i = D_values[i] * h_prime_i / (2.0 * rho[i] * rho[i]);
-                        }
-                        else if(cdeamVersion == 2) {
+                        } else if(cdeamVersion == 2) {
                                 D_i = D_values[i];
+                        } else {
+                          ASSERT(false);
                         }
-                        else ASSERT(false);
                 }
 
                 for(jj = 0; jj < jnum; jj++) {
                         j = jlist[jj];
                         j &= NEIGHMASK;
 
                         delx = xtmp - x[j][0];
                         dely = ytmp - x[j][1];
                         delz = ztmp - x[j][2];
                         rsq = delx*delx + dely*dely + delz*delz;
 
                         if(rsq < cutforcesq) {
                                 jtype = type[j];
                                 double r = sqrt(rsq);
                                 const EAMTableIndex index = radiusToTableIndex(r);
 
                                 // rhoip = derivative of (density at atom j due to atom i)
                                 // rhojp = derivative of (density at atom i due to atom j)
                                 // psip needs both fp[i] and fp[j] terms since r_ij appears in two
                                 //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
                                 //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
                                 rhoip = RhoPrimeOfR(index, itype, jtype);
                                 rhojp = RhoPrimeOfR(index, jtype, itype);
                                 fpair = fp[i]*rhojp + fp[j]*rhoip;
                                 recip = 1.0/r;
 
                                 double x_j = -1;  // The value -1 indicates: no concentration dependence for this i-j pair
                                                   // because atom j is not of species A nor B.
 
                                 // This code line is required for ternary alloy.
                                 if(jtype == speciesA || jtype == speciesB) {
                                         ASSERT(rho[i] != 0.0);
                                         ASSERT(rho[j] != 0.0);
 
                                         // Compute local concentration at site j.
                                         x_j = rhoB[j]/rho[j];
                                         ASSERT(x_j >= 0 && x_j<=1.0);
 
                                         double D_j=0.0;
                                         if(cdeamVersion == 1) {
                                                 // Calculate derivative of h(x_j) polynomial function.
                                                 double h_prime_j = evalHprime(x_j);
                                                 D_j = D_values[j] * h_prime_j / (2.0 * rho[j] * rho[j]);
-                                        }
-                                        else if(cdeamVersion == 2) {
+                                        } else if(cdeamVersion == 2) {
                                                 D_j = D_values[j];
+                                        } else {
+                                          ASSERT(false);
                                         }
-                                        else ASSERT(false);
-
                                         double t2 = -rhoB[j];
                                         if(itype == speciesB) t2 += rho[j];
                                         fpair += D_j * rhoip * t2;
                                 }
 
                                 // This if-clause is only required for a ternary alloy.
                                 // Actually we don't need it at all because D_i should be zero anyway if
                                 // atom i has no concentration dependent interactions (because it is not species A or B).
                                 if(x_i != -1.0) {
                                         double t1 = -rhoB[i];
                                         if(jtype == speciesB) t1 += rho[i];
                                         fpair += D_i * rhojp * t1;
                                 }
 
                                 double phip;
                                 double phi = PhiOfR(index, itype, jtype, recip, phip);
                                 if(itype == jtype || x_i == -1.0 || x_j == -1.0) {
                                         // Case of no concentration dependence.
                                         fpair += phip;
-                                }
-                                else {
+                                } else {
                                         // We have a concentration dependence for the i-j interaction.
                                         double h=0.0;
                                         if(cdeamVersion == 1) {
                                                 // Calculate h(x_i) polynomial function.
                                                 double h_i = evalH(x_i);
                                                 // Calculate h(x_j) polynomial function.
                                                 double h_j = evalH(x_j);
                                                 h = 0.5 * (h_i + h_j);
-                                        }
-                                        else if(cdeamVersion == 2) {
+                                        } else if(cdeamVersion == 2) {
                                                 // Average concentration.
                                                 double x_ij = 0.5 * (x_i + x_j);
                                                 // Calculate h(x_ij) polynomial function.
                                                 h = evalH(x_ij);
+                                        } else {
+                                          ASSERT(false);
                                         }
-                                        else ASSERT(false);
                                         fpair += h * phip;
                                         phi *= h;
                                 }
 
                                 // Divide by r_ij and negate to get forces from gradient.
                                 fpair /= -r;
 
                                 f[i][0] += delx*fpair;
                                 f[i][1] += dely*fpair;
                                 f[i][2] += delz*fpair;
                                 if(newton_pair || j < nlocal) {
                                         f[j][0] -= delx*fpair;
                                         f[j][1] -= dely*fpair;
                                         f[j][2] -= delz*fpair;
                                 }
 
                                 if(eflag) evdwl = phi;
                                 if(evflag) ev_tally(i,j,nlocal,newton_pair,evdwl,0.0,fpair,delx,dely,delz);
                         }
                 }
         }
 
         if(vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCDEAM::coeff(int narg, char **arg)
 {
         PairEAMAlloy::coeff(narg, arg);
 
         // Make sure the EAM file is a CD-EAM binary alloy.
         if(setfl->nelements < 2)
                 error->all(FLERR,"The EAM file must contain at least 2 elements to be used with the eam/cd pair style.");
 
         // Read in the coefficients of the h polynomial from the end of the EAM file.
         read_h_coeff(arg[2]);
 
         // Determine which atom type is the A species and which is the B species in the alloy.
         // By default take the first element (index 0) in the EAM file as the A species
         // and the second element (index 1) in the EAM file as the B species.
         speciesA = -1;
         speciesB = -1;
         for(int i = 1; i <= atom->ntypes; i++) {
                 if(map[i] == 0) {
                         if(speciesA >= 0)
                                 error->all(FLERR,"The first element from the EAM file may only be mapped to a single atom type.");
                         speciesA = i;
                 }
                 if(map[i] == 1) {
                         if(speciesB >= 0)
                                 error->all(FLERR,"The second element from the EAM file may only be mapped to a single atom type.");
                         speciesB = i;
                 }
         }
         if(speciesA < 0)
                 error->all(FLERR,"The first element from the EAM file must be mapped to exactly one atom type.");
         if(speciesB < 0)
                 error->all(FLERR,"The second element from the EAM file must be mapped to exactly one atom type.");
 }
 
 /* ----------------------------------------------------------------------
    Reads in the h(x) polynomial coefficients
 ------------------------------------------------------------------------- */
 void PairCDEAM::read_h_coeff(char *filename)
 {
         if(comm->me == 0) {
                 // Open potential file
                 FILE *fp;
                 char line[MAXLINE];
                 char nextline[MAXLINE];
                 fp = force->open_potential(filename);
                 if (fp == NULL) {
                         char str[128];
                         sprintf(str,"Cannot open EAM potential file %s", filename);
                         error->one(FLERR,str);
                 }
 
                 // h coefficients are stored at the end of the file.
                 // Skip to last line of file.
                 while(fgets(nextline, MAXLINE, fp) != NULL) {
                         strcpy(line, nextline);
                 }
                 char* ptr = strtok(line, " \t\n\r\f");
                 int degree = atoi(ptr);
                 nhcoeff = degree+1;
                 hcoeff = new double[nhcoeff];
                 int i = 0;
                 while((ptr = strtok(NULL," \t\n\r\f")) != NULL && i < nhcoeff) {
                         hcoeff[i++] = atof(ptr);
                 }
                 if(i != nhcoeff || nhcoeff < 1)
                         error->one(FLERR,"Failed to read h(x) function coefficients from EAM file.");
 
                 // Close the potential file.
                 fclose(fp);
         }
 
         MPI_Bcast(&nhcoeff, 1, MPI_INT, 0, world);
         if(comm->me != 0) hcoeff = new double[nhcoeff];
         MPI_Bcast(hcoeff, nhcoeff, MPI_DOUBLE, 0, world);
 }
 
 
 /* ---------------------------------------------------------------------- */
 
 int PairCDEAM::pack_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
 {
         int i,j,m;
 
         m = 0;
         if(communicationStage == 2) {
                 if(cdeamVersion == 1) {
                         for (i = 0; i < n; i++) {
                                 j = list[i];
                                 buf[m++] = fp[j];
                                 buf[m++] = rho[j];
                                 buf[m++] = rhoB[j];
                                 buf[m++] = D_values[j];
                         }
                         return 4;
                 }
                 else if(cdeamVersion == 2) {
                         for (i = 0; i < n; i++) {
                                 j = list[i];
                                 buf[m++] = fp[j];
                                 buf[m++] = rho[j];
                                 buf[m++] = rhoB[j];
                         }
                         return 3;
                 }
                 else { ASSERT(false); return 0; }
         }
         else if(communicationStage == 4) {
                 for (i = 0; i < n; i++) {
                         j = list[i];
                         buf[m++] = D_values[j];
                 }
                 return 1;
         }
         else return 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCDEAM::unpack_comm(int n, int first, double *buf)
 {
         int i,m,last;
 
         m = 0;
         last = first + n;
         if(communicationStage == 2) {
                 if(cdeamVersion == 1) {
                         for(i = first; i < last; i++) {
                                 fp[i] = buf[m++];
                                 rho[i] = buf[m++];
                                 rhoB[i] = buf[m++];
                                 D_values[i] = buf[m++];
                         }
                 }
                 else if(cdeamVersion == 2) {
                         for(i = first; i < last; i++) {
                                 fp[i] = buf[m++];
                                 rho[i] = buf[m++];
                                 rhoB[i] = buf[m++];
                         }
+                } else {
+                  ASSERT(false);
                 }
-                else ASSERT(false);
         }
         else if(communicationStage == 4) {
                 for(i = first; i < last; i++) {
                         D_values[i] = buf[m++];
                 }
         }
 }
 
 /* ---------------------------------------------------------------------- */
 int PairCDEAM::pack_reverse_comm(int n, int first, double *buf)
 {
         int i,m,last;
 
         m = 0;
         last = first + n;
 
         if(communicationStage == 1) {
                 if(cdeamVersion == 1) {
                         for(i = first; i < last; i++) {
                                 buf[m++] = rho[i];
                                 buf[m++] = rhoB[i];
                                 buf[m++] = D_values[i];
                         }
                         return 3;
                 }
                 else if(cdeamVersion == 2) {
                         for(i = first; i < last; i++) {
                                 buf[m++] = rho[i];
                                 buf[m++] = rhoB[i];
                         }
                         return 2;
                 }
                 else { ASSERT(false); return 0; }
         }
         else if(communicationStage == 3) {
                 for(i = first; i < last; i++) {
                         buf[m++] = D_values[i];
                 }
                 return 1;
         }
         else return 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCDEAM::unpack_reverse_comm(int n, int *list, double *buf)
 {
         int i,j,m;
 
         m = 0;
         if(communicationStage == 1) {
                 if(cdeamVersion == 1) {
                         for(i = 0; i < n; i++) {
                                 j = list[i];
                                 rho[j] += buf[m++];
                                 rhoB[j] += buf[m++];
                                 D_values[j] += buf[m++];
                         }
-                }
-                else if(cdeamVersion == 2) {
+                } else if(cdeamVersion == 2) {
                         for(i = 0; i < n; i++) {
                                 j = list[i];
                                 rho[j] += buf[m++];
                                 rhoB[j] += buf[m++];
                         }
+                } else {
+                  ASSERT(false);
                 }
-                else ASSERT(false);
         }
         else if(communicationStage == 3) {
                 for(i = 0; i < n; i++) {
                         j = list[i];
                         D_values[j] += buf[m++];
                 }
         }
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local atom-based arrays
 ------------------------------------------------------------------------- */
 double PairCDEAM::memory_usage()
 {
         double bytes = 2 * nmax * sizeof(double);
         return PairEAMAlloy::memory_usage() + bytes;
 }
diff --git a/src/USER-MISC/pair_edip.cpp b/src/USER-MISC/pair_edip.cpp
index bb4d67015..e9a83eded 100644
--- a/src/USER-MISC/pair_edip.cpp
+++ b/src/USER-MISC/pair_edip.cpp
@@ -1,1062 +1,1056 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Luca Ferraro (CASPUR)
    email: luca.ferraro@caspur.it
 
    Environment Dependent Interatomic Potential
    References:
     1) J. F. Justo, M. Z. Bazant, E. Kaxiras, V. V. Bulatov, S. Yip
        Phys. Rev. B 58, 2539 (1998)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "float.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_edip.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "force.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define MAXLINE 1024
 #define DELTA 4
 
 #define GRIDDENSITY 8000
 #define GRIDSTART 0.1
 
 // max number of interaction per atom for f(Z) environment potential
 
 #define leadDimInteractionList 64
 
 /* ---------------------------------------------------------------------- */
 
 PairEDIP::PairEDIP(LAMMPS *lmp) : Pair(lmp)
 {
   single_enable = 0;
   restartinfo = 0;
   one_coeff = 1;
   manybody_flag = 1;
 
   nelements = 0;
   elements = NULL;
   nparams = maxparam = 0;
   params = NULL;
   elem2param = NULL;
 }
 
 /* ----------------------------------------------------------------------
    check if allocated, since class can be destructed when incomplete
 ------------------------------------------------------------------------- */
 
 PairEDIP::~PairEDIP()
 {
   if (elements)
     for (int i = 0; i < nelements; i++) delete [] elements[i];
   delete [] elements;
   memory->destroy(params);
   memory->destroy(elem2param);
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
     delete [] map;
 
     deallocateGrids();
     deallocatePreLoops();
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEDIP::compute(int eflag, int vflag)
 {
   int i,j,k,ii,inum,jnum;
-  int itype,jtype,ktype,ijparam,ikparam,ijkparam;
+  int itype,jtype,ktype,ijparam,ikparam;
   double xtmp,ytmp,ztmp,evdwl;
   int *ilist,*jlist,*numneigh,**firstneigh;
   register int preForceCoord_counter;
 
   double invR_ij;
   double invR_ik;
   double directorCos_ij_x;
   double directorCos_ij_y;
   double directorCos_ij_z;
   double directorCos_ik_x;
   double directorCos_ik_y;
   double directorCos_ik_z;
   double cosTeta;
 
   int interpolIDX;
   double interpolTMP;
   double interpolDeltaX;
   double interpolY1;
   double interpolY2;
 
   double invRMinusCutoffA;
   double sigmaInvRMinusCutoffA;
   double gammInvRMinusCutoffA;
   double cosTetaDiff;
   double cosTetaDiffCosTetaDiff;
   double cutoffFunction_ij;
   double exp2B_ij;
   double exp2BDerived_ij;
   double pow2B_ij;
   double pow2BDerived_ij;
   double exp3B_ij;
   double exp3BDerived_ij;
   double exp3B_ik;
   double exp3BDerived_ik;
   double qFunction;
-  double qFunctionDerived;
   double tauFunction;
   double tauFunctionDerived;
   double expMinusBetaZeta_iZeta_i;
   double qFunctionCosTetaDiffCosTetaDiff;
   double expMinusQFunctionCosTetaDiffCosTetaDiff;
   double zeta_i;
   double zeta_iDerived;
   double zeta_iDerivedInvR_ij;
 
   double forceModCoord_factor;
   double forceModCoord;
   double forceModCoord_ij;
   double forceMod2B;
   double forceMod3B_factor1_ij;
   double forceMod3B_factor2_ij;
   double forceMod3B_factor2;
   double forceMod3B_factor1_ik;
   double forceMod3B_factor2_ik;
   double potentia3B_factor;
   double potential2B_factor;
 
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   // loop over full neighbor list of my atoms
 
   for (ii = 0; ii < inum; ii++) {
     zeta_i = 0.0;
     int numForceCoordPairs = 0;
 
     i = ilist[ii];
     itype = map[type[i]];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
 
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     // pre-loop to compute environment coordination f(Z)
 
     for (int neighbor_j = 0; neighbor_j < jnum; neighbor_j++) {
         j = jlist[neighbor_j];
         j &= NEIGHMASK;
 
         double dr_ij[3], r_ij;
 
         dr_ij[0] = xtmp - x[j][0];
         dr_ij[1] = ytmp - x[j][1];
         dr_ij[2] = ztmp - x[j][2];
         r_ij = dr_ij[0]*dr_ij[0] + dr_ij[1]*dr_ij[1] + dr_ij[2]*dr_ij[2];
 
         jtype = map[type[j]];
         ijparam = elem2param[itype][jtype][jtype];
         if (r_ij > params[ijparam].cutsq) continue;
 
         r_ij = sqrt(r_ij);
 
         invR_ij = 1.0 / r_ij;
         preInvR_ij[neighbor_j] = invR_ij;
 
         invRMinusCutoffA =  1.0 / (r_ij - cutoffA);
         sigmaInvRMinusCutoffA = sigma * invRMinusCutoffA;
         gammInvRMinusCutoffA = gamm * invRMinusCutoffA;
 
         interpolDeltaX = r_ij - GRIDSTART;
         interpolTMP = (interpolDeltaX * GRIDDENSITY);
         interpolIDX = (int) interpolTMP;
 
         interpolY1 = exp3B[interpolIDX];
         interpolY2 = exp3B[interpolIDX+1];
         exp3B_ij = interpolY1 + (interpolY2 - interpolY1) *
           (interpolTMP-interpolIDX);
 
         exp3BDerived_ij = - exp3B_ij * gammInvRMinusCutoffA * invRMinusCutoffA;
 
         preExp3B_ij[neighbor_j] = exp3B_ij;
         preExp3BDerived_ij[neighbor_j] = exp3BDerived_ij;
 
         interpolY1 = exp2B[interpolIDX];
         interpolY2 = exp2B[interpolIDX+1];
         exp2B_ij = interpolY1 + (interpolY2 - interpolY1) *
           (interpolTMP-interpolIDX);
 
         exp2BDerived_ij = - exp2B_ij * sigmaInvRMinusCutoffA * invRMinusCutoffA;
 
         preExp2B_ij[neighbor_j] = exp2B_ij;
         preExp2BDerived_ij[neighbor_j] = exp2BDerived_ij;
 
         interpolY1 = pow2B[interpolIDX];
         interpolY2 = pow2B[interpolIDX+1];
         pow2B_ij = interpolY1 + (interpolY2 - interpolY1) *
           (interpolTMP-interpolIDX);
 
         prePow2B_ij[neighbor_j] = pow2B_ij;
 
         // zeta and its derivative
 
         if (r_ij < cutoffC) zeta_i += 1.0;
         else {
             interpolY1 = cutoffFunction[interpolIDX];
             interpolY2 = cutoffFunction[interpolIDX+1];
             cutoffFunction_ij = interpolY1 + (interpolY2 - interpolY1) *
               (interpolTMP-interpolIDX);
 
             zeta_i += cutoffFunction_ij;
 
             interpolY1 = cutoffFunctionDerived[interpolIDX];
             interpolY2 = cutoffFunctionDerived[interpolIDX+1];
             zeta_iDerived = interpolY1 + (interpolY2 - interpolY1) *
               (interpolTMP-interpolIDX);
 
             zeta_iDerivedInvR_ij = zeta_iDerived * invR_ij;
 
             preForceCoord_counter=numForceCoordPairs*5;
             preForceCoord[preForceCoord_counter+0]=zeta_iDerivedInvR_ij;
             preForceCoord[preForceCoord_counter+1]=dr_ij[0];
             preForceCoord[preForceCoord_counter+2]=dr_ij[1];
             preForceCoord[preForceCoord_counter+3]=dr_ij[2];
             preForceCoord[preForceCoord_counter+4]=j;
             numForceCoordPairs++;
         }
     }
 
     // quantities depending on zeta_i
 
     interpolDeltaX = zeta_i;
     interpolTMP = (interpolDeltaX * GRIDDENSITY);
     interpolIDX = (int) interpolTMP;
 
     interpolY1 = expMinusBetaZeta_iZeta_iGrid[interpolIDX];
     interpolY2 = expMinusBetaZeta_iZeta_iGrid[interpolIDX+1];
     expMinusBetaZeta_iZeta_i = interpolY1 + (interpolY2 - interpolY1) *
       (interpolTMP-interpolIDX);
 
     interpolY1 = qFunctionGrid[interpolIDX];
     interpolY2 = qFunctionGrid[interpolIDX+1];
     qFunction = interpolY1 + (interpolY2 - interpolY1) *
       (interpolTMP-interpolIDX);
 
     interpolY1 = tauFunctionGrid[interpolIDX];
     interpolY2 = tauFunctionGrid[interpolIDX+1];
     tauFunction = interpolY1 + (interpolY2 - interpolY1) *
       (interpolTMP-interpolIDX);
 
     interpolY1 = tauFunctionDerivedGrid[interpolIDX];
     interpolY2 = tauFunctionDerivedGrid[interpolIDX+1];
     tauFunctionDerived = interpolY1 + (interpolY2 - interpolY1) *
       (interpolTMP-interpolIDX);
 
-    qFunctionDerived = -mu * qFunction;
-
     forceModCoord_factor = 2.0 * beta * zeta_i * expMinusBetaZeta_iZeta_i;
 
     forceModCoord = 0.0;
 
     // two-body interactions, skip half of them
 
     for (int neighbor_j = 0; neighbor_j < jnum; neighbor_j++) {
       double dr_ij[3], r_ij, f_ij[3];
 
       j = jlist[neighbor_j];
       j &= NEIGHMASK;
 
       dr_ij[0] = x[j][0] - xtmp;
       dr_ij[1] = x[j][1] - ytmp;
       dr_ij[2] = x[j][2] - ztmp;
       r_ij = dr_ij[0]*dr_ij[0] + dr_ij[1]*dr_ij[1] + dr_ij[2]*dr_ij[2];
 
       jtype = map[type[j]];
       ijparam = elem2param[itype][jtype][jtype];
       if (r_ij > params[ijparam].cutsq) continue;
 
       r_ij = sqrt(r_ij);
 
       invR_ij = preInvR_ij[neighbor_j];
       pow2B_ij = prePow2B_ij[neighbor_j];
 
       potential2B_factor = pow2B_ij - expMinusBetaZeta_iZeta_i;
 
       exp2B_ij = preExp2B_ij[neighbor_j];
 
       pow2BDerived_ij = - rho * invR_ij * pow2B_ij;
 
       forceModCoord += (forceModCoord_factor*exp2B_ij);
 
       exp2BDerived_ij = preExp2BDerived_ij[neighbor_j];
       forceMod2B = exp2BDerived_ij * potential2B_factor +
         exp2B_ij * pow2BDerived_ij;
 
       directorCos_ij_x = invR_ij * dr_ij[0];
       directorCos_ij_y = invR_ij * dr_ij[1];
       directorCos_ij_z = invR_ij * dr_ij[2];
 
       exp3B_ij = preExp3B_ij[neighbor_j];
       exp3BDerived_ij = preExp3BDerived_ij[neighbor_j];
 
       f_ij[0] = forceMod2B * directorCos_ij_x;
       f_ij[1] = forceMod2B * directorCos_ij_y;
       f_ij[2] = forceMod2B * directorCos_ij_z;
 
       f[i][0] += f_ij[0];
       f[i][1] += f_ij[1];
       f[i][2] += f_ij[2];
 
       f[j][0] -= f_ij[0];
       f[j][1] -= f_ij[1];
       f[j][2] -= f_ij[2];
 
       // potential energy
 
       evdwl = (exp2B_ij * potential2B_factor);
 
       if (evflag) ev_tally(i, j, nlocal, newton_pair, evdwl, 0.0,
                            -forceMod2B*invR_ij, dr_ij[0], dr_ij[1], dr_ij[2]);
 
       // three-body Forces
 
       for (int neighbor_k = neighbor_j + 1; neighbor_k < jnum; neighbor_k++) {
           double dr_ik[3], r_ik, f_ik[3];
 
           k = jlist[neighbor_k];
           k &= NEIGHMASK;
           ktype = map[type[k]];
           ikparam = elem2param[itype][ktype][ktype];
-          ijkparam = elem2param[itype][jtype][ktype];
 
           dr_ik[0] = x[k][0] - xtmp;
           dr_ik[1] = x[k][1] - ytmp;
           dr_ik[2] = x[k][2] - ztmp;
           r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
 
           if (r_ik > params[ikparam].cutsq) continue;
 
           r_ik = sqrt(r_ik);
 
           invR_ik = preInvR_ij[neighbor_k];
 
           directorCos_ik_x = invR_ik * dr_ik[0];
           directorCos_ik_y = invR_ik * dr_ik[1];
           directorCos_ik_z = invR_ik * dr_ik[2];
 
           cosTeta = directorCos_ij_x * directorCos_ik_x +
             directorCos_ij_y * directorCos_ik_y +
             directorCos_ij_z * directorCos_ik_z;
 
           cosTetaDiff = cosTeta + tauFunction;
           cosTetaDiffCosTetaDiff = cosTetaDiff * cosTetaDiff;
           qFunctionCosTetaDiffCosTetaDiff = cosTetaDiffCosTetaDiff * qFunction;
           expMinusQFunctionCosTetaDiffCosTetaDiff =
             exp(-qFunctionCosTetaDiffCosTetaDiff);
 
           potentia3B_factor = lambda *
             ((1.0 - expMinusQFunctionCosTetaDiffCosTetaDiff) +
              eta * qFunctionCosTetaDiffCosTetaDiff);
 
           exp3B_ik = preExp3B_ij[neighbor_k];
           exp3BDerived_ik = preExp3BDerived_ij[neighbor_k];
 
           forceMod3B_factor1_ij = - exp3BDerived_ij * exp3B_ik *
             potentia3B_factor;
           forceMod3B_factor2 = 2.0 * lambda * exp3B_ij * exp3B_ik *
             qFunction * cosTetaDiff *
             (eta + expMinusQFunctionCosTetaDiffCosTetaDiff);
           forceMod3B_factor2_ij = forceMod3B_factor2 * invR_ij;
 
           f_ij[0] = forceMod3B_factor1_ij * directorCos_ij_x +
             forceMod3B_factor2_ij *
             (cosTeta * directorCos_ij_x - directorCos_ik_x);
           f_ij[1] = forceMod3B_factor1_ij * directorCos_ij_y +
             forceMod3B_factor2_ij *
             (cosTeta * directorCos_ij_y - directorCos_ik_y);
           f_ij[2] = forceMod3B_factor1_ij * directorCos_ij_z +
             forceMod3B_factor2_ij *
             (cosTeta * directorCos_ij_z - directorCos_ik_z);
 
           forceMod3B_factor1_ik = - exp3BDerived_ik * exp3B_ij *
             potentia3B_factor;
           forceMod3B_factor2_ik = forceMod3B_factor2 * invR_ik;
 
           f_ik[0] = forceMod3B_factor1_ik * directorCos_ik_x +
             forceMod3B_factor2_ik *
             (cosTeta * directorCos_ik_x - directorCos_ij_x);
           f_ik[1] = forceMod3B_factor1_ik * directorCos_ik_y +
             forceMod3B_factor2_ik *
             (cosTeta * directorCos_ik_y - directorCos_ij_y);
           f_ik[2] = forceMod3B_factor1_ik * directorCos_ik_z +
             forceMod3B_factor2_ik *
             (cosTeta * directorCos_ik_z - directorCos_ij_z);
 
           forceModCoord += (forceMod3B_factor2 *
                             (tauFunctionDerived -  0.5 * mu * cosTetaDiff));
 
           f[j][0] += f_ij[0];
           f[j][1] += f_ij[1];
           f[j][2] += f_ij[2];
 
           f[k][0] += f_ik[0];
           f[k][1] += f_ik[1];
           f[k][2] += f_ik[2];
 
           f[i][0] -= f_ij[0] + f_ik[0];
           f[i][1] -= f_ij[1] + f_ik[1];
           f[i][2] -= f_ij[2] + f_ik[2];
 
           // potential energy
 
           evdwl = (exp3B_ij * exp3B_ik * potentia3B_factor);
 
           if (evflag) ev_tally3(i,j,k,evdwl,0.0,f_ij,f_ik,dr_ij,dr_ik);
       }
     }
 
     // forces due to environment coordination f(Z)
 
     for (int idx = 0; idx < numForceCoordPairs; idx++) {
         double dr_ij[3],f_ij[3];
 
         preForceCoord_counter = idx * 5;
         zeta_iDerivedInvR_ij=preForceCoord[preForceCoord_counter+0];
         dr_ij[0]=preForceCoord[preForceCoord_counter+1];
         dr_ij[1]=preForceCoord[preForceCoord_counter+2];
         dr_ij[2]=preForceCoord[preForceCoord_counter+3];
         j = static_cast<int> (preForceCoord[preForceCoord_counter+4]);
 
         forceModCoord_ij = forceModCoord * zeta_iDerivedInvR_ij;
 
         f_ij[0] = forceModCoord_ij * dr_ij[0];
         f_ij[1] = forceModCoord_ij * dr_ij[1];
         f_ij[2] = forceModCoord_ij * dr_ij[2];
 
         f[i][0] -= f_ij[0];
         f[i][1] -= f_ij[1];
         f[i][2] -= f_ij[2];
 
         f[j][0] += f_ij[0];
         f[j][1] += f_ij[1];
         f[j][2] += f_ij[2];
 
         // potential energy
 
         evdwl = 0.0;
         if (evflag) ev_tally(i, j, nlocal, newton_pair, evdwl, 0.0,
                              -forceModCoord_ij, dr_ij[0], dr_ij[1], dr_ij[2]);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEDIP::allocateGrids(void)
 {
   int numGridPointsOneCutoffFunction;
   int numGridPointsNotOneCutoffFunction;
   int numGridPointsCutoffFunction;
   int numGridPointsR;
   int numGridPointsRTotal;
   int numGridPointsQFunctionGrid;
   int numGridPointsExpMinusBetaZeta_iZeta_i;
   int numGridPointsTauFunctionGrid;
   double maxArgumentTauFunctionGrid;
   double maxArgumentQFunctionGrid;
   double maxArgumentExpMinusBetaZeta_iZeta_i;
   double const leftLimitToZero = -DBL_MIN * 1000.0;
 
   // tauFunctionGrid
 
   maxArgumentTauFunctionGrid = leadDimInteractionList;
   numGridPointsTauFunctionGrid = (int)
     ((maxArgumentTauFunctionGrid) * GRIDDENSITY) + 2;
 
   memory->create(tauFunctionGrid,numGridPointsTauFunctionGrid,
                  "edip:tauFunctionGrid");
   memory->create(tauFunctionDerivedGrid,numGridPointsTauFunctionGrid,
                  "edip:tauFunctionDerivedGrid");
 
   // expMinusBetaZeta_iZeta_iGrid
 
   maxArgumentExpMinusBetaZeta_iZeta_i = leadDimInteractionList;
   numGridPointsExpMinusBetaZeta_iZeta_i = (int)
     ((maxArgumentExpMinusBetaZeta_iZeta_i) * GRIDDENSITY) + 2;
   memory->create(expMinusBetaZeta_iZeta_iGrid,
                  numGridPointsExpMinusBetaZeta_iZeta_i,
                  "edip:expMinusBetaZeta_iZeta_iGrid");
 
   // qFunctionGrid
 
   maxArgumentQFunctionGrid = leadDimInteractionList;
   numGridPointsQFunctionGrid = (int)
     ((maxArgumentQFunctionGrid) * GRIDDENSITY) + 2;
   memory->create(qFunctionGrid,numGridPointsQFunctionGrid,"edip:qFunctionGrid");
 
   // cutoffFunction
 
   numGridPointsOneCutoffFunction = (int) ((cutoffC - GRIDSTART) * GRIDDENSITY);
   numGridPointsNotOneCutoffFunction = (int) ((cutoffA-cutoffC) * GRIDDENSITY);
   numGridPointsCutoffFunction = numGridPointsOneCutoffFunction +
     numGridPointsNotOneCutoffFunction+2;
 
   memory->create(cutoffFunction,numGridPointsCutoffFunction,
                  "edip:cutoffFunction");
   memory->create(cutoffFunctionDerived,numGridPointsCutoffFunction,
                  "edip:cutoffFunctionDerived");
 
   // pow2B
 
   numGridPointsR = (int)
     ((cutoffA + leftLimitToZero - GRIDSTART) * GRIDDENSITY);
   numGridPointsRTotal = numGridPointsR + 2;
 
   memory->create(pow2B,numGridPointsRTotal,"edip:pow2B");
   memory->create(exp2B,numGridPointsRTotal,"edip:exp2B");
   memory->create(exp3B,numGridPointsRTotal,"edip:exp3B");
 }
 
 /* ----------------------------------------------------------------------
    pre-calculated structures
 ------------------------------------------------------------------------- */
 
 void PairEDIP::allocatePreLoops(void)
 {
   int nthreads = comm->nthreads;
 
   memory->create(preInvR_ij,nthreads*leadDimInteractionList,"edip:preInvR_ij");
   memory->create(preExp3B_ij,nthreads*leadDimInteractionList,"edip:preExp3B_ij");
   memory->create(preExp3BDerived_ij,nthreads*leadDimInteractionList,
                  "edip:preExp3BDerived_ij");
   memory->create(preExp2B_ij,nthreads*leadDimInteractionList,"edip:preExp2B_ij");
   memory->create(preExp2BDerived_ij,nthreads*leadDimInteractionList,
                  "edip:preExp2BDerived_ij");
   memory->create(prePow2B_ij,nthreads*leadDimInteractionList,"edip:prePow2B_ij");
   memory->create(preForceCoord,5*nthreads*leadDimInteractionList,"edip:preForceCoord");
 }
 
 /* ----------------------------------------------------------------------
    deallocate grids
 ------------------------------------------------------------------------- */
 
 void PairEDIP::deallocateGrids(void)
 {
   memory->destroy(cutoffFunction);
   memory->destroy(cutoffFunctionDerived);
   memory->destroy(pow2B);
   memory->destroy(exp2B);
   memory->destroy(exp3B);
   memory->destroy(qFunctionGrid);
   memory->destroy(expMinusBetaZeta_iZeta_iGrid);
   memory->destroy(tauFunctionGrid);
   memory->destroy(tauFunctionDerivedGrid);
 }
 
 /* ----------------------------------------------------------------------
    deallocate preLoops
 ------------------------------------------------------------------------- */
 
 void PairEDIP::deallocatePreLoops(void)
 {
   memory->destroy(preInvR_ij);
   memory->destroy(preExp3B_ij);
   memory->destroy(preExp3BDerived_ij);
   memory->destroy(preExp2B_ij);
   memory->destroy(preExp2BDerived_ij);
   memory->destroy(prePow2B_ij);
   memory->destroy(preForceCoord);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEDIP::allocate()
 {
   allocated = 1;
   int n = atom->ntypes;
 
   memory->create(setflag,n+1,n+1,"pair:setflag");
   memory->create(cutsq,n+1,n+1,"pair:cutsq");
 
   map = new int[n+1];
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairEDIP::settings(int narg, char **arg)
 {
   if (narg != 0) error->all(FLERR,"Illegal pair_style command");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEDIP::initGrids(void)
 {
   int l;
   int numGridPointsOneCutoffFunction;
   int numGridPointsNotOneCutoffFunction;
   int numGridPointsCutoffFunction;
   int numGridPointsR;
-  int numGridPointsRTotal;
   int numGridPointsQFunctionGrid;
   int numGridPointsExpMinusBetaZeta_iZeta_i;
   int numGridPointsTauFunctionGrid;
   double maxArgumentTauFunctionGrid;
   double maxArgumentQFunctionGrid;
   double maxArgumentExpMinusBetaZeta_iZeta_i;
   double r;
   double temp;
   double temp3;
   double temp4;
   double deltaArgumentR;
   double deltaArgumentCutoffFunction;
   double deltaArgumentQFunctionGrid;
   double deltaArgumentTauFunctionGrid;
   double deltaArgumentExpMinusBetaZeta_iZeta_i;
   double const leftLimitToZero = -DBL_MIN * 1000.0;
 
   // tauFunctionGrid
 
   maxArgumentTauFunctionGrid = leadDimInteractionList;
 
   numGridPointsTauFunctionGrid = (int)
     ((maxArgumentTauFunctionGrid) * GRIDDENSITY) + 2;
 
   r = 0.0;
   deltaArgumentTauFunctionGrid = 1.0 / GRIDDENSITY;
 
   for (l = 0; l < numGridPointsTauFunctionGrid; l++) {
       tauFunctionGrid[l] = u1 + u2 * u3 * exp(-u4 * r) -
         u2 * exp(-2.0 * u4 * r);
       tauFunctionDerivedGrid[l] = - u2 * u3 * u4 * exp(-u4 * r) +
         2.0 * u2 * u4 * exp(-2.0 * u4 * r);
       r += deltaArgumentTauFunctionGrid;
   }
 
   // expMinusBetaZeta_iZeta_iGrid
 
   maxArgumentExpMinusBetaZeta_iZeta_i = leadDimInteractionList;
 
   numGridPointsExpMinusBetaZeta_iZeta_i = (int)
     ((maxArgumentExpMinusBetaZeta_iZeta_i) * GRIDDENSITY) + 2;
 
   r = 0.0;
   deltaArgumentExpMinusBetaZeta_iZeta_i = 1.0 / GRIDDENSITY;
 
   for (l = 0; l < numGridPointsExpMinusBetaZeta_iZeta_i; l++) {
       expMinusBetaZeta_iZeta_iGrid[l] = exp(-beta * r * r);
       r += deltaArgumentExpMinusBetaZeta_iZeta_i;
   }
 
   // qFunctionGrid
 
   maxArgumentQFunctionGrid = leadDimInteractionList;
   numGridPointsQFunctionGrid =
     (int) ((maxArgumentQFunctionGrid) * GRIDDENSITY) + 2;
 
   r = 0.0;
   deltaArgumentQFunctionGrid = 1.0 / GRIDDENSITY;
 
   for (l = 0; l < numGridPointsQFunctionGrid; l++) {
       qFunctionGrid[l] = Q0 * exp(-mu * r);
       r += deltaArgumentQFunctionGrid;
   }
 
   // cutoffFunction
 
   numGridPointsOneCutoffFunction =
     (int) ((cutoffC - GRIDSTART) * GRIDDENSITY);
   numGridPointsNotOneCutoffFunction =
     (int) ((cutoffA-cutoffC) * GRIDDENSITY);
   numGridPointsCutoffFunction =
     numGridPointsOneCutoffFunction+numGridPointsNotOneCutoffFunction+2;
 
   r = GRIDSTART;
   deltaArgumentCutoffFunction = 1.0 / GRIDDENSITY;
 
   for (l = 0; l < numGridPointsOneCutoffFunction; l++) {
       cutoffFunction[l] = 1.0;
       cutoffFunctionDerived[l] = 0.0;
       r += deltaArgumentCutoffFunction;
   }
 
   for (l = numGridPointsOneCutoffFunction;
        l < numGridPointsCutoffFunction; l++) {
       temp = (cutoffA - cutoffC)/(r - cutoffC);
       temp3 = temp * temp * temp;
       temp4 = temp3 * temp;
       cutoffFunction[l] = exp(alpha/(1.0-temp3));
       cutoffFunctionDerived[l] = (-3*alpha/(cutoffA-cutoffC)) *
         (temp4/((1-temp3)*(1-temp3)))*exp(alpha/(1.0-temp3));
       r += deltaArgumentCutoffFunction;
   }
 
   // pow2B
 
   numGridPointsR = (int)
     ((cutoffA + leftLimitToZero - GRIDSTART) * GRIDDENSITY);
-  numGridPointsRTotal = numGridPointsR + 2;
 
   r = GRIDSTART;
   deltaArgumentR = 1.0 / GRIDDENSITY;
   for (l = 0; l < numGridPointsR; l++) {
       pow2B[l] = pow((B/r),rho);
       exp2B[l] = A * exp(sigma/(r-cutoffA));
       exp3B[l] = exp(gamm/(r-cutoffA));
       r += deltaArgumentR;
   }
 
   pow2B[numGridPointsR] = pow((B/r),rho);
   exp2B[numGridPointsR]=0;
   exp3B[numGridPointsR]=0;
   r += deltaArgumentR;
   pow2B[numGridPointsR+1] = pow((B/r),rho);
   exp2B[numGridPointsR+1]=0;
   exp3B[numGridPointsR+1]=0;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairEDIP::coeff(int narg, char **arg)
 {
   int i,j,n;
 
   if (!allocated) allocate();
 
   if (narg != 3 + atom->ntypes)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // insure I,J args are * *
 
   if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // read args that map atom types to elements in potential file
   // map[i] = which element the Ith atom type is, -1 if NULL
   // nelements = # of unique elements
   // elements = list of element names
 
   if (elements) {
     for (i = 0; i < nelements; i++) delete [] elements[i];
     delete [] elements;
   }
   elements = new char*[atom->ntypes];
   for (i = 0; i < atom->ntypes; i++) elements[i] = NULL;
 
   nelements = 0;
   for (i = 3; i < narg; i++) {
     if (strcmp(arg[i],"NULL") == 0) {
       map[i-2] = -1;
       continue;
     }
     for (j = 0; j < nelements; j++)
       if (strcmp(arg[i],elements[j]) == 0) break;
     map[i-2] = j;
     if (j == nelements) {
       n = strlen(arg[i]) + 1;
       elements[j] = new char[n];
       strcpy(elements[j],arg[i]);
       nelements++;
     }
   }
 
   // read potential file and initialize potential parameters
 
   read_file(arg[2]);
   setup();
 
   // clear setflag since coeff() called once with I,J = * *
 
   n = atom->ntypes;
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       setflag[i][j] = 0;
 
   // set setflag i,j for type pairs where both are mapped to elements
 
   int count = 0;
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       if (map[i] >= 0 && map[j] >= 0) {
         setflag[i][j] = 1;
         count++;
       }
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 
   // allocate tables and internal structures
 
   allocatePreLoops();
   allocateGrids();
   initGrids();
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairEDIP::init_style()
 {
   if (force->newton_pair == 0)
     error->all(FLERR,"Pair style EDIP requires newton pair on");
 
   // need a full neighbor list
 
   int irequest = neighbor->request(this);
   neighbor->requests[irequest]->half = 0;
   neighbor->requests[irequest]->full = 1;
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairEDIP::init_one(int i, int j)
 {
   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
 
   return cutmax;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEDIP::read_file(char *file)
 {
   int params_per_line = 20;
   char **words = new char*[params_per_line+1];
 
   memory->sfree(params);
   params = NULL;
   nparams = maxparam = 0;
 
   // open file on proc 0
 
   FILE *fp;
   if (comm->me == 0) {
     fp = force->open_potential(file);
     if (fp == NULL) {
       char str[128];
       sprintf(str,"Cannot open EDIP potential file %s",file);
       error->one(FLERR,str);
     }
   }
 
   // read each set of params from potential file
   // one set of params can span multiple lines
   // store params if all 3 element tags are in element list
 
   int n,nwords,ielement,jelement,kelement;
   char line[MAXLINE],*ptr;
   int eof = 0;
 
   while (1) {
     if (comm->me == 0) {
       ptr = fgets(line,MAXLINE,fp);
       if (ptr == NULL) {
         eof = 1;
         fclose(fp);
       } else n = strlen(line) + 1;
     }
     MPI_Bcast(&eof,1,MPI_INT,0,world);
     if (eof) break;
     MPI_Bcast(&n,1,MPI_INT,0,world);
     MPI_Bcast(line,n,MPI_CHAR,0,world);
 
     // strip comment, skip line if blank
 
     if ((ptr = strchr(line,'#'))) *ptr = '\0';
     nwords = atom->count_words(line);
     if (nwords == 0) continue;
 
     // concatenate additional lines until have params_per_line words
 
     while (nwords < params_per_line) {
       n = strlen(line);
       if (comm->me == 0) {
         ptr = fgets(&line[n],MAXLINE-n,fp);
         if (ptr == NULL) {
           eof = 1;
           fclose(fp);
         } else n = strlen(line) + 1;
       }
       MPI_Bcast(&eof,1,MPI_INT,0,world);
       if (eof) break;
       MPI_Bcast(&n,1,MPI_INT,0,world);
       MPI_Bcast(line,n,MPI_CHAR,0,world);
       if ((ptr = strchr(line,'#'))) *ptr = '\0';
       nwords = atom->count_words(line);
     }
 
     if (nwords != params_per_line)
       error->all(FLERR,"Incorrect format in EDIP potential file");
 
     // words = ptrs to all words in line
 
     nwords = 0;
     words[nwords++] = strtok(line," \t\n\r\f");
     while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
 
     // ielement,jelement,kelement = 1st args
     // if all 3 args are in element list, then parse this line
     // else skip to next entry in file
 
     for (ielement = 0; ielement < nelements; ielement++)
       if (strcmp(words[0],elements[ielement]) == 0) break;
     if (ielement == nelements) continue;
     for (jelement = 0; jelement < nelements; jelement++)
       if (strcmp(words[1],elements[jelement]) == 0) break;
     if (jelement == nelements) continue;
     for (kelement = 0; kelement < nelements; kelement++)
       if (strcmp(words[2],elements[kelement]) == 0) break;
     if (kelement == nelements) continue;
 
     // load up parameter settings and error check their values
 
     if (nparams == maxparam) {
       maxparam += DELTA;
       params = (Param *) memory->srealloc(params,maxparam*sizeof(Param),
                                           "pair:params");
     }
 
     params[nparams].ielement = ielement;
     params[nparams].jelement = jelement;
     params[nparams].kelement = kelement;
     params[nparams].A = atof(words[3]);
     params[nparams].B = atof(words[4]);
     params[nparams].cutoffA = atof(words[5]);
     params[nparams].cutoffC = atof(words[6]);
     params[nparams].alpha = atof(words[7]);
     params[nparams].beta = atof(words[8]);
     params[nparams].eta = atof(words[9]);
     params[nparams].gamm = atof(words[10]);
     params[nparams].lambda = atof(words[11]);
     params[nparams].mu = atof(words[12]);
     params[nparams].rho = atof(words[13]);
     params[nparams].sigma = atof(words[14]);
     params[nparams].Q0 = atof(words[15]);
     params[nparams].u1 = atof(words[16]);
     params[nparams].u2 = atof(words[17]);
     params[nparams].u3 = atof(words[18]);
     params[nparams].u4 = atof(words[19]);
 
     if (params[nparams].A < 0.0 || params[nparams].B < 0.0 ||
         params[nparams].cutoffA < 0.0 || params[nparams].cutoffC < 0.0 ||
         params[nparams].alpha < 0.0 || params[nparams].beta < 0.0 ||
         params[nparams].eta < 0.0 || params[nparams].gamm < 0.0 ||
         params[nparams].lambda < 0.0 || params[nparams].mu < 0.0 ||
         params[nparams].rho < 0.0 || params[nparams].sigma < 0.0)
       error->all(FLERR,"Illegal EDIP parameter");
 
     nparams++;
   }
 
   delete [] words;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEDIP::setup()
 {
   int i,j,k,m,n;
   double rtmp;
 
   // set elem2param for all triplet combinations
   // must be a single exact match to lines read from file
   // do not allow for ACB in place of ABC
 
   memory->destroy(elem2param);
   memory->create(elem2param,nelements,nelements,nelements,"pair:elem2param");
 
   for (i = 0; i < nelements; i++)
     for (j = 0; j < nelements; j++)
       for (k = 0; k < nelements; k++) {
         n = -1;
         for (m = 0; m < nparams; m++) {
           if (i == params[m].ielement && j == params[m].jelement &&
               k == params[m].kelement) {
             if (n >= 0) error->all(FLERR,"Potential file has duplicate entry");
             n = m;
           }
         }
         if (n < 0) error->all(FLERR,"Potential file is missing an entry");
         elem2param[i][j][k] = n;
       }
 
   // set cutoff square
 
   for (m = 0; m < nparams; m++) {
     params[m].cutsq = params[m].cutoffA*params[m].cutoffA;
   }
 
   // set cutmax to max of all params
 
   cutmax = 0.0;
   for (m = 0; m < nparams; m++) {
     rtmp = sqrt(params[m].cutsq);
     if (rtmp > cutmax) cutmax = rtmp;
   }
 
   // this should be removed for multi species parametrizations
 
   A = params[0].A;
   B = params[0].B;
   rho = params[0].rho;
   cutoffA = params[0].cutoffA;
   cutoffC = params[0].cutoffC;
   sigma = params[0].sigma;
   lambda = params[0].lambda;
   gamm = params[0].gamm;
   eta = params[0].eta;
   Q0 = params[0].Q0;
   mu = params[0].mu;
   beta = params[0].beta;
   alpha = params[0].alpha;
   u1 = params[0].u1;
   u2 = params[0].u2;
   u3 = params[0].u3;
   u4 = params[0].u4;
 }
diff --git a/src/USER-OMP/pair_cdeam_omp.cpp b/src/USER-OMP/pair_cdeam_omp.cpp
index 56c9b9d33..8f0cd8072 100644
--- a/src/USER-OMP/pair_cdeam_omp.cpp
+++ b/src/USER-OMP/pair_cdeam_omp.cpp
@@ -1,537 +1,541 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "string.h"
 
 #include "pair_cdeam_omp.h"
 #include "atom.h"
 #include "comm.h"
 #include "error.h"
 #include "force.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // This is for debugging purposes. The ASSERT() macro is used in the code to check
 // if everything runs as expected. The checks are currently compiled out (#if 0);
 // change this to #if 1 to enable them.
 #if 0
         // checking variant: calls my_failure() when the condition does not hold
         #define ASSERT(cond) ((!(cond)) ? my_failure(error,__FILE__,__LINE__) : my_noop())
 
         // no-op branch of the ASSERT() conditional expression
         inline void my_noop() {}
         // format the failing file/line into a message and abort via error->one()
         inline void my_failure(Error* error, const char* file, int line) {
                 char str[1024];
                 sprintf(str,"Assertion failure: File %s, line %i", file, line);
                 error->one(FLERR,str);
         }
 #else
         // disabled variant: ASSERT() expands to nothing
         #define ASSERT(cond)
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 PairCDEAMOMP::PairCDEAMOMP(LAMMPS *lmp, int _cdeamVersion) :
   PairEAM(lmp), PairCDEAM(lmp,_cdeamVersion), ThrOMP(lmp, THR_PAIR)
 {
   // clear the respa_enable flag for this style
   respa_enable = 0;
 
   // add the OMP bit to the suffix flags
   suffix_flag = suffix_flag | Suffix::OMP;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCDEAMOMP::compute(int eflag, int vflag)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
 
   const int nall = atom->nlocal + atom->nghost;
   const int nthreads = comm->nthreads;
   const int inum = list->inum;
 
   // grow energy and fp arrays if necessary
   // need to be atom->nmax in length
 
   if (atom->nmax > nmax) {
     memory->destroy(rho);
     memory->destroy(rhoB);
     memory->destroy(D_values);
     memory->destroy(fp);
     nmax = atom->nmax;
     memory->create(rho,nthreads*nmax,"pair:rho");
     memory->create(rhoB,nthreads*nmax,"pair:mu");
     memory->create(D_values,nthreads*nmax,"pair:D_values");
     memory->create(fp,nmax,"pair:fp");
   }
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none) shared(eflag,vflag)
 #endif
   {
     int ifrom, ito, tid;
 
     loop_setup_thr(ifrom, ito, tid, inum, nthreads);
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
     ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
 
     if (force->newton_pair)
       thr->init_cdeam(nall, rho, rhoB, D_values);
     else
       thr->init_cdeam(atom->nlocal, rho, rhoB, D_values);
 
     switch (cdeamVersion) {
 
     case 1:
 
       if (evflag) {
         if (eflag) {
           if (force->newton_pair) eval<1,1,1,1>(ifrom, ito, thr);
           else eval<1,1,0,1>(ifrom, ito, thr);
         } else {
           if (force->newton_pair) eval<1,0,1,1>(ifrom, ito, thr);
           else eval<1,0,0,1>(ifrom, ito, thr);
         }
       } else {
         if (force->newton_pair) eval<0,0,1,1>(ifrom, ito, thr);
         else eval<0,0,0,1>(ifrom, ito, thr);
       }
       break;
 
     case 2:
 
       if (evflag) {
         if (eflag) {
           if (force->newton_pair) eval<1,1,1,2>(ifrom, ito, thr);
           else eval<1,1,0,2>(ifrom, ito, thr);
         } else {
           if (force->newton_pair) eval<1,0,1,2>(ifrom, ito, thr);
           else eval<1,0,0,2>(ifrom, ito, thr);
         }
       } else {
         if (force->newton_pair) eval<0,0,1,2>(ifrom, ito, thr);
         else eval<0,0,0,2>(ifrom, ito, thr);
       }
       break;
 
     default:
 #if defined(_OPENMP)
 #pragma omp master
 #endif
     error->all(FLERR,"unsupported eam/cd pair style variant");
     }
 
     thr->timer(Timer::PAIR);
     reduce_thr(this, eflag, vflag, thr);
   } // end of omp parallel region
 }
 
 // Per-thread CD-EAM kernel: processes neighbor-list entries ilist[iifrom..iito-1].
 //   EVFLAG       - when non-zero, pair energy/virial is tallied via ev_tally_thr()
 //   EFLAG        - when non-zero, embedding and pair energies are computed and tallied
 //   NEWTON_PAIR  - when zero, contributions to atom j are only applied if j < nlocal
 //   CDEAMVERSION - 1: D values accumulated in Stage I (one-site formulation)
 //                  2: D values accumulated in a separate Stage II (two-site formulation)
 // Densities and D values are accumulated into the per-thread arrays obtained from thr,
 // reduced with data_reduce_thr(), and communicated by the master thread only, with
 // sync_threads() between the stages.
 template <int EVFLAG, int EFLAG, int NEWTON_PAIR, int CDEAMVERSION>
 void PairCDEAMOMP::eval(int iifrom, int iito, ThrData * const thr)
 {
   int i,j,ii,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,rhoip,rhojp,recip,phi;
   int *ilist,*jlist,*numneigh,**firstneigh;
 
   evdwl = 0.0;
 
   const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
   dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
   double * const rho_t = thr->get_rho();
   double * const rhoB_t = thr->get_rhoB();
   double * const D_values_t = thr->get_D_values();
   const int tid = thr->get_tid();
   const int nthreads = comm->nthreads;
 
   const int * _noalias const type = atom->type;
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   double fxtmp,fytmp,fztmp;
 
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   // Stage I
 
   // Compute rho and rhoB at each local atom site.
   // Additionally calculate the D_i values here if we are using the one-site formulation.
   // For the two-site formulation we have to calculate the D values in an extra loop (Stage II).
 
   for (ii = iifrom; ii < iito; ii++) {
     i = ilist[ii];
     xtmp = x[i].x;
     ytmp = x[i].y;
     ztmp = x[i].z;
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j].x;
       dely = ytmp - x[j].y;
       delz = ztmp - x[j].z;
       rsq = delx*delx + dely*dely + delz*delz;
 
       if(rsq < cutforcesq) {
         jtype = type[j];
         double r = sqrt(rsq);
         const EAMTableIndex index = radiusToTableIndex(r);
         double localrho = RhoOfR(index, jtype, itype);
         rho_t[i] += localrho;
         if(jtype == speciesB) rhoB_t[i] += localrho;
         if(NEWTON_PAIR || j < nlocal) {
           localrho = RhoOfR(index, itype, jtype);
           rho_t[j] += localrho;
           if(itype == speciesB) rhoB_t[j] += localrho;
         }
 
         if(CDEAMVERSION == 1 && itype != jtype) {
           // Note: if the i-j interaction is not concentration dependent (because either
           // i or j are not species A or B) then its contribution to D_i and D_j should
           // be ignored.
           // This if-clause is only required for a ternary.
           if((itype == speciesA && jtype == speciesB)
              || (jtype == speciesA && itype == speciesB)) {
             double Phi_AB = PhiOfR(index, itype, jtype, 1.0 / r);
             D_values_t[i] += Phi_AB;
             if(NEWTON_PAIR || j < nlocal)
               D_values_t[j] += Phi_AB;
           }
         }
       }
     }
   }
 
   // wait until all threads are done with computation
   sync_threads();
 
   // communicate and sum densities
 
   if (NEWTON_PAIR) {
     // reduce per thread density
     thr->timer(Timer::PAIR);
     data_reduce_thr(rho, nall, nthreads, 1, tid);
     data_reduce_thr(rhoB, nall, nthreads, 1, tid);
     if (CDEAMVERSION==1)
       data_reduce_thr(D_values, nall, nthreads, 1, tid);
 
     // wait until reduction is complete
     sync_threads();
 
 #if defined(_OPENMP)
 #pragma omp master
 #endif
     { communicationStage = 1;
       comm->reverse_comm_pair(this); }
 
     // wait until master thread is done with communication
     sync_threads();
 
   } else {
     // reduce per thread density
     thr->timer(Timer::PAIR);
     data_reduce_thr(rho, nlocal, nthreads, 1, tid);
     data_reduce_thr(rhoB, nlocal, nthreads, 1, tid);
     if (CDEAMVERSION==1)
       data_reduce_thr(D_values, nlocal, nthreads, 1, tid);
 
     // wait until reduction is complete
     sync_threads();
   }
 
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
 
   for (ii = iifrom; ii < iito; ii++) {
     i = ilist[ii];
     EAMTableIndex index = rhoToTableIndex(rho[i]);
     fp[i] = FPrimeOfRho(index, type[i]);
     if(EFLAG) {
       phi = FofRho(index, type[i]);
       e_tally_thr(this, i, i, nlocal, NEWTON_PAIR, phi, 0.0, thr);
     }
   }
 
   // wait until all threads are done with computation
   sync_threads();
 
   // Communicate derivative of embedding function and densities
   // and D_values (this for one-site formulation only).
 #if defined(_OPENMP)
 #pragma omp master
 #endif
   { communicationStage = 2;
     comm->forward_comm_pair(this); }
 
   // wait until master thread is done with communication
   sync_threads();
 
 
   // The electron densities may not drop to zero because then the concentration would no longer be defined.
   // But the concentration is not needed anyway if there is no interaction with another atom, which is the case
   // if the electron density is exactly zero. That's why the following lines have been commented out.
   //
   //for(i = 0; i < nlocal + atom->nghost; i++) {
   //        if(rho[i] == 0 && (type[i] == speciesA || type[i] == speciesB))
   //                error->one(FLERR,"CD-EAM potential routine: Detected atom with zero electron density.");
   //}
 
   // Stage II
   // This is only required for the original two-site formulation of the CD-EAM potential.
 
   if(CDEAMVERSION == 2) {
     // Compute intermediate value D_i for each atom.
     for (ii = iifrom; ii < iito; ii++) {
       i = ilist[ii];
       xtmp = x[i].x;
       ytmp = x[i].y;
       ztmp = x[i].z;
       itype = type[i];
       jlist = firstneigh[i];
       jnum = numneigh[i];
 
       // This code line is required for ternary alloys.
       if(itype != speciesA && itype != speciesB) continue;
 
       double x_i = rhoB[i] / rho[i];        // Concentration at atom i.
 
       for(jj = 0; jj < jnum; jj++) {
         j = jlist[jj];
         j &= NEIGHMASK;
         jtype = type[j];
         if(itype == jtype) continue;
 
         // This code line is required for ternary alloys.
         if(jtype != speciesA && jtype != speciesB) continue;
 
         delx = xtmp - x[j].x;
         dely = ytmp - x[j].y;
         delz = ztmp - x[j].z;
         rsq = delx*delx + dely*dely + delz*delz;
 
         if(rsq < cutforcesq) {
           double r = sqrt(rsq);
           const EAMTableIndex index = radiusToTableIndex(r);
 
           // The concentration independent part of the cross pair potential.
           double Phi_AB = PhiOfR(index, itype, jtype, 1.0 / r);
 
           // Average concentration of two sites
           double x_ij = 0.5 * (x_i + rhoB[j]/rho[j]);
 
           // Calculate derivative of h(x_ij) polynomial function.
           double h_prime = evalHprime(x_ij);
 
           D_values_t[i] += h_prime * Phi_AB / (2.0 * rho[i] * rho[i]);
           if(NEWTON_PAIR || j < nlocal)
             D_values_t[j] += h_prime * Phi_AB / (2.0 * rho[j] * rho[j]);
         }
       }
     }
 
     // reduce the per-thread D values; with newton pair on they are also
     // reverse-communicated by the master thread (stage 3)
     if (NEWTON_PAIR) {
     thr->timer(Timer::PAIR);
       data_reduce_thr(D_values, nall, nthreads, 1, tid);
 
       // wait until reduction is complete
       sync_threads();
 
 #if defined(_OPENMP)
 #pragma omp master
 #endif
       { communicationStage = 3;
         comm->reverse_comm_pair(this); }
 
       // wait until master thread is done with communication
       sync_threads();
 
   } else {
     thr->timer(Timer::PAIR);
       data_reduce_thr(D_values, nlocal, nthreads, 1, tid);
 
     // wait until reduction is complete
     sync_threads();
   }
 
     // forward communication of the reduced values (stage 4), master thread only
 #if defined(_OPENMP)
 #pragma omp master
 #endif
     { communicationStage = 4;
       comm->forward_comm_pair(this); }
 
     // wait until master thread is done with communication
     sync_threads();
   }
 
   // Stage III
 
   // Compute force acting on each atom.
   for (ii = iifrom; ii < iito; ii++) {
     i = ilist[ii];
     xtmp = x[i].x;
     ytmp = x[i].y;
     ztmp = x[i].z;
     itype = type[i];
     fxtmp = fytmp = fztmp = 0.0;
 
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     // Concentration at site i
     double x_i = -1.0;                // The value -1 indicates: no concentration dependence for all interactions of atom i.
     // It will be replaced by the concentration at site i if atom i is either A or B.
 
     // D_i is only assigned (and only read below) when x_i has been set, i.e. x_i != -1.0
     double D_i, h_prime_i;
 
     // This if-clause is only required for ternary alloys.
     if((itype == speciesA || itype == speciesB) && rho[i] != 0.0) {
 
       // Compute local concentration at site i.
       x_i = rhoB[i]/rho[i];
       ASSERT(x_i >= 0 && x_i<=1.0);
 
       if(CDEAMVERSION == 1) {
         // Calculate derivative of h(x_i) polynomial function.
         h_prime_i = evalHprime(x_i);
         D_i = D_values[i] * h_prime_i / (2.0 * rho[i] * rho[i]);
       } else if(CDEAMVERSION == 2) {
         D_i = D_values[i];
-      } else ASSERT(false);
+      } else {
+        ASSERT(false);
+      }
     }
 
     for(jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j].x;
       dely = ytmp - x[j].y;
       delz = ztmp - x[j].z;
       rsq = delx*delx + dely*dely + delz*delz;
 
       if(rsq < cutforcesq) {
         jtype = type[j];
         double r = sqrt(rsq);
         const EAMTableIndex index = radiusToTableIndex(r);
 
         // rhoip = derivative of (density at atom j due to atom i)
         // rhojp = derivative of (density at atom i due to atom j)
         // psip needs both fp[i] and fp[j] terms since r_ij appears in two
         //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
         //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
         rhoip = RhoPrimeOfR(index, itype, jtype);
         rhojp = RhoPrimeOfR(index, jtype, itype);
         fpair = fp[i]*rhojp + fp[j]*rhoip;
         recip = 1.0/r;
 
         double x_j = -1;  // The value -1 indicates: no concentration dependence for this i-j pair
         // because atom j is not of species A nor B.
 
         // This code line is required for ternary alloy.
         if(jtype == speciesA || jtype == speciesB) {
           ASSERT(rho[i] != 0.0);
           ASSERT(rho[j] != 0.0);
 
           // Compute local concentration at site j.
           x_j = rhoB[j]/rho[j];
           ASSERT(x_j >= 0 && x_j<=1.0);
 
           double D_j;
           if(CDEAMVERSION == 1) {
             // Calculate derivative of h(x_j) polynomial function.
             double h_prime_j = evalHprime(x_j);
             D_j = D_values[j] * h_prime_j / (2.0 * rho[j] * rho[j]);
           } else if(CDEAMVERSION == 2) {
             D_j = D_values[j];
-          } else ASSERT(false);
-
+          } else {
+            ASSERT(false);
+          }
           double t2 = -rhoB[j];
           if(itype == speciesB) t2 += rho[j];
           fpair += D_j * rhoip * t2;
         }
 
         // This if-clause is only required for a ternary alloy.
         // Actually we don't need it at all because D_i should be zero anyway if
         // atom i has no concentration dependent interactions (because it is not species A or B).
         if(x_i != -1.0) {
           double t1 = -rhoB[i];
           if(jtype == speciesB) t1 += rho[i];
           fpair += D_i * rhojp * t1;
         }
 
         double phip;
         double phi = PhiOfR(index, itype, jtype, recip, phip);
         if(itype == jtype || x_i == -1.0 || x_j == -1.0) {
           // Case of no concentration dependence.
           fpair += phip;
         } else {
           // We have a concentration dependence for the i-j interaction.
           double h;
           if(CDEAMVERSION == 1) {
             // Calculate h(x_i) polynomial function.
             double h_i = evalH(x_i);
             // Calculate h(x_j) polynomial function.
             double h_j = evalH(x_j);
             h = 0.5 * (h_i + h_j);
           } else if(CDEAMVERSION == 2) {
             // Average concentration.
             double x_ij = 0.5 * (x_i + x_j);
             // Calculate h(x_ij) polynomial function.
             h = evalH(x_ij);
-          } else ASSERT(false);
-
+          } else {
+            ASSERT(false);
+          }
           fpair += h * phip;
           phi *= h;
         }
 
         // Divide by r_ij and negate to get forces from gradient.
         fpair /= -r;
 
         fxtmp += delx*fpair;
         fytmp += dely*fpair;
         fztmp += delz*fpair;
         if(NEWTON_PAIR || j < nlocal) {
           f[j].x -= delx*fpair;
           f[j].y -= dely*fpair;
           f[j].z -= delz*fpair;
         }
 
         if(EFLAG) evdwl = phi;
         if(EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,evdwl,0.0,
                                 fpair,delx,dely,delz,thr);
       }
     }
     // add the force accumulated over all neighbors of atom i
     f[i].x += fxtmp;
     f[i].y += fytmp;
     f[i].z += fztmp;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairCDEAMOMP::memory_usage()
 {
   // per-thread storage first, then the storage reported by the base pair style
   const double thr_bytes = memory_usage_thr();
   const double base_bytes = PairCDEAM::memory_usage();
 
   return thr_bytes + base_bytes;
 }
diff --git a/src/USER-OMP/pair_gran_hooke_omp.cpp b/src/USER-OMP/pair_gran_hooke_omp.cpp
index d3988bc5b..c4e792708 100644
--- a/src/USER-OMP/pair_gran_hooke_omp.cpp
+++ b/src/USER-OMP/pair_gran_hooke_omp.cpp
@@ -1,270 +1,270 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "pair_gran_hooke_omp.h"
 #include "atom.h"
 #include "comm.h"
 #include "fix.h"
 #include "force.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairGranHookeOMP::PairGranHookeOMP(LAMMPS *lmp) :
   PairGranHooke(lmp), ThrOMP(lmp, THR_PAIR)
 {
   // clear the respa_enable flag for this style
   respa_enable = 0;
 
   // add the OMP bit to the suffix flags
   suffix_flag = suffix_flag | Suffix::OMP;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGranHookeOMP::compute(int eflag, int vflag)
 {
   // set up energy/virial accumulation, or clear the related flags
 
   if (!(eflag || vflag)) {
     evflag = vflag_fdotr = 0;
   } else {
     ev_setup(eflag,vflag);
   }
 
   const int nall = atom->nlocal + atom->nghost;
   const int nthreads = comm->nthreads;
   const int inum = list->inum;
 
   // refresh the per-atom rigid body mass when FixRigid masses are in use
   // only done on steps where neighbor->ago is zero
   // body_of[i] = index of the body atom i belongs to, negative if none
   // body_mass[b] = total mass of rigid body b
 
   if (fix_rigid && neighbor->ago == 0) {
     int dim;
     int *body_of = (int *) fix_rigid->extract("body",dim);
     double *body_mass = (double *) fix_rigid->extract("masstotal",dim);
 
     if (atom->nmax > nmax) {
       memory->destroy(mass_rigid);
       nmax = atom->nmax;
       memory->create(mass_rigid,nmax,"pair:mass_rigid");
     }
 
     const int nowned = atom->nlocal;
     for (int iatom = 0; iatom < nowned; iatom++) {
       mass_rigid[iatom] = (body_of[iatom] >= 0) ? body_mass[body_of[iatom]] : 0.0;
     }
 
     comm->forward_comm_pair(this);
   }
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none) shared(eflag,vflag)
 #endif
   {
     int ifrom, ito, tid;
 
     loop_setup_thr(ifrom, ito, tid, inum, nthreads);
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
     ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
 
     // select the eval<EVFLAG,NEWTON_PAIR> instantiation
 
     if (evflag) {
       if (force->newton_pair) {
         eval<1,1>(ifrom, ito, thr);
       } else {
         eval<1,0>(ifrom, ito, thr);
       }
     } else {
       if (force->newton_pair) {
         eval<0,1>(ifrom, ito, thr);
       } else {
         eval<0,0>(ifrom, ito, thr);
       }
     }
 
     thr->timer(Timer::PAIR);
     reduce_thr(this, eflag, vflag, thr);
   } // end of omp parallel region
 }
 
 // Per-thread granular Hooke kernel: processes neighbor-list entries ilist[iifrom..iito-1].
 //   EVFLAG      - when non-zero, the pair force is tallied via ev_tally_xyz_thr()
 //   NEWTON_PAIR - when zero, force/torque on atom j is only applied if j < nlocal
 // Forces and torques are accumulated into the per-thread arrays obtained from thr.
 template <int EVFLAG, int NEWTON_PAIR>
 void PairGranHookeOMP::eval(int iifrom, int iito, ThrData * const thr)
 {
-  int i,j,ii,jj,jnum,itype,jtype;
+  int i,j,ii,jj,jnum;
   double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz;
   double radi,radj,radsum,rsq,r,rinv,rsqinv;
   double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
   double wr1,wr2,wr3;
   double vtr1,vtr2,vtr3,vrel;
   double mi,mj,meff,damp,ccel,tor1,tor2,tor3;
   double fn,fs,ft,fs1,fs2,fs3;
   int *ilist,*jlist,*numneigh,**firstneigh;
 
   const double * const * const x = atom->x;
   const double * const * const v = atom->v;
   const double * const * const omega = atom->omega;
   const double * const radius = atom->radius;
   const double * const rmass = atom->rmass;
   const double * const mass = atom->mass;
   double * const * const f = thr->get_f();
   double * const * const torque = thr->get_torque();
   const int * const type = atom->type;
   const int * const mask = atom->mask;
   const int nlocal = atom->nlocal;
   double fxtmp,fytmp,fztmp;
   double t1tmp,t2tmp,t3tmp;
 
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   // loop over neighbors of my atoms
 
   for (ii = iifrom; ii < iito; ++ii) {
 
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     radi = radius[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
     fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       radj = radius[j];
       radsum = radi + radj;
 
       // particles only interact while they overlap
       if (rsq < radsum*radsum) {
         r = sqrt(rsq);
         rinv = 1.0/r;
         rsqinv = 1.0/rsq;
 
         // relative translational velocity
 
         vr1 = v[i][0] - v[j][0];
         vr2 = v[i][1] - v[j][1];
         vr3 = v[i][2] - v[j][2];
 
         // normal component
 
         vnnr = vr1*delx + vr2*dely + vr3*delz;
         vn1 = delx*vnnr * rsqinv;
         vn2 = dely*vnnr * rsqinv;
         vn3 = delz*vnnr * rsqinv;
 
         // tangential component
 
         vt1 = vr1 - vn1;
         vt2 = vr2 - vn2;
         vt3 = vr3 - vn3;
 
         // relative rotational velocity
 
         wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
         wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
         wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
 
         // meff = effective mass of pair of particles
         // if I or J part of rigid body, use body mass
         // if I or J is frozen, meff is other particle
 
         if (rmass) {
           mi = rmass[i];
           mj = rmass[j];
         } else {
           mi = mass[type[i]];
           mj = mass[type[j]];
         }
         if (fix_rigid) {
           if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
           if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
         }
 
         meff = mi*mj / (mi+mj);
         if (mask[i] & freeze_group_bit) meff = mj;
         if (mask[j] & freeze_group_bit) meff = mi;
 
         // normal forces = Hookian contact + normal velocity damping
 
         damp = meff*gamman*vnnr*rsqinv;
         ccel = kn*(radsum-r)*rinv - damp;
 
         // relative velocities
 
         vtr1 = vt1 - (delz*wr2-dely*wr3);
         vtr2 = vt2 - (delx*wr3-delz*wr1);
         vtr3 = vt3 - (dely*wr1-delx*wr2);
         vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
         vrel = sqrt(vrel);
 
         // force normalization
         // the tangential damping force fs is capped at fn = xmu * |normal force|
 
         fn = xmu * fabs(ccel*r);
         fs = meff*gammat*vrel;
         if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
         else ft = 0.0;
 
         // tangential force due to tangential velocity damping
 
         fs1 = -ft*vtr1;
         fs2 = -ft*vtr2;
         fs3 = -ft*vtr3;
 
         // forces & torques
 
         fx = delx*ccel + fs1;
         fy = dely*ccel + fs2;
         fz = delz*ccel + fs3;
         fxtmp  += fx;
         fytmp  += fy;
         fztmp  += fz;
 
         tor1 = rinv * (dely*fs3 - delz*fs2);
         tor2 = rinv * (delz*fs1 - delx*fs3);
         tor3 = rinv * (delx*fs2 - dely*fs1);
         t1tmp -= radi*tor1;
         t2tmp -= radi*tor2;
         t3tmp -= radi*tor3;
 
         if (NEWTON_PAIR || j < nlocal) {
           f[j][0] -= fx;
           f[j][1] -= fy;
           f[j][2] -= fz;
           torque[j][0] -= radj*tor1;
           torque[j][1] -= radj*tor2;
           torque[j][2] -= radj*tor3;
         }
 
         if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,
                                      0.0,0.0,fx,fy,fz,delx,dely,delz,thr);
 
       }
     }
     // add the force and torque accumulated over all neighbors of atom i
     f[i][0] += fxtmp;
     f[i][1] += fytmp;
     f[i][2] += fztmp;
     torque[i][0] += t1tmp;
     torque[i][1] += t2tmp;
     torque[i][2] += t3tmp;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairGranHookeOMP::memory_usage()
 {
   // per-thread storage first, then the storage reported by the base pair style
   const double thr_bytes = memory_usage_thr();
   const double base_bytes = PairGranHooke::memory_usage();
 
   return thr_bytes + base_bytes;
 }
diff --git a/src/USER-OMP/pppm_tip4p_cg_omp.cpp b/src/USER-OMP/pppm_tip4p_cg_omp.cpp
index 05229a380..4854aa84f 100644
--- a/src/USER-OMP/pppm_tip4p_cg_omp.cpp
+++ b/src/USER-OMP/pppm_tip4p_cg_omp.cpp
@@ -1,809 +1,808 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #include "pppm_tip4p_cg_omp.h"
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
 #include "error.h"
 #include "fix_omp.h"
 #include "force.h"
 #include "memory.h"
 #include "math_const.h"
 #include "math_special.h"
 
 #include <string.h>
 #include <math.h>
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
 using namespace MathSpecial;
 
 #ifdef FFT_SINGLE
 #define ZEROF 0.0f
 #else
 #define ZEROF 0.0
 #endif
 
 #define EPS_HOC 1.0e-7
 #define OFFSET 16384
 
 /* ---------------------------------------------------------------------- */
 
 PPPMTIP4PCGOMP::PPPMTIP4PCGOMP(LAMMPS *lmp, int narg, char **arg) :
   PPPMTIP4PCG(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
 {
   // add the OMP bit to the suffix flags
   suffix_flag = suffix_flag | Suffix::OMP;
 
   // clear the triclinic_support flag for this style
   triclinic_support = 0;
 }
 
 /* ----------------------------------------------------------------------
    allocate memory that depends on # of K-vectors and order
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::allocate()
 {
   // base class allocation first
   PPPMTIP4PCG::allocate();
 
   // then let every thread set up its own PPPM data for the current order
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     int my_tid = 0;
 #if defined(_OPENMP)
     my_tid = omp_get_thread_num();
 #endif
     fix->get_thr(my_tid)->init_pppm(order,memory);
   }
 }
 
 /* ----------------------------------------------------------------------
    free memory that depends on # of K-vectors and order
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::deallocate()
 {
   // base class deallocation first
   PPPMTIP4PCG::deallocate();
 
   // then call init_pppm() on every thread with the negated order
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     int my_tid = 0;
 #if defined(_OPENMP)
     my_tid = omp_get_thread_num();
 #endif
     fix->get_thr(my_tid)->init_pppm(-order,memory);
   }
 }
 
 /* ----------------------------------------------------------------------
    pre-compute modified (Hockney-Eastwood) Coulomb Green's function
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::compute_gf_ik()
 {
   // fills greensfn[n] for all nfft local FFT grid points
   // the points are split across threads via loop_setup_thr()
 
   // box lengths: prd for orthogonal boxes, prd_lamda otherwise
   const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
 
   const double xprd = prd[0];
   const double yprd = prd[1];
   const double zprd = prd[2];
   const double zprd_slab = zprd*slab_volfactor;
   const double unitkx = (MY_2PI/xprd);
   const double unitky = (MY_2PI/yprd);
   const double unitkz = (MY_2PI/zprd_slab);
 
   // nbx,nby,nbz = half-width of the image sum in each dimension (loops run -nb..nb)
   const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                                     pow(-log(EPS_HOC),0.25));
   const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                                     pow(-log(EPS_HOC),0.25));
   const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                                     pow(-log(EPS_HOC),0.25));
   // extent of the local FFT grid in x and y, used to unpack the linear index
   const int numk = nxhi_fft - nxlo_fft + 1;
   const int numl = nyhi_fft - nylo_fft + 1;
 
   const int twoorder = 2*order;
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     double snx,sny,snz;
     double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
     double sum1,dot1,dot2;
     double numerator,denominator;
     double sqk;
 
     int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;
 
     // nfrom..nto-1 = this thread's share of the nfft grid points
     loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
 
     for (n = nfrom; n < nto; ++n) {
       // unpack linear index n into grid indices (k fastest, m slowest)
       // and shift them by the lower bounds of the local FFT grid
       m = n / (numl*numk);
       l = (n - m*numl*numk) / numk;
       k = n - m*numl*numk - l*numk;
       m += nzlo_fft;
       l += nylo_fft;
       k += nxlo_fft;
 
       // NOTE(review): mper/lper/kper presumably wrap the grid index to a signed
       // wave number via integer division - confirm against the base PPPM class
       mper = m - nz_pppm*(2*m/nz_pppm);
       snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
 
       lper = l - ny_pppm*(2*l/ny_pppm);
       sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
 
       kper = k - nx_pppm*(2*k/nx_pppm);
       snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
 
       sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
 
       // the k = 0 term is set to zero below
       if (sqk != 0.0) {
         // NOTE(review): 12.5663706 looks like a truncated 4*pi; compute_gf_ad()
         // uses MY_4PI for the analogous numerator - confirm the literal is intended
         numerator = 12.5663706/sqk;
         denominator = gf_denom(snx,sny,snz);
         sum1 = 0.0;
 
         // sum over the images -nb..nb in each dimension
         for (nx = -nbx; nx <= nbx; nx++) {
           qx = unitkx*(kper+nx_pppm*nx);
           sx = exp(-0.25*square(qx/g_ewald));
           argx = 0.5*qx*xprd/nx_pppm;
           wx = powsinxx(argx,twoorder);
 
           for (ny = -nby; ny <= nby; ny++) {
             qy = unitky*(lper+ny_pppm*ny);
             sy = exp(-0.25*square(qy/g_ewald));
             argy = 0.5*qy*yprd/ny_pppm;
             wy = powsinxx(argy,twoorder);
 
             for (nz = -nbz; nz <= nbz; nz++) {
               qz = unitkz*(mper+nz_pppm*nz);
               sz = exp(-0.25*square(qz/g_ewald));
               argz = 0.5*qz*zprd_slab/nz_pppm;
               wz = powsinxx(argz,twoorder);
 
               dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
               dot2 = qx*qx+qy*qy+qz*qz;
               sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
             }
           }
         }
         greensfn[n] = numerator*sum1/denominator;
       } else greensfn[n] = 0.0;
     }
     thr->timer(Timer::KSPACE);
   } // end of parallel region
 }
 
 /* ----------------------------------------------------------------------
    compute optimized Green's function for energy calculation
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::compute_gf_ad()
 {
 
   const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
 
   const double xprd = prd[0];
   const double yprd = prd[1];
   const double zprd = prd[2];
   const double zprd_slab = zprd*slab_volfactor;
   const double unitkx = (MY_2PI/xprd);
   const double unitky = (MY_2PI/yprd);
   const double unitkz = (MY_2PI/zprd_slab);
 
   const int numk = nxhi_fft - nxlo_fft + 1;
   const int numl = nyhi_fft - nylo_fft + 1;
 
   const int twoorder = 2*order;
   double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
 #endif
   {
     double snx,sny,snz,sqk;
     double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
     double numerator,denominator;
     int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
 
     loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
 
     for (n = nfrom; n < nto; ++n) {
 
       m = n / (numl*numk);
       l = (n - m*numl*numk) / numk;
       k = n - m*numl*numk - l*numk;
       m += nzlo_fft;
       l += nylo_fft;
       k += nxlo_fft;
 
       mper = m - nz_pppm*(2*m/nz_pppm);
       qz = unitkz*mper;
       snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
       sz = exp(-0.25*square(qz/g_ewald));
       argz = 0.5*qz*zprd_slab/nz_pppm;
       wz = powsinxx(argz,twoorder);
 
       lper = l - ny_pppm*(2*l/ny_pppm);
       qy = unitky*lper;
       sny = square(sin(0.5*qy*yprd/ny_pppm));
       sy = exp(-0.25*square(qy/g_ewald));
       argy = 0.5*qy*yprd/ny_pppm;
       wy = powsinxx(argy,twoorder);
 
       kper = k - nx_pppm*(2*k/nx_pppm);
       qx = unitkx*kper;
       snx = square(sin(0.5*qx*xprd/nx_pppm));
       sx = exp(-0.25*square(qx/g_ewald));
       argx = 0.5*qx*xprd/nx_pppm;
       wx = powsinxx(argx,twoorder);
 
       sqk = qx*qx + qy*qy + qz*qz;
 
       if (sqk != 0.0) {
         numerator = MY_4PI/sqk;
         denominator = gf_denom(snx,sny,snz);
         greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
         sf0 += sf_precoeff1[n]*greensfn[n];
         sf1 += sf_precoeff2[n]*greensfn[n];
         sf2 += sf_precoeff3[n]*greensfn[n];
         sf3 += sf_precoeff4[n]*greensfn[n];
         sf4 += sf_precoeff5[n]*greensfn[n];
         sf5 += sf_precoeff6[n]*greensfn[n];
       } else {
         greensfn[n] = 0.0;
         sf0 += sf_precoeff1[n]*greensfn[n];
         sf1 += sf_precoeff2[n]*greensfn[n];
         sf2 += sf_precoeff3[n]*greensfn[n];
         sf3 += sf_precoeff4[n]*greensfn[n];
         sf4 += sf_precoeff5[n]*greensfn[n];
         sf5 += sf_precoeff6[n]*greensfn[n];
       }
     }
     thr->timer(Timer::KSPACE);
   } // end of paralle region
 
   // compute the coefficients for the self-force correction
 
   double prex, prey, prez, tmp[6];
   prex = prey = prez = MY_PI/volume;
   prex *= nx_pppm/xprd;
   prey *= ny_pppm/yprd;
   prez *= nz_pppm/zprd_slab;
   tmp[0] = sf0 * prex;
   tmp[1] = sf1 * prex*2;
   tmp[2] = sf2 * prey;
   tmp[3] = sf3 * prey*2;
   tmp[4] = sf4 * prez;
   tmp[5] = sf5 * prez*2;
 
   // communicate values with other procs
 
   MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
 }
 
 /* ----------------------------------------------------------------------
    run the regular toplevel compute method from plain PPPM
    which will have individual methods replaced by our threaded
    versions and then call the obligatory force reduction.
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::compute(int eflag, int vflag)
 {
 
   PPPMTIP4PCG::compute(eflag,vflag);
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none) shared(eflag,vflag)
 #endif
   {
 #if defined(_OPENMP)
     const int tid = omp_get_thread_num();
 #else
     const int tid = 0;
 #endif
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
     reduce_thr(this, eflag, vflag, thr);
   } // end of omp parallel region
 }
 
 /* ----------------------------------------------------------------------
    find center grid pt for each of my charged particles
    for O atoms (type typeO) the position returned by find_M_thr() is used
    check that full stencil for the particle will fit in my 3d brick
    store central grid pt indices in part2grid array
    the out-of-range count is summed over all procs, so every proc
    takes part in the MPI_Allreduce() even when it has no charged atoms
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::particle_map()
 {
   // number of my particles whose stencil leaves the local brick
 
   int flag = 0;
 
   // no charged atoms => skip the mapping, but do NOT return early:
   // the MPI_Allreduce() below is collective and an early return on
   // one proc would leave the remaining procs with a mismatched call
 
   if (num_charged > 0) {
     const int * _noalias const type = atom->type;
     const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
     int3_t * _noalias const p2g = (int3_t *) part2grid[0];
     const double boxlox = boxlo[0];
     const double boxloy = boxlo[1];
     const double boxloz = boxlo[2];
-  const int nlocal = atom->nlocal;
 
     // the loop index is declared in the for statement and is
     // therefore private without an explicit private() clause
 
 #if defined(_OPENMP)
 #pragma omp parallel for default(none) reduction(+:flag) schedule(static)
 #endif
     for (int j = 0; j < num_charged; j++) {
       const int i = is_charged[j];
 
       dbl3_t xM;
       int iH1,iH2;
 
       if (type[i] == typeO) {
         find_M_thr(i,iH1,iH2,xM);
       } else {
         xM = x[i];
       }
 
       // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
       // current particle coord can be outside global and local box
       // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
 
       const int nx = static_cast<int> ((xM.x-boxlox)*delxinv+shift) - OFFSET;
       const int ny = static_cast<int> ((xM.y-boxloy)*delyinv+shift) - OFFSET;
       const int nz = static_cast<int> ((xM.z-boxloz)*delzinv+shift) - OFFSET;
 
       p2g[i].a = nx;
       p2g[i].b = ny;
       p2g[i].t = nz;
 
       // check that entire stencil around nx,ny,nz will fit in my 3d brick
 
       if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
           ny+nlower < nylo_out || ny+nupper > nyhi_out ||
           nz+nlower < nzlo_out || nz+nupper > nzhi_out)
         flag++;
     }
   }
 
   // collective error check, reached by all procs
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
   if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM");
 }
 
 /* ----------------------------------------------------------------------
    create discretized "density" on section of global grid due to my particles
    density(x,y,z) = charge "density" at grid points of my 3d brick
    (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
    in global grid
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::make_rho()
 {
 
   // clear 3d density array
 
   FFT_SCALAR * _noalias const d = &(density_brick[nzlo_out][nylo_out][nxlo_out]);
   memset(d,0,ngrid*sizeof(FFT_SCALAR));
 
   // no charged atoms => nothing else to do
 
   if (num_charged == 0) return;
 
   const int ix = nxhi_out - nxlo_out + 1;
   const int iy = nyhi_out - nylo_out + 1;
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     const double * _noalias const q = atom->q;
     const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
     const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
     const int * _noalias const type = atom->type;
     dbl3_t xM;
 
     const double boxlox = boxlo[0];
     const double boxloy = boxlo[1];
     const double boxloz = boxlo[2];
 
     // determine range of grid points handled by this thread
     int i,j,jfrom,jto,tid,iH1,iH2;
     loop_setup_thr(jfrom,jto,tid,ngrid,comm->nthreads);
 
     // get per thread data
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
     FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
 
     // loop over my charges, add their contribution to nearby grid points
     // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
     // (dx,dy,dz) = distance to "lower left" grid pt
 
     // loop over all charged atoms for all threads
     for (j = 0; j < num_charged; j++) {
       i = is_charged[j];
 
       const int nx = p2g[i].a;
       const int ny = p2g[i].b;
       const int nz = p2g[i].t;
 
       // pre-screen whether this atom will ever come within
       // reach of the data segement this thread is updating.
       if ( ((nz+nlower-nzlo_out)*ix*iy >= jto)
            || ((nz+nupper-nzlo_out+1)*ix*iy < jfrom) ) continue;
 
       if (type[i] == typeO) {
         find_M_thr(i,iH1,iH2,xM);
       } else {
         xM = x[i];
       }
       const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
       const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
       const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
 
       compute_rho1d_thr(r1d,dx,dy,dz);
 
       const FFT_SCALAR z0 = delvolinv * q[i];
 
       for (int n = nlower; n <= nupper; ++n) {
         const int jn = (nz+n-nzlo_out)*ix*iy;
         const FFT_SCALAR y0 = z0*r1d[2][n];
 
         for (int m = nlower; m <= nupper; ++m) {
           const int jm = jn+(ny+m-nylo_out)*ix;
           const FFT_SCALAR x0 = y0*r1d[1][m];
 
           for (int l = nlower; l <= nupper; ++l) {
             const int jl = jm+nx+l-nxlo_out;
             // make sure each thread only updates
             // "his" elements of the density grid
             if (jl >= jto) break;
             if (jl < jfrom) continue;
 
             d[jl] += x0*r1d[0][l];
           }
         }
       }
     }
     thr->timer(Timer::KSPACE);
   }
 }
 
 /* ----------------------------------------------------------------------
    interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::fieldforce_ik()
 {
   const int nthreads = comm->nthreads;
 
   // no local atoms => nothing to do
 
   if (num_charged == 0) return;
 
   // loop over my charges, interpolate electric field from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle
 
   const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
   const double * _noalias const q = atom->q;
   const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
   const int * _noalias const type = atom->type;
 
   const double qqrd2e = force->qqrd2e;
   const double boxlox = boxlo[0];
   const double boxloy = boxlo[1];
   const double boxloz = boxlo[2];
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     dbl3_t xM;
     FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
     int i,j,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
 
     loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);
 
     // get per thread data
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
     dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
     FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
 
     for (j = ifrom; j < ito; ++j) {
       i = is_charged[j];
       if (type[i] == typeO) {
         find_M_thr(i,iH1,iH2,xM);
       } else xM = x[i];
 
       const int nx = p2g[i].a;
       const int ny = p2g[i].b;
       const int nz = p2g[i].t;
       const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
       const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
       const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
 
       compute_rho1d_thr(r1d,dx,dy,dz);
 
       ekx = eky = ekz = ZEROF;
       for (n = nlower; n <= nupper; n++) {
         mz = n+nz;
         z0 = r1d[2][n];
         for (m = nlower; m <= nupper; m++) {
           my = m+ny;
           y0 = z0*r1d[1][m];
           for (l = nlower; l <= nupper; l++) {
             mx = l+nx;
             x0 = y0*r1d[0][l];
             ekx -= x0*vdx_brick[mz][my][mx];
             eky -= x0*vdy_brick[mz][my][mx];
             ekz -= x0*vdz_brick[mz][my][mx];
           }
         }
       }
 
       // convert E-field to force
 
       const double qfactor = qqrd2e * scale * q[i];
       if (type[i] != typeO) {
         f[i].x += qfactor*ekx;
         f[i].y += qfactor*eky;
         if (slabflag != 2) f[i].z += qfactor*ekz;
 
       } else {
         const double fx = qfactor * ekx;
         const double fy = qfactor * eky;
         const double fz = qfactor * ekz;
 
         f[i].x += fx*(1 - alpha);
         f[i].y += fy*(1 - alpha);
         if (slabflag != 2) f[i].z += fz*(1 - alpha);
 
         f[iH1].x += 0.5*alpha*fx;
         f[iH1].y += 0.5*alpha*fy;
         if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;
 
         f[iH2].x += 0.5*alpha*fx;
         f[iH2].y += 0.5*alpha*fy;
         if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
       }
     }
     thr->timer(Timer::KSPACE);
   } // end of parallel region
 }
 
 /* ----------------------------------------------------------------------
    interpolate from grid to get electric field & force on my particles for ad
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::fieldforce_ad()
 {
   const int nthreads = comm->nthreads;
 
   // no local atoms => nothing to do
 
   if (num_charged == 0) return;
 
   const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
   const double hx_inv = nx_pppm/prd[0];
   const double hy_inv = ny_pppm/prd[1];
   const double hz_inv = nz_pppm/prd[2];
 
   // loop over my charges, interpolate electric field from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle
 
   const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
   const double * _noalias const q = atom->q;
   const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
   const int * _noalias const type = atom->type;
 
   const double qqrd2e = force->qqrd2e;
   const double boxlox = boxlo[0];
   const double boxloy = boxlo[1];
   const double boxloz = boxlo[2];
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     double s1,s2,s3,sf;
     dbl3_t xM;
     FFT_SCALAR ekx,eky,ekz;
     int i,j,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
 
     loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);
 
     // get per thread data
     ThrData *thr = fix->get_thr(tid);
     thr->timer(Timer::START);
     dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
     FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
     FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
 
     for (j = ifrom; j < ito; ++j) {
       i = is_charged[j];
       if (type[i] == typeO) {
         find_M_thr(i,iH1,iH2,xM);
       } else xM = x[i];
 
       const int nx = p2g[i].a;
       const int ny = p2g[i].b;
       const int nz = p2g[i].t;
       const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
       const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
       const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
 
       compute_rho1d_thr(r1d,dx,dy,dz);
       compute_drho1d_thr(d1d,dx,dy,dz);
 
       ekx = eky = ekz = ZEROF;
       for (n = nlower; n <= nupper; n++) {
         mz = n+nz;
         for (m = nlower; m <= nupper; m++) {
           my = m+ny;
           for (l = nlower; l <= nupper; l++) {
             mx = l+nx;
             ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
             eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
             ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
           }
         }
       }
       ekx *= hx_inv;
       eky *= hy_inv;
       ekz *= hz_inv;
 
       // convert E-field to force and substract self forces
 
       const double qi = q[i];
       const double qfactor = qqrd2e * scale * qi;
 
       s1 = x[i].x*hx_inv;
       sf = sf_coeff[0]*sin(MY_2PI*s1);
       sf += sf_coeff[1]*sin(MY_4PI*s1);
       sf *= 2.0*qi;
       const double fx = qfactor*(ekx - sf);
 
       s2 = x[i].y*hy_inv;
       sf = sf_coeff[2]*sin(MY_2PI*s2);
       sf += sf_coeff[3]*sin(MY_4PI*s2);
       sf *= 2.0*qi;
       const double fy = qfactor*(eky - sf);
 
       s3 = x[i].z*hz_inv;
       sf = sf_coeff[4]*sin(MY_2PI*s3);
       sf += sf_coeff[5]*sin(MY_4PI*s3);
       sf *= 2.0*qi;
       const double fz = qfactor*(ekz - sf);
 
       if (type[i] != typeO) {
         f[i].x += fx;
         f[i].y += fy;
         if (slabflag != 2) f[i].z += fz;
 
       } else {
         f[i].x += fx*(1 - alpha);
         f[i].y += fy*(1 - alpha);
         if (slabflag != 2) f[i].z += fz*(1 - alpha);
 
         f[iH1].x += 0.5*alpha*fx;
         f[iH1].y += 0.5*alpha*fy;
         if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;
 
         f[iH2].x += 0.5*alpha*fx;
         f[iH2].y += 0.5*alpha*fy;
         if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
       }
     }
     thr->timer(Timer::KSPACE);
   } // end of parallel region
 }
 
 /* ----------------------------------------------------------------------
   find 2 H atoms bonded to O atom i
   compute position xM of fictitious charge site for O atom
   also return local indices iH1,iH2 of H atoms
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::find_M_thr(int i, int &iH1, int &iH2, dbl3_t &xM)
 {
   iH1 = atom->map(atom->tag[i] + 1);
   iH2 = atom->map(atom->tag[i] + 2);
 
   if (iH1 == -1 || iH2 == -1) error->one(FLERR,"TIP4P hydrogen is missing");
   if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
     error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
 
   const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
 
   double delx1 = x[iH1].x - x[i].x;
   double dely1 = x[iH1].y - x[i].y;
   double delz1 = x[iH1].z - x[i].z;
   domain->minimum_image(delx1,dely1,delz1);
 
   double delx2 = x[iH2].x - x[i].x;
   double dely2 = x[iH2].y - x[i].y;
   double delz2 = x[iH2].z - x[i].z;
   domain->minimum_image(delx2,dely2,delz2);
 
   xM.x = x[i].x + alpha * 0.5 * (delx1 + delx2);
   xM.y = x[i].y + alpha * 0.5 * (dely1 + dely2);
   xM.z = x[i].z + alpha * 0.5 * (delz1 + delz2);
 }
 
 
 /* ----------------------------------------------------------------------
    charge assignment into rho1d
    dx,dy,dz = distance of particle from "lower left" grid point
    for every stencil offset k in (1-order)/2 ... order/2 the polynomial
    with coefficients rho_coeff[0..order-1][k] is evaluated at dx, dy
    and dz; the results go to r1d[0][k], r1d[1][k] and r1d[2][k]
 ------------------------------------------------------------------------- */
 void PPPMTIP4PCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
                                 const FFT_SCALAR &dy, const FFT_SCALAR &dz)
 {
   const int kfirst = (1-order)/2;
   const int klast = order/2;
 
   for (int k = kfirst; k <= klast; ++k) {
     FFT_SCALAR px = ZEROF;
     FFT_SCALAR py = ZEROF;
     FFT_SCALAR pz = ZEROF;
 
     // Horner scheme, starting from the highest-order coefficient
 
     for (int l = order-1; l >= 0; --l) {
       const FFT_SCALAR coeff = rho_coeff[l][k];
       px = coeff + px*dx;
       py = coeff + py*dy;
       pz = coeff + pz*dz;
     }
 
     r1d[0][k] = px;
     r1d[1][k] = py;
     r1d[2][k] = pz;
   }
 }
 
 /* ----------------------------------------------------------------------
    charge assignment into drho1d
    dx,dy,dz = distance of particle from "lower left" grid point
    for every stencil offset k in (1-order)/2 ... order/2 the polynomial
    with coefficients drho_coeff[0..order-2][k] is evaluated at dx, dy
    and dz; the results go to d1d[0][k], d1d[1][k] and d1d[2][k]
 ------------------------------------------------------------------------- */
 
 void PPPMTIP4PCGOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
                                  const FFT_SCALAR &dy, const FFT_SCALAR &dz)
 {
   const int kfirst = (1-order)/2;
   const int klast = order/2;
 
   for (int k = kfirst; k <= klast; ++k) {
     FFT_SCALAR px = ZEROF;
     FFT_SCALAR py = ZEROF;
     FFT_SCALAR pz = ZEROF;
 
     // Horner scheme, starting from the highest-order coefficient;
     // there is one coefficient fewer than in compute_rho1d_thr()
 
     for (int l = order-2; l >= 0; --l) {
       const FFT_SCALAR coeff = drho_coeff[l][k];
       px = coeff + px*dx;
       py = coeff + py*dy;
       pz = coeff + pz*dz;
     }
 
     d1d[0][k] = px;
     d1d[1][k] = py;
     d1d[2][k] = pz;
   }
 }