diff --git a/src/Depend.sh b/src/Depend.sh
index 3786172d6..632aa4a3e 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -1,110 +1,113 @@
 # Depend.sh = Install/unInstall files due to package dependencies
 # this script is invoked after any package is installed/uninstalled
 
 # all parent/child package dependencies should be listed below
 # parent package = has files that files in another package derive from
 # child package = has files that derive from files in another package
 
 # update child packages that depend on the parent,
 #   but only if the child package is already installed
 # this is necessary to ensure the child package installs
 #   only child files whose parent package files are now installed
 # decisions on (un)installing individual child files are made by
 #   the Install.sh script in the child package
 
 # depend function: arg = child-package
 # checks if child-package is installed, if not just return
 # otherwise invoke update of child package via its Install.sh
 
 depend () {
   # $1 = child package directory, relative to the current directory
   # if the directory cannot be entered there is nothing to update;
   #   return right away so the "cd .." below cannot leave the
   #   starting directory
   cd "$1" || return 0
 
   # the child counts as installed if any of its *.cpp or *.h files
   #   also exists one directory up
   installed=0
   for file in *.cpp *.h; do
     if (test -e "../$file") then
       installed=1
     fi
   done
 
   cd ..
   if (test $installed = 0) then
     return
   fi
 
   # use the package's own Install.sh if it has one, otherwise the
   #   Install.sh one directory up; both are invoked with argument 2
   echo "  updating package $1"
   if (test -e "$1/Install.sh") then
     cd "$1"; /bin/sh Install.sh 2; cd ..
   else
     cd "$1"; /bin/sh ../Install.sh 2; cd ..
   fi
 }
 
 # add one if statement per parent package
 # add one depend() call per child package that depends on that parent
 
 # $1 = name of the parent package this script was invoked for
 # each block below calls depend() once per child package of that parent
 
 if (test $1 = "ASPHERE") then
   depend GPU
   depend USER-OMP
+  depend USER-INTEL
 fi
 
 if (test $1 = "CLASS2") then
   depend GPU
   depend USER-CUDA
   depend USER-OMP
 fi
 
 if (test $1 = "COLLOID") then
   depend GPU
   depend USER-OMP
 fi
 
 if (test $1 = "DIPOLE") then
   depend USER-MISC
   depend USER-OMP
 fi
 
 if (test $1 = "GRANULAR") then
   depend USER-CUDA
   depend USER-OMP
 fi
 
 if (test $1 = "KSPACE") then
   depend GPU
   depend OPT
   depend USER-CUDA
   depend USER-OMP
+  depend USER-INTEL
   depend USER-PHONON
 fi
 
 if (test $1 = "MANYBODY") then
   depend GPU
   depend OPT
   depend USER-CUDA
   depend USER-MISC
   depend USER-OMP
 fi
 
 if (test $1 = "MOLECULE") then
   depend GPU
   depend USER-CUDA
   depend USER-MISC
   depend USER-OMP
+  depend USER-INTEL
 fi
 
 if (test $1 = "PERI") then
   depend USER-OMP
 fi
 
 if (test $1 = "RIGID") then
   depend USER-OMP
 fi
 
 if (test $1 = "USER-CG-CMM") then
   depend GPU
   depend USER-CUDA
   depend USER-OMP
 fi
 
 if (test $1 = "USER-MISC") then
   depend GPU
   depend USER-OMP
 fi
diff --git a/src/GRANULAR/pair_gran_hooke_history.cpp b/src/GRANULAR/pair_gran_hooke_history.cpp
index 77d2f0d70..7f98cfa4e 100644
--- a/src/GRANULAR/pair_gran_hooke_history.cpp
+++ b/src/GRANULAR/pair_gran_hooke_history.cpp
@@ -1,808 +1,806 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Leo Silbert (SNL), Gary Grest (SNL)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_gran_hooke_history.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "domain.h"
 #include "force.h"
 #include "update.h"
 #include "modify.h"
 #include "fix.h"
 #include "fix_shear_history.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairGranHookeHistory::PairGranHookeHistory(LAMMPS *lmp) : Pair(lmp)
 {
   // set default flags; the shear-history fix is created later,
   // in init_style(), so fix_history starts out NULL
 
   single_enable = 1;
   no_virial_fdotr_compute = 1;
   history = 1;
   fix_history = NULL;
-  suffix = NULL;
 
   // single() reports 4 extra values through svector[0..3]
 
   single_extra = 4;
   svector = new double[4];
 
   computeflag = 0;
   neighprev = 0;
 
   // mass_rigid is allocated on demand in compute()
 
   nmax = 0;
   mass_rigid = NULL;
 
   // set comm size needed by this Pair if used with fix rigid
 
   comm_forward = 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairGranHookeHistory::~PairGranHookeHistory()
 {
   // release svector and, if init_style() created it, the
   // SHEAR_HISTORY fix
 
   delete [] svector;
   if (fix_history) modify->delete_fix("SHEAR_HISTORY");
-  if (suffix) delete[] suffix;
 
   // per-type arrays exist only if allocate() was called
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
 
     delete [] onerad_dynamic;
     delete [] onerad_frozen;
     delete [] maxrad_dynamic;
     delete [] maxrad_frozen;
   }
 
   memory->destroy(mass_rigid);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGranHookeHistory::compute(int eflag, int vflag)
 {
   // accumulate forces and torques for every neighbor pair whose
   // separation is below the sum of the two radii, and update the
   // per-pair touch flag and shear displacement kept in listgranhistory
 
   int i,j,ii,jj,inum,jnum;
   double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz;
   double radi,radj,radsum,rsq,r,rinv,rsqinv;
   double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
   double wr1,wr2,wr3;
   double vtr1,vtr2,vtr3,vrel;
   double mi,mj,meff,damp,ccel,tor1,tor2,tor3;
   double fn,fs,fs1,fs2,fs3;
   double shrmag,rsht;
   int *ilist,*jlist,*numneigh,**firstneigh;
   int *touch,**firsttouch;
   double *shear,*allshear,**firstshear;
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   // shear displacements are not advanced while update->setupflag is set
 
   computeflag = 1;
   int shearupdate = 1;
   if (update->setupflag) shearupdate = 0;
 
   // update rigid body info for owned & ghost atoms if using FixRigid masses
   // body[i] = which body atom I is in, -1 if none
   // mass_body = mass of each rigid body
 
   if (fix_rigid && neighbor->ago == 0) {
     int tmp;
     int *body = (int *) fix_rigid->extract("body",tmp);
     double *mass_body = (double *) fix_rigid->extract("masstotal",tmp);
     if (atom->nmax > nmax) {
       memory->destroy(mass_rigid);
       nmax = atom->nmax;
       memory->create(mass_rigid,nmax,"pair:mass_rigid");
     }
     int nlocal = atom->nlocal;
     for (i = 0; i < nlocal; i++)
       if (body[i] >= 0) mass_rigid[i] = mass_body[body[i]];
       else mass_rigid[i] = 0.0;
     comm->forward_comm_pair(this);
   }
 
   double **x = atom->x;
   double **v = atom->v;
   double **f = atom->f;
   double **omega = atom->omega;
   double **torque = atom->torque;
   double *radius = atom->radius;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   int *type = atom->type;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
 
   // the history list is indexed like the regular list:
   // entry jj of touch / 3*jj of allshear belongs to neighbor jj of atom i
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
   firsttouch = listgranhistory->firstneigh;
   firstshear = listgranhistory->firstdouble;
 
   // loop over neighbors of my atoms
 
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     radi = radius[i];
     touch = firsttouch[i];
     allshear = firstshear[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       radj = radius[j];
       radsum = radi + radj;
 
       if (rsq >= radsum*radsum) {
 
         // unset non-touching neighbors
 
         touch[jj] = 0;
         shear = &allshear[3*jj];
         shear[0] = 0.0;
         shear[1] = 0.0;
         shear[2] = 0.0;
 
       } else {
         r = sqrt(rsq);
         rinv = 1.0/r;
         rsqinv = 1.0/rsq;
 
         // relative translational velocity
 
         vr1 = v[i][0] - v[j][0];
         vr2 = v[i][1] - v[j][1];
         vr3 = v[i][2] - v[j][2];
 
         // normal component
 
         vnnr = vr1*delx + vr2*dely + vr3*delz;
         vn1 = delx*vnnr * rsqinv;
         vn2 = dely*vnnr * rsqinv;
         vn3 = delz*vnnr * rsqinv;
 
         // tangential component
 
         vt1 = vr1 - vn1;
         vt2 = vr2 - vn2;
         vt3 = vr3 - vn3;
 
         // relative rotational velocity
 
         wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
         wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
         wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
 
         // meff = effective mass of pair of particles
         // if I or J part of rigid body, use body mass
         // if I or J is frozen, meff is other particle
 
         if (rmass) {
           mi = rmass[i];
           mj = rmass[j];
         } else {
           mi = mass[type[i]];
           mj = mass[type[j]];
         }
         if (fix_rigid) {
           if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
           if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
         }
 
         meff = mi*mj / (mi+mj);
         if (mask[i] & freeze_group_bit) meff = mj;
         if (mask[j] & freeze_group_bit) meff = mi;
 
         // normal forces = Hookean contact + normal velocity damping
 
         damp = meff*gamman*vnnr*rsqinv;
         ccel = kn*(radsum-r)*rinv - damp;
 
         // relative velocities
         // vrel is computed here but not read again in this function
 
         vtr1 = vt1 - (delz*wr2-dely*wr3);
         vtr2 = vt2 - (delx*wr3-delz*wr1);
         vtr3 = vt3 - (dely*wr1-delx*wr2);
         vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
         vrel = sqrt(vrel);
 
         // shear history effects
 
         touch[jj] = 1;
         shear = &allshear[3*jj];
 
         if (shearupdate) {
           shear[0] += vtr1*dt;
           shear[1] += vtr2*dt;
           shear[2] += vtr3*dt;
         }
         shrmag = sqrt(shear[0]*shear[0] + shear[1]*shear[1] +
                       shear[2]*shear[2]);
 
         // rotate shear displacements
         // i.e. remove the component of shear along the I-J separation
 
         rsht = shear[0]*delx + shear[1]*dely + shear[2]*delz;
         rsht *= rsqinv;
         if (shearupdate) {
           shear[0] -= rsht*delx;
           shear[1] -= rsht*dely;
           shear[2] -= rsht*delz;
         }
 
         // tangential forces = shear + tangential velocity damping
 
         fs1 = - (kt*shear[0] + meff*gammat*vtr1);
         fs2 = - (kt*shear[1] + meff*gammat*vtr2);
         fs3 = - (kt*shear[2] + meff*gammat*vtr3);
 
         // rescale frictional displacements and forces if needed
         // NOTE(review): the rescale divides by kt, and settings() only
         //   rejects kt < 0.0, so kt == 0.0 can reach this point - confirm
         // NOTE(review): this branch rewrites shear[] even when
         //   shearupdate is 0 - confirm that is intended
 
         fs = sqrt(fs1*fs1 + fs2*fs2 + fs3*fs3);
         fn = xmu * fabs(ccel*r);
 
         if (fs > fn) {
           if (shrmag != 0.0) {
             shear[0] = (fn/fs) * (shear[0] + meff*gammat*vtr1/kt) -
               meff*gammat*vtr1/kt;
             shear[1] = (fn/fs) * (shear[1] + meff*gammat*vtr2/kt) -
               meff*gammat*vtr2/kt;
             shear[2] = (fn/fs) * (shear[2] + meff*gammat*vtr3/kt) -
               meff*gammat*vtr3/kt;
             fs1 *= fn/fs;
             fs2 *= fn/fs;
             fs3 *= fn/fs;
           } else fs1 = fs2 = fs3 = 0.0;
         }
 
         // forces & torques
 
         fx = delx*ccel + fs1;
         fy = dely*ccel + fs2;
         fz = delz*ccel + fs3;
         f[i][0] += fx;
         f[i][1] += fy;
         f[i][2] += fz;
 
         tor1 = rinv * (dely*fs3 - delz*fs2);
         tor2 = rinv * (delz*fs1 - delx*fs3);
         tor3 = rinv * (delx*fs2 - dely*fs1);
         torque[i][0] -= radi*tor1;
         torque[i][1] -= radi*tor2;
         torque[i][2] -= radi*tor3;
 
         // the reaction on j is applied only when j is an owned atom
 
         if (j < nlocal) {
           f[j][0] -= fx;
           f[j][1] -= fy;
           f[j][2] -= fz;
           torque[j][0] -= radj*tor1;
           torque[j][1] -= radj*tor2;
           torque[j][2] -= radj*tor3;
         }
 
         if (evflag) ev_tally_xyz(i,j,nlocal,0,
                                  0.0,0.0,fx,fy,fz,delx,dely,delz);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::allocate()
 {
   // every per-type array gets ntypes+1 entries so that types can be
   // used directly as indices starting at 1
 
   const int ntypes = atom->ntypes;
   const int len = ntypes + 1;
   allocated = 1;
 
   // setflag: only the i <= j entries are cleared here
 
   memory->create(setflag,len,len,"pair:setflag");
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       setflag[itype][jtype] = 0;
     }
   }
 
   memory->create(cutsq,len,len,"pair:cutsq");
 
   // per-type radii, filled in by init_style()
 
   onerad_dynamic = new double[len];
   onerad_frozen = new double[len];
   maxrad_dynamic = new double[len];
   maxrad_frozen = new double[len];
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::settings(int narg, char **arg)
 {
   // arguments: kn kt gamman gammat xmu dampflag
   // kt and gammat may be given as the literal string NULL to request
   // a value derived from kn and gamman respectively
 
   if (narg != 6) error->all(FLERR,"Illegal pair_style command");
 
   const bool default_kt = (strcmp(arg[1],"NULL") == 0);
   const bool default_gammat = (strcmp(arg[3],"NULL") == 0);
 
   kn = force->numeric(FLERR,arg[0]);
   if (default_kt) {
     kt = kn * 2.0/7.0;
   } else {
     kt = force->numeric(FLERR,arg[1]);
   }
 
   gamman = force->numeric(FLERR,arg[2]);
   if (default_gammat) {
     gammat = 0.5 * gamman;
   } else {
     gammat = force->numeric(FLERR,arg[3]);
   }
 
   xmu = force->numeric(FLERR,arg[4]);
   dampflag = force->inumeric(FLERR,arg[5]);
 
   // dampflag == 0 switches tangential damping off
 
   if (dampflag == 0) gammat = 0.0;
 
   // range checks on everything that was read
 
   const bool bad_stiffness = kn < 0.0 || kt < 0.0;
   const bool bad_damping = gamman < 0.0 || gammat < 0.0;
   const bool bad_friction = xmu < 0.0 || xmu > 10000.0;
   const bool bad_flag = dampflag < 0 || dampflag > 1;
 
   if (bad_stiffness || bad_damping || bad_friction || bad_flag)
     error->all(FLERR,"Illegal pair_style command");
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::coeff(int narg, char **arg)
 {
   if (narg > 2) error->all(FLERR,"Incorrect args for pair coefficients");
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       setflag[i][j] = 1;
       count++;
     }
   }
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::init_style()
 {
   // validate prerequisites, request neighbor lists, create the
   // SHEAR_HISTORY fix on first use, locate related fixes, and compute
   // the per-type maximum radii used by init_one()
 
   int i;
 
   // error and warning checks
 
   if (!atom->sphere_flag)
     error->all(FLERR,"Pair granular requires atom style sphere");
   if (comm->ghost_velocity == 0)
     error->all(FLERR,"Pair granular requires ghost atoms store velocity");
 
   // need a granular neigh list and optionally a granular history neigh list
   // the history request uses id = 1, which init_list() maps to
   // listgranhistory, and dnum = 3 doubles per neighbor (the shear vector)
 
   int irequest = neighbor->request(this);
   neighbor->requests[irequest]->half = 0;
   neighbor->requests[irequest]->gran = 1;
   if (history) {
     irequest = neighbor->request(this);
     neighbor->requests[irequest]->id = 1;
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->granhistory = 1;
     neighbor->requests[irequest]->dnum = 3;
   }
 
   dt = update->dt;
 
   // if shear history is stored:
   // check if newton flag is valid
   // if first init, create Fix needed for storing shear history
 
   if (history && force->newton_pair == 1)
     error->all(FLERR,
                "Pair granular with shear history requires newton pair off");
 
   // NOTE(review): the lookup below assumes add_fix() places the new
   //   fix at the end of modify->fix - confirm
 
   if (history && fix_history == NULL) {
     char **fixarg = new char*[3];
     fixarg[0] = (char *) "SHEAR_HISTORY";
     fixarg[1] = (char *) "all";
     fixarg[2] = (char *) "SHEAR_HISTORY";
-    modify->add_fix(3,fixarg,suffix);
+    modify->add_fix(3,fixarg,1);
     delete [] fixarg;
     fix_history = (FixShearHistory *) modify->fix[modify->nfix-1];
     fix_history->pair = this;
   }
 
   // check for FixFreeze and set freeze_group_bit
 
   for (i = 0; i < modify->nfix; i++)
     if (strcmp(modify->fix[i]->style,"freeze") == 0) break;
   if (i < modify->nfix) freeze_group_bit = modify->fix[i]->groupbit;
   else freeze_group_bit = 0;
 
   // check for FixRigid so can extract rigid body masses
 
   fix_rigid = NULL;
   for (i = 0; i < modify->nfix; i++)
     if (modify->fix[i]->rigid_flag) break;
   if (i < modify->nfix) fix_rigid = modify->fix[i];
 
   // check for FixPour and FixDeposit so can extract particle radii
   // only the first fix of each style is used; -1 means none found
 
   int ipour;
   for (ipour = 0; ipour < modify->nfix; ipour++)
     if (strcmp(modify->fix[ipour]->style,"pour") == 0) break;
   if (ipour == modify->nfix) ipour = -1;
 
   int idep;
   for (idep = 0; idep < modify->nfix; idep++)
     if (strcmp(modify->fix[idep]->style,"deposit") == 0) break;
   if (idep == modify->nfix) idep = -1;
 
   // set maxrad_dynamic and maxrad_frozen for each type
   // include future FixPour and FixDeposit particles as dynamic
   // NOTE(review): when both a pour and a deposit fix exist, the deposit
   //   radius overwrites the pour radius instead of taking the larger
   //   of the two - confirm that is intended
 
   int itype;
   for (i = 1; i <= atom->ntypes; i++) {
     onerad_dynamic[i] = onerad_frozen[i] = 0.0;
     if (ipour >= 0) {
       itype = i;
       onerad_dynamic[i] = 
         *((double *) modify->fix[ipour]->extract("radius",itype));
     }
     if (idep >= 0) {
       itype = i;
       onerad_dynamic[i] = 
         *((double *) modify->fix[idep]->extract("radius",itype));
     }
   }
 
   double *radius = atom->radius;
   int *mask = atom->mask;
   int *type = atom->type;
   int nlocal = atom->nlocal;
 
   // largest radius per type among owned atoms, split into atoms in
   // the freeze group and all others
 
   for (i = 0; i < nlocal; i++)
     if (mask[i] & freeze_group_bit)
       onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]],radius[i]);
     else
       onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]],radius[i]);
 
   // reduce the per-processor maxima to global maxima
 
   MPI_Allreduce(&onerad_dynamic[1],&maxrad_dynamic[1],atom->ntypes,
                 MPI_DOUBLE,MPI_MAX,world);
   MPI_Allreduce(&onerad_frozen[1],&maxrad_frozen[1],atom->ntypes,
                 MPI_DOUBLE,MPI_MAX,world);
 }
 
 /* ----------------------------------------------------------------------
    neighbor callback to inform pair style of neighbor list to use
    optional granular history list
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::init_list(int id, NeighList *ptr)
 {
   // id 0 = regular granular list, id 1 = history list; other ids ignored
 
   switch (id) {
   case 0:
     list = ptr;
     break;
   case 1:
     listgranhistory = ptr;
     break;
   default:
     break;
   }
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairGranHookeHistory::init_one(int i, int j)
 {
   if (!allocated) allocate();
 
   // candidate cutoffs: sum of max radii for each combination that
   // involves at least one dynamic particle (frozen/frozen is excluded)
 
   const double dyn_dyn = maxrad_dynamic[i]+maxrad_dynamic[j];
   const double frz_dyn = maxrad_frozen[i]+maxrad_dynamic[j];
   const double dyn_frz = maxrad_dynamic[i]+maxrad_frozen[j];
 
   // return the largest candidate
 
   const double first_two = MAX(dyn_dyn,frz_dyn);
   return MAX(first_two,dyn_frz);
 }
 
 /* ----------------------------------------------------------------------
   proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::write_restart(FILE *fp)
 {
   // global settings first, then setflag for every type pair with i <= j
 
   write_restart_settings(fp);
 
   for (int itype = 1; itype <= atom->ntypes; ++itype) {
     for (int jtype = itype; jtype <= atom->ntypes; ++jtype) {
       fwrite(&setflag[itype][jtype],sizeof(int),1,fp);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
   proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::read_restart(FILE *fp)
 {
   read_restart_settings(fp);
   allocate();
 
   int i,j;
   int me = comm->me;
   for (i = 1; i <= atom->ntypes; i++)
     for (j = i; j <= atom->ntypes; j++) {
       if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
       MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
     }
 }
 
 /* ----------------------------------------------------------------------
   proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::write_restart_settings(FILE *fp)
 {
   // five doubles in fixed order, followed by the integer dampflag
 
   const double *reals[] = {&kn,&kt,&gamman,&gammat,&xmu};
   const int nreals = 5;
 
   for (int k = 0; k < nreals; ++k)
     fwrite(reals[k],sizeof(double),1,fp);
   fwrite(&dampflag,sizeof(int),1,fp);
 }
 
 /* ----------------------------------------------------------------------
   proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairGranHookeHistory::read_restart_settings(FILE *fp)
 {
   // same layout as write_restart_settings(): five doubles, then dampflag
 
   double *reals[] = {&kn,&kt,&gamman,&gammat,&xmu};
   const int nreals = 5;
 
   // only proc 0 reads from the file
 
   if (comm->me == 0) {
     for (int k = 0; k < nreals; ++k)
       fread(reals[k],sizeof(double),1,fp);
     fread(&dampflag,sizeof(int),1,fp);
   }
 
   // every proc receives the values from proc 0
 
   for (int k = 0; k < nreals; ++k)
     MPI_Bcast(reals[k],1,MPI_DOUBLE,0,world);
   MPI_Bcast(&dampflag,1,MPI_INT,0,world);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGranHookeHistory::reset_dt()
 {
   dt = update->dt;
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairGranHookeHistory::single(int i, int j, int itype, int jtype,
                                     double rsq,
                                     double factor_coul, double factor_lj,
                                     double &fforce)
 {
   double radi,radj,radsum;
   double r,rinv,rsqinv,delx,dely,delz;
   double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3,wr1,wr2,wr3;
   double mi,mj,meff,damp,ccel;
   double vtr1,vtr2,vtr3,vrel,shrmag,rsht;
   double fs1,fs2,fs3,fs,fn;
 
   double *radius = atom->radius;
   radi = radius[i];
   radj = radius[j];
   radsum = radi + radj;
 
   if (rsq >= radsum*radsum) {
     fforce = 0.0;
     svector[0] = svector[1] = svector[2] = svector[3] = 0.0;
     return 0.0;
   }
 
   r = sqrt(rsq);
   rinv = 1.0/r;
   rsqinv = 1.0/rsq;
 
   // relative translational velocity
 
   double **v = atom->v;
   vr1 = v[i][0] - v[j][0];
   vr2 = v[i][1] - v[j][1];
   vr3 = v[i][2] - v[j][2];
 
   // normal component
 
   double **x = atom->x;
   delx = x[i][0] - x[j][0];
   dely = x[i][1] - x[j][1];
   delz = x[i][2] - x[j][2];
 
   vnnr = vr1*delx + vr2*dely + vr3*delz;
   vn1 = delx*vnnr * rsqinv;
   vn2 = dely*vnnr * rsqinv;
   vn3 = delz*vnnr * rsqinv;
 
   // tangential component
 
   vt1 = vr1 - vn1;
   vt2 = vr2 - vn2;
   vt3 = vr3 - vn3;
 
   // relative rotational velocity
 
   double **omega = atom->omega;
   wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
   wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
   wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
 
   // meff = effective mass of pair of particles
   // if I or J part of rigid body, use body mass
   // if I or J is frozen, meff is other particle
 
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   int *type = atom->type;
   int *mask = atom->mask;
 
   if (rmass) {
     mi = rmass[i];
     mj = rmass[j];
   } else {
     mi = mass[type[i]];
     mj = mass[type[j]];
   }
   if (fix_rigid) {
     // NOTE: insure mass_rigid is current for owned+ghost atoms?
     if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
     if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
   }
 
   meff = mi*mj / (mi+mj);
   if (mask[i] & freeze_group_bit) meff = mj;
   if (mask[j] & freeze_group_bit) meff = mi;
 
   // normal forces = Hookian contact + normal velocity damping
 
   damp = meff*gamman*vnnr*rsqinv;
   ccel = kn*(radsum-r)*rinv - damp;
 
   // relative velocities
 
   vtr1 = vt1 - (delz*wr2-dely*wr3);
   vtr2 = vt2 - (delx*wr3-delz*wr1);
   vtr3 = vt3 - (dely*wr1-delx*wr2);
   vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
   vrel = sqrt(vrel);
 
   // shear history effects
   // neighprev = index of found neigh on previous call
   // search entire jnum list of neighbors of I for neighbor J
   // start from neighprev, since will typically be next neighbor
   // reset neighprev to 0 as necessary
 
   int jnum = list->numneigh[i];
   int *touch = list->listgranhistory->firstneigh[i];
   double *allshear = list->listgranhistory->firstdouble[i];
 
   for (int jj = 0; jj < jnum; jj++) {
     neighprev++;
     if (neighprev >= jnum) neighprev = 0;
     if (touch[neighprev] == j) break;
   }
 
   double *shear = &allshear[3*neighprev];
   shrmag = sqrt(shear[0]*shear[0] + shear[1]*shear[1] +
                 shear[2]*shear[2]);
 
   // rotate shear displacements
 
   rsht = shear[0]*delx + shear[1]*dely + shear[2]*delz;
   rsht *= rsqinv;
 
   // tangential forces = shear + tangential velocity damping
 
   fs1 = - (kt*shear[0] + meff*gammat*vtr1);
   fs2 = - (kt*shear[1] + meff*gammat*vtr2);
   fs3 = - (kt*shear[2] + meff*gammat*vtr3);
 
   // rescale frictional displacements and forces if needed
 
   fs = sqrt(fs1*fs1 + fs2*fs2 + fs3*fs3);
   fn = xmu * fabs(ccel*r);
 
   if (fs > fn) {
     if (shrmag != 0.0) {
       fs1 *= fn/fs;
       fs2 *= fn/fs;
       fs3 *= fn/fs;
       fs *= fn/fs;
     } else fs1 = fs2 = fs3 = fs = 0.0;
   }
 
   // set all forces and return no energy
 
   fforce = ccel;
   svector[0] = fs1;
   svector[1] = fs2;
   svector[2] = fs3;
   svector[3] = fs;
   return 0.0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int PairGranHookeHistory::pack_forward_comm(int n, int *list, double *buf, 
                                             int pbc_flag, int *pbc)
 {
   // copy mass_rigid of the n listed atoms into buf, one value per atom
   // returns the number of values written; pbc_flag and pbc are unused
 
   int count = 0;
   while (count < n) {
     buf[count] = mass_rigid[list[count]];
     ++count;
   }
   return count;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGranHookeHistory::unpack_forward_comm(int n, int first, double *buf)
 {
   // store n consecutive values from buf into mass_rigid[first .. first+n-1]
 
   const double *src = buf;
   const int stop = first + n;
   for (int iatom = first; iatom < stop; ++iatom) {
     mass_rigid[iatom] = *src;
     ++src;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void *PairGranHookeHistory::extract(const char *str, int &dim)
 {
   // the only exported quantity is "computeflag"; dim is always set to 0
 
   dim = 0;
   if (strcmp(str,"computeflag") != 0) return NULL;
   return (void *) &computeflag;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local atom-based arrays
 ------------------------------------------------------------------------- */
 
 double PairGranHookeHistory::memory_usage()
 {
   // bytes for nmax doubles, the allocated size of mass_rigid
 
   const double nbytes = nmax * sizeof(double);
   return nbytes;
 }
diff --git a/src/GRANULAR/pair_gran_hooke_history.h b/src/GRANULAR/pair_gran_hooke_history.h
index 4e2e51a4c..25762ca65 100644
--- a/src/GRANULAR/pair_gran_hooke_history.h
+++ b/src/GRANULAR/pair_gran_hooke_history.h
@@ -1,103 +1,102 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef PAIR_CLASS
 
 PairStyle(gran/hooke/history,PairGranHookeHistory)
 
 #else
 
 #ifndef LMP_PAIR_GRAN_HOOKE_HISTORY_H
 #define LMP_PAIR_GRAN_HOOKE_HISTORY_H
 
 #include "pair.h"
 
 namespace LAMMPS_NS {
 
 // Granular pair style with Hookean normal contact and per-pair
 // shear history (pair_style gran/hooke/history).
 class PairGranHookeHistory : public Pair {
  public:
   int computeflag;         // set to 1 by compute(); readable via extract()
 
   PairGranHookeHistory(class LAMMPS *);
   virtual ~PairGranHookeHistory();
   virtual void compute(int, int);
   virtual void settings(int, char **);
   void coeff(int, char **);
   void init_style();
   void init_list(int, class NeighList *);
   double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
   void reset_dt();
   virtual double single(int, int, int, int, double, double, double, double &);
   int pack_forward_comm(int, int *, double *, int, int *);
   void unpack_forward_comm(int, int, double *);
   void *extract(const char *, int &);
   double memory_usage();
 
  protected:
   // pair_style arguments 0-4 as parsed by settings()
   double kn,kt,gamman,gammat,xmu;
   int dampflag;            // pair_style argument 5: 0 or 1, 0 zeroes gammat
   double dt;               // copy of update->dt
   int freeze_group_bit;    // groupbit of the "freeze" fix, 0 if none
   int history;             // non-zero: use history neigh list and fix
 
-  char *suffix;
   int neighprev;           // neighbor index remembered between single() calls
   // per-type max radius: onerad_* on this proc, maxrad_* over all procs
   double *onerad_dynamic,*onerad_frozen;
   double *maxrad_dynamic,*maxrad_frozen;
 
   // SHEAR_HISTORY fix created in init_style(), NULL until then
   class FixShearHistory *fix_history;
 
   // storage of rigid body masses for use in granular interactions
 
   class Fix *fix_rigid;    // ptr to rigid body fix, NULL if none
   double *mass_rigid;      // rigid mass for owned+ghost atoms
   int nmax;                // allocated size of mass_rigid
 
   void allocate();
 };
 
 }
 
 #endif
 #endif
 
 /* ERROR/WARNING messages:
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Incorrect args for pair coefficients
 
 Self-explanatory.  Check the input script or data file.
 
 E: Pair granular requires atom style sphere
 
 Self-explanatory.
 
 E: Pair granular requires ghost atoms store velocity
 
 Use the comm_modify vel yes command to enable this.
 
 E: Pair granular with shear history requires newton pair off
 
 This is a current restriction of the implementation of pair
 granular styles with history.
 
 */
diff --git a/src/KSPACE/fix_tune_kspace.cpp b/src/KSPACE/fix_tune_kspace.cpp
index 9abfc9d1b..e3d5a5b5c 100644
--- a/src/KSPACE/fix_tune_kspace.cpp
+++ b/src/KSPACE/fix_tune_kspace.cpp
@@ -1,542 +1,543 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Paul Crozier (SNL)
 ------------------------------------------------------------------------- */
 
 #include "string.h"
 #include "stdlib.h"
 #include "fix_tune_kspace.h"
 #include "update.h"
 #include "domain.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "kspace.h"
 #include "pair.h"
 #include "error.h"
 #include "memory.h"
 #include "timer.h"
 #include "neighbor.h"
 #include "modify.h"
 #include "compute.h"
 #include <iostream>
 #include <cmath>
 #include <limits>
 #define SWAP(a,b) {temp=(a);(a)=(b);(b)=temp;}
 #define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
 #define GOLD 1.618034
 
 using namespace std;
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 /* ---------------------------------------------------------------------- */
 
 FixTuneKspace::FixTuneKspace(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg)
 {
   // arg[3] is read below as the tuning interval,
   // so at least 4 arguments must be present
 
   if (narg < 4) error->all(FLERR,"Illegal fix tune/kspace command");
 
   // initial state of the tuning procedure
 
   global_freq = 1;
   firststep = 0;
   niter = 0;
   niter_adjust_rcut = 0;
   keep_bracketing = true;
   first_brent_pass = true;
   converged = false;
   need_fd2_brent = false;
 
   ewald_time = pppm_time = msm_time = 0.0;
 
   // parse arguments
 
   nevery = force->inumeric(FLERR,arg[3]);
 
   // set up reneighboring
   // first invocation is requested for the step after the current one
 
   force_reneighbor = 1;
   next_reneighbor = update->ntimestep + 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixTuneKspace::setmask()
 {
   // request the pre_exchange and pre_neighbor callbacks
 
   return PRE_EXCHANGE | PRE_NEIGHBOR;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixTuneKspace::init()
 {
   if (!force->kspace) 
     error->all(FLERR,"Cannot use fix tune/kspace without a kspace style");
   if (!force->pair) 
     error->all(FLERR,"Cannot use fix tune/kspace without a pair style");
 
   double old_acc = force->kspace->accuracy/force->kspace->two_charge_force;
   char old_acc_str[12];
   sprintf(old_acc_str,"%g",old_acc);
   strcpy(new_acc_str,old_acc_str);
 
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   pair_cut_coul = *p_cutoff;
 }
 
 /* ----------------------------------------------------------------------
    perform dynamic kspace parameter optimization
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::pre_exchange()
 {
   if (!nevery) return;
   if (!force->kspace) return;
   if (!force->pair) return;
   if (next_reneighbor != update->ntimestep) return;
   next_reneighbor = update->ntimestep + nevery;
 
   double time = get_timing_info();
 
   if (strcmp(force->kspace_style,"ewald") == 0) ewald_time = time;
   if (strcmp(force->kspace_style,"pppm") == 0) pppm_time = time;
   if (strcmp(force->kspace_style,"msm") == 0) msm_time = time;
 
   niter++;
   if (niter == 1) {
     // test Ewald
     store_old_kspace_settings();
     strcpy(new_kspace_style,"ewald");
     sprintf(new_pair_style,"%s/long",base_pair_style);
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else if (niter == 2) {
     // test PPPM
     store_old_kspace_settings();
     strcpy(new_kspace_style,"pppm");
     sprintf(new_pair_style,"%s/long",base_pair_style);
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else if (niter == 3) {
     // test MSM
     store_old_kspace_settings();
     strcpy(new_kspace_style,"msm");
     sprintf(new_pair_style,"%s/msm",base_pair_style);
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else if (niter == 4) {
     store_old_kspace_settings();
     cout << "ewald_time = " << ewald_time << endl;
     cout << "pppm_time = " << pppm_time << endl;
     cout << "msm_time = " << msm_time << endl;
     // switch to fastest one
     strcpy(new_kspace_style,"ewald");
     sprintf(new_pair_style,"%s/long",base_pair_style);
     if (pppm_time < ewald_time && pppm_time < msm_time)
       strcpy(new_kspace_style,"pppm");
     else if (msm_time < pppm_time && msm_time < ewald_time) {
       strcpy(new_kspace_style,"msm");
       sprintf(new_pair_style,"%s/msm",base_pair_style);
     }
     update_pair_style(new_pair_style,pair_cut_coul);
     update_kspace_style(new_kspace_style,new_acc_str);
   } else {
     adjust_rcut(time);
   }
 
   last_spcpu = timer->elapsed(TIME_LOOP);
 }
 
 /* ----------------------------------------------------------------------
    figure out CPU time per timestep since last time checked
 ------------------------------------------------------------------------- */
 
 double FixTuneKspace::get_timing_info()
 {
   const int step_now = update->ntimestep;
   double cpu_now = 0.0;
   double per_step = 0.0;
 
   if (firststep != 0) {
     // CPU time per step since the previous call
     // stays 0.0 if no steps have elapsed
     cpu_now = timer->elapsed(TIME_LOOP);
     const int nsteps = step_now - last_step;
     if (nsteps > 0) per_step = (cpu_now - last_spcpu)/nsteps;
   } else {
     // very first call: nothing to measure yet
     firststep = 1;
   }
 
   last_step = step_now;
   last_spcpu = cpu_now;
 
   return per_step;
 }
 
 /* ----------------------------------------------------------------------
    store old kspace settings: style, accuracy, order, etc
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::store_old_kspace_settings()
 {
   int n = strlen(force->kspace_style) + 1;
   char *old_kspace_style = new char[n];
   strcpy(old_kspace_style,force->kspace_style);
   strcpy(new_kspace_style,old_kspace_style);
   double old_acc = force->kspace->accuracy_relative;
   char old_acc_str[12];
   sprintf(old_acc_str,"%g",old_acc);
   strcpy(new_pair_style,force->pair_style);
   strcpy(base_pair_style,force->pair_style);
   char *trunc;
   if ((trunc = strstr(base_pair_style, "/long")) != NULL) *trunc = '\0';
   if ((trunc = strstr(base_pair_style, "/msm" )) != NULL) *trunc = '\0';
 
   old_differentiation_flag = force->kspace->differentiation_flag;
   old_slabflag = force->kspace->slabflag;
   old_slab_volfactor = force->kspace->slab_volfactor;
 }
 
 /* ----------------------------------------------------------------------
    update the pair style if necessary, preserving the settings
 ------------------------------------------------------------------------- */
 
-void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul)
+void FixTuneKspace::update_pair_style(char *new_pair_style, 
+                                      double pair_cut_coul)
 {
   // write the requested Coulomb cutoff into the active pair style
 
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   *p_cutoff = pair_cut_coul;
 
   // check to see if we need to change pair styles
   if (strcmp(new_pair_style,force->pair_style) == 0) return;
 
   // create a temporary file to store current pair settings
   // tmpfile() returns NULL on failure, so stop before the stream is used
   FILE *p_pair_settings_file;
   p_pair_settings_file = tmpfile();
   if (p_pair_settings_file == NULL)
     error->one(FLERR,"Cannot open temporary file in fix tune/kspace");
   force->pair->write_restart(p_pair_settings_file);
   rewind(p_pair_settings_file);
 
   cout << "Creating new pair style: " << new_pair_style << endl;
   // delete old pair style and create new one
-  force->create_pair(new_pair_style,lmp->suffix);
+  force->create_pair(new_pair_style,1);
 
   // restore current pair settings from temporary file
   force->pair->read_restart(p_pair_settings_file);
 
   // report the cutoff now held by the new pair style
   double *pcutoff = (double *) force->pair->extract("cut_coul",itmp);
   double current_cutoff = *pcutoff;
   cout << "Coulomb cutoff for real space: " << current_cutoff << endl;
 
   // close temporary file
   fclose(p_pair_settings_file);
 }
 
 /* ----------------------------------------------------------------------
    update the kspace style if necessary
 ------------------------------------------------------------------------- */
 
-void FixTuneKspace::update_kspace_style(char *new_kspace_style, char *new_acc_str)
+void FixTuneKspace::update_kspace_style(char *new_kspace_style, 
+                                        char *new_acc_str)
 {
   // build the argument list handed to create_kspace():
   //   arg[0] = kspace style name, arg[1] = accuracy string
   // each copy is sized from its source string so it cannot overflow
 
   int narg = 2;
   char **arg;
   arg = NULL;
   arg = (char **) memory->srealloc(arg,narg*sizeof(char *),"tune/kspace:arg");
   arg[0] = new char[strlen(new_kspace_style) + 1];
   strcpy(arg[0],new_kspace_style);
   arg[1] = new char[strlen(new_acc_str) + 1];
   strcpy(arg[1],new_acc_str);
 
   // delete old kspace style and create new one
 
-  force->create_kspace(narg,arg,lmp->suffix);
-
+  force->create_kspace(narg,arg,1);
   force->kspace->differentiation_flag = old_differentiation_flag;
   force->kspace->slabflag = old_slabflag;
   force->kspace->slab_volfactor = old_slab_volfactor;
 
   // initialize new kspace style, pair style, molecular styles
 
   force->init();
 
   // set up grid
   force->kspace->setup_grid();
 
   // Re-init neighbor list. Probably only needed when redefining the
   // pair style. Should happen after pair->init() to get the pair style
   // neighbor list request registered
 
   neighbor->init();
 
   // Re-init computes to update pointers to virials, etc.
 
   for (int i = 0; i < modify->ncompute; i++) modify->compute[i]->init();
 
   // release the argument copies as well as the pointer array
   // NOTE(review): assumes create_kspace() keeps no pointers into arg - confirm
 
   delete [] arg[0];
   delete [] arg[1];
   memory->sfree(arg);
 }
 
 /* ----------------------------------------------------------------------
    find the optimal real space coulomb cutoff
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::adjust_rcut(double time)
 {
   // time = timing value supplied by the caller; it is stored below as the
   //   function value belonging to the cutoff currently in use
   // nothing is adjusted for the msm kspace style or once converged is set
 
   if (strcmp(force->kspace_style,"msm") == 0) return;
   if (converged) return;
 
   double temp;      // scratch variable used by the SWAP macro
   const double TINY = 1.0e-20;
 
   // get the current cutoff
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   double current_cutoff = *p_cutoff;
   cout << "Old Coulomb cutoff for real space: " << current_cutoff << endl;
 
   // use Brent's method from Numerical Recipes to find optimal real space cutoff
   // the search is spread over successive calls: each call records the
   //   time of the cutoff in use and chooses the cutoff for the next call
   // niter_adjust_rcut counts the calls made so far
 
   // first time through, get ax_brent and fa_brent, and adjust cutoff
   if (keep_bracketing) {
     if (niter_adjust_rcut == 0) {
       // very first call only halves the cutoff; no point is recorded yet
       pair_cut_coul /= 2;
     } else if (niter_adjust_rcut == 1) {
       ax_brent = current_cutoff;
       fa_brent = time;
       pair_cut_coul *= 2;
 
     // second time through, get bx_brent and fb_brent, and adjust cutoff
     } else if (niter_adjust_rcut == 2) {
       bx_brent = current_cutoff;
       fb_brent = time;
       if (fb_brent > fa_brent) {
         // exchange a and b so that fb_brent <= fa_brent
         SWAP(ax_brent,bx_brent);
         SWAP(fb_brent,fa_brent);
         pair_cut_coul /= 4;
       } else {
         pair_cut_coul *= 2;
       }
 
     // third time through, get cx_brent and fc_brent, and adjust cutoff if needed
     } else if (niter_adjust_rcut == 3) {
       cx_brent = current_cutoff;
       fc_brent = time;
       // fc_brent > fb_brent ends the bracketing phase
       if (fc_brent > fb_brent) keep_bracketing = false;
       else {
         // same trial-point formula as used at the top of mnbrak()
         double r = (bx_brent - ax_brent)*(fb_brent - fc_brent);
         double q = (bx_brent - cx_brent)*(fb_brent - fa_brent);
         dx_brent = bx_brent - ((bx_brent - cx_brent)*q - (bx_brent - ax_brent)*r)/
          (2.0*SIGN(MAX(fabs(q - r),TINY),q - r));
         pair_cut_coul = dx_brent;
       }
 
     // after third time through, bracket the minimum, and adjust cutoff
     } else if (niter_adjust_rcut > 3) {
       dx_brent = current_cutoff;
       // need_fd2_brent selects which of the two trial slots receives the time
       if (need_fd2_brent) fd2_brent = time;
       else fd_brent = time;
       mnbrak();
       pair_cut_coul = dx_brent;
     }
   }
 
   // refinement phase; also entered in the same call in which the
   //   block above has just cleared keep_bracketing
   if (!keep_bracketing) {
     dx_brent = current_cutoff;
     fd_brent = time;
     // brent0() initializes the Brent state on the first pass,
     //   brent2() updates it on later passes, brent1() picks the next trial
     if (first_brent_pass) brent0();
     else brent2();
     brent1();
     pair_cut_coul = dx_brent;
   }
 
   niter_adjust_rcut++;
 
   // replace a non-positive cutoff by half the smallest bracket point plus TINY
   if (pair_cut_coul <= 0.0) pair_cut_coul = fabs(MIN(ax_brent,MIN(bx_brent,(MIN(cx_brent,dx_brent))))/2.0) + TINY;
 
   // x != x is true only for NaN
   if (pair_cut_coul != pair_cut_coul)
     error->all(FLERR,"Bad real space Coulomb cutoff in fix tune/kspace");
 
   // change the cutoff to pair_cut_coul
   *p_cutoff = pair_cut_coul;
 
   // report the new cutoff
   double *new_cutoff = (double *) force->pair->extract("cut_coul",itmp);
   current_cutoff = *new_cutoff;
   cout << "Adjusted Coulomb cutoff for real space: " << current_cutoff << endl;
 
   // re-create pair and kspace styles so that the new cutoff is used
   store_old_kspace_settings();
   update_pair_style(new_pair_style,pair_cut_coul);
   update_kspace_style(new_kspace_style,new_acc_str);
 }
 
 /* ----------------------------------------------------------------------
    bracket a minimum using parabolic extrapolation
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::mnbrak()
 {
   // works on the member triple (ax,bx,cx)_brent with values (fa,fb,fc)_brent
   //   and the trial point dx_brent with value fd_brent
   // returns either with keep_bracketing = false (bracket accepted), or with
   //   a new dx_brent; need_fd2_brent = true marks that the value for the
   //   new dx_brent is still missing and will arrive in fd2_brent
 
   const double GLIMIT = 100.0, TINY = 1.0e-20;
   double r,q;
 
   // trial point from the three bracket points; TINY keeps the
   //   denominator away from zero; dxlim = farthest point allowed
   r = (bx_brent - ax_brent)*(fb_brent - fc_brent);
   q = (bx_brent - cx_brent)*(fb_brent - fa_brent);
   dx_brent = bx_brent - ((bx_brent - cx_brent)*q - (bx_brent - ax_brent)*r)/
    (2.0*SIGN(MAX(fabs(q - r),TINY),q - r));
   dxlim = bx_brent + GLIMIT*(cx_brent - bx_brent);
 
   // case 1: dx_brent lies between bx_brent and cx_brent
   if ((bx_brent - dx_brent)*(dx_brent - cx_brent) > 0.0) {
     if (fd_brent < fc_brent) {
       // accept (bx,dx,cx) as the bracket
       ax_brent = bx_brent;
       bx_brent = dx_brent;
       fa_brent = fb_brent;
       fb_brent = fd_brent;
       keep_bracketing = false;
       return;
     } else if (fd_brent > fb_brent) {
       // accept (ax,bx,dx) as the bracket
       cx_brent = dx_brent;
       fc_brent = fd_brent;
       keep_bracketing = false;
       return;
     }
     // trial point was of no use: step beyond cx_brent by the factor GOLD
     dx_brent = cx_brent + GOLD*(cx_brent - bx_brent);
     if (need_fd2_brent) {
       fd_brent = fd2_brent;
       need_fd2_brent = false;
     } else {
       need_fd2_brent = true;
       return;
     }
   // case 2: dx_brent lies between cx_brent and dxlim
   } else if ((cx_brent - dx_brent)*(dx_brent - dxlim) > 0.0) {
     if (fd_brent < fc_brent) {
       if (need_fd2_brent) {
         need_fd2_brent = false;
       } else {
         need_fd2_brent = true;
         dx_brent += GOLD*(dx_brent - cx_brent);
         return;
       }
       shft3(bx_brent,cx_brent,dx_brent,dx_brent + GOLD*(dx_brent - cx_brent));
       shft3(fb_brent,fc_brent,fd_brent,fd2_brent);
     }
   // case 3: dx_brent is at or beyond dxlim, so it is set to dxlim
   } else if ((dx_brent - dxlim)*(dxlim - cx_brent) >= 0.0) {
     dx_brent = dxlim;
     if (need_fd2_brent) {
       fd_brent = fd2_brent;
       need_fd2_brent = false;
     } else {
       need_fd2_brent = true;
       return;
     }
   // case 4: none of the above, step beyond cx_brent by the factor GOLD
   } else {
     dx_brent = cx_brent + GOLD*(cx_brent - bx_brent);
     if (need_fd2_brent) {
       fd_brent = fd2_brent;
       need_fd2_brent = false;
     } else {
       need_fd2_brent = true;
       return;
     }
   }
 
   // advance the bracket points and their values with shft3()
   shft3(ax_brent,bx_brent,cx_brent,dx_brent);
   shft3(fa_brent,fb_brent,fc_brent,fd_brent);
 }
 
 /* ----------------------------------------------------------------------
    Brent's method from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::brent0()
 {
   // a_brent = smaller, b_brent = larger of the outer bracket points
 
   if (ax_brent < cx_brent) a_brent = ax_brent;
   else a_brent = cx_brent;
 
   if (ax_brent > cx_brent) b_brent = ax_brent;
   else b_brent = cx_brent;
 
   // all three interior points start at the middle bracket point
 
   v_brent = bx_brent;
   w_brent = v_brent;
   x_brent = w_brent;
 
   fx_brent = fb_brent;
   fv_brent = fx_brent;
   fw_brent = fv_brent;
 }
 
 /* ----------------------------------------------------------------------
    Brent's method from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::brent1()
 {
   // chooses the next trial point dx_brent from the interval
   //   [a_brent,b_brent] and the current point x_brent, or sets converged
 
   const double CGOLD=0.3819660;
   const double ZEPS=numeric_limits<double>::epsilon()*1.0e-3;
   double d=0.0,etemp;
   double p,q,r,tol1,tol2,xm;
   double e=0.0;
   double tol=0.001;
 
   // NOTE(review): d and e are locals that are reset to 0.0 on every call,
   //   so the test fabs(e) > tol1 below cannot succeed and only the
   //   golden-section branch is ever executed; keeping them across calls
   //   would require class members - confirm intent
 
   // xm = interval midpoint; tol1,tol2 = tolerances scaled by |x_brent|
   xm=0.5*(a_brent+b_brent);
   tol2=2.0*(tol1=tol*fabs(x_brent)+ZEPS);
 
   // done when x_brent is within tolerance of the midpoint
   if (fabs(x_brent-xm) <= (tol2-0.5*(b_brent-a_brent))) {
     converged = true;
     dx_brent = x_brent;
     return;
   }
   if (fabs(e) > tol1) {
     // parabolic step through x_brent, v_brent, w_brent
     r=(x_brent-w_brent)*(fx_brent-fv_brent);
     q=(x_brent-v_brent)*(fx_brent-fw_brent);
     p=(x_brent-v_brent)*q-(x_brent-w_brent)*r;
     q=2.0*(q-r);
     if (q > 0.0) p = -p;
     q=fabs(q);
     etemp=e;
     e=d;
     // reject the parabolic step if it is too large or leaves the interval
     if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a_brent-x_brent) || p >= q*(b_brent-x_brent))
       d=CGOLD*(e=(x_brent >= xm ? a_brent-x_brent : b_brent-x_brent));
     else {
       d=p/q;
       dx_brent=x_brent+d;
       if (dx_brent-a_brent < tol2 || b_brent-dx_brent < tol2)
         d=SIGN(tol1,xm-x_brent);
     }
   } else {
     // golden-section step into the larger part of the interval
     d=CGOLD*(e=(x_brent >= xm ? a_brent-x_brent : b_brent-x_brent));
   }
 
   // move by at least tol1 away from x_brent
   dx_brent=(fabs(d) >= tol1 ? x_brent+d : x_brent+SIGN(tol1,d));
 
   first_brent_pass = false;
 
   return;
 }
 
 /* ----------------------------------------------------------------------
    Brent's method from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void FixTuneKspace::brent2()
 {
   // trial point dx_brent with value fd_brent is not better than x_brent:
   // it becomes an interval end and may replace w_brent or v_brent
 
   if (!(fd_brent <= fx_brent)) {
     if (dx_brent < x_brent) a_brent = dx_brent;
     else b_brent = dx_brent;
 
     const bool beats_w = (fd_brent <= fw_brent || w_brent == x_brent);
     if (beats_w) {
       v_brent = w_brent;
       w_brent = dx_brent;
       fv_brent = fw_brent;
       fw_brent = fd_brent;
     } else if (fd_brent <= fv_brent || v_brent == x_brent ||
                v_brent == w_brent) {
       v_brent = dx_brent;
       fv_brent = fd_brent;
     }
     return;
   }
 
   // trial point is at least as good: x_brent becomes an interval end
   // and the points v,w,x are shifted to take in dx_brent
 
   if (dx_brent >= x_brent) a_brent = x_brent;
   else b_brent = x_brent;
   shft3(v_brent,w_brent,x_brent,dx_brent);
   shft3(fv_brent,fw_brent,fx_brent,fd_brent);
 }
 
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.beacon
similarity index 84%
copy from src/MAKE/Makefile.linux
copy to src/MAKE/Makefile.beacon
index c4264dc22..98e816a43 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.beacon
@@ -1,108 +1,109 @@
 # beacon = Intel mpiicpc, Intel MPI, MKL FFT, offload to coprocessor (LMP_INTEL_OFFLOAD)
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		icc
-CCFLAGS =	-O
+CC =		mpiicpc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64
+MIC_OPT =       -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
+CCFLAGS =	-O3 -xAVX -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		icc
-LINKFLAGS =	-O
-LIB =           -lstdc++
+LINK =		mpiicpc -openmp
+LINKFLAGS =	-O3 -xAVX
+LIB =           
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings, OPTIONAL
 # see possible settings in doc/Section_start.html#2_2 (step 4)
 
 LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
 
 # MPI library, REQUIRED
 # see discussion in doc/Section_start.html#2_2 (step 5)
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX 
 MPI_PATH = 
-MPI_LIB =	-lmpich -lmpl -lpthread
+MPI_LIB =  
 
 # FFT library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 6)
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
-FFT_INC =       -DFFT_FFTW
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE -I$(MKLROOT)
 FFT_PATH = 
-FFT_LIB =	-lfftw
+FFT_LIB =	-L$(MKLROOT) -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
 
 # JPEG and/or PNG library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 7)
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	-ljpeg
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # no need to edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 # Individual dependencies
 
 DEPENDS = $(OBJ:.o=.d)
 sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.g++_openmpi
similarity index 90%
copy from src/MAKE/Makefile.linux
copy to src/MAKE/Makefile.g++_openmpi
index c4264dc22..c8912f171 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.g++_openmpi
@@ -1,108 +1,108 @@
-# linux = RedHat Linux box, Intel icc, MPICH2, FFTW
+# g++ = RedHat Linux box, g++4, OpenMPI, FFTW
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		icc
-CCFLAGS =	-O
+CC =		g++
+CCFLAGS =	-g -O # -Wunused
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		icc
-LINKFLAGS =	-O
-LIB =           -lstdc++
+LINK =		g++
+LINKFLAGS =	-g -O
+LIB = 
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings, OPTIONAL
 # see possible settings in doc/Section_start.html#2_2 (step 4)
 
 LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
 
 # MPI library, REQUIRED
 # see discussion in doc/Section_start.html#2_2 (step 5)
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
-MPI_INC =       -DMPICH_SKIP_MPICXX 
-MPI_PATH = 
-MPI_LIB =	-lmpich -lmpl -lpthread
+MPI_INC =       -DMPICH_SKIP_MPICXX -I/usr/local/openmpi/include
+MPI_PATH =      -L/usr/local/openmpi/lib
+MPI_LIB =	-lmpi -lmpi_cxx
 
 # FFT library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 6)
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
-FFT_INC =       -DFFT_FFTW
+FFT_INC =    	-DFFT_FFTW
 FFT_PATH = 
 FFT_LIB =	-lfftw
 
 # JPEG and/or PNG library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 7)
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	-ljpeg
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # no need to edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 # Individual dependencies
 
 DEPENDS = $(OBJ:.o=.d)
 sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.intel
similarity index 86%
copy from src/MAKE/Makefile.linux
copy to src/MAKE/Makefile.intel
index c4264dc22..2b209e27b 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.intel
@@ -1,108 +1,108 @@
-# linux = RedHat Linux box, Intel icc, MPICH2, FFTW
+# Intel compiler, Intel MPI, MKL FFT, no offload to coprocessor
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		icc
-CCFLAGS =	-O
+CC =		mpiicpc -openmp -DLAMMPS_MEMALIGN=64 -no-offload
+CCFLAGS =	-O3 -xHost -fno-alias -ansi-alias -restrict -override-limits
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		icc
-LINKFLAGS =	-O
-LIB =           -lstdc++
+LINK =		mpiicpc -openmp
+LINKFLAGS =	-O3 -xHost
+LIB =           
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings, OPTIONAL
 # see possible settings in doc/Section_start.html#2_2 (step 4)
 
 LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
 
 # MPI library, REQUIRED
 # see discussion in doc/Section_start.html#2_2 (step 5)
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX 
 MPI_PATH = 
-MPI_LIB =	-lmpich -lmpl -lpthread
+MPI_LIB =
 
 # FFT library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 6)
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
-FFT_INC =       -DFFT_FFTW
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE
 FFT_PATH = 
-FFT_LIB =	-lfftw
+FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core # parenthesized: make reads a bare $MKLROOT as $(M)KLROOT
 
 # JPEG and/or PNG library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 7)
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	-ljpeg
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # no need to edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 # Individual dependencies
 
 DEPENDS = $(OBJ:.o=.d)
 sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.intel_offload
similarity index 82%
copy from src/MAKE/Makefile.linux
copy to src/MAKE/Makefile.intel_offload
index c4264dc22..eb4415fc8 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.intel_offload
@@ -1,108 +1,109 @@
-# linux = RedHat Linux box, Intel icc, MPICH2, FFTW
+# Intel compiler, Intel MPI, MKL FFT, with offload to coprocessor
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		icc
-CCFLAGS =	-O
+CC =		mpiicpc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64
+MIC_OPT =       -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
+CCFLAGS =	-g -O3 -xHost -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		icc
-LINKFLAGS =	-O
-LIB =           -lstdc++
+LINK =		mpiicpc -openmp -offload
+LINKFLAGS =	-O3 -xHost
+LIB =           
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings, OPTIONAL
 # see possible settings in doc/Section_start.html#2_2 (step 4)
 
 LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
 
 # MPI library, REQUIRED
 # see discussion in doc/Section_start.html#2_2 (step 5)
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX 
 MPI_PATH = 
-MPI_LIB =	-lmpich -lmpl -lpthread
+MPI_LIB =
 
 # FFT library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 6)
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
-FFT_INC =       -DFFT_FFTW
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE
 FFT_PATH = 
-FFT_LIB =	-lfftw
+FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
 
 # JPEG and/or PNG library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 7)
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	-ljpeg
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # no need to edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 # Individual dependencies
 
 DEPENDS = $(OBJ:.o=.d)
 sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.linux
index c4264dc22..d835bed04 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.linux
@@ -1,108 +1,108 @@
 # linux = RedHat Linux box, Intel icc, MPICH2, FFTW
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
 CC =		icc
-CCFLAGS =	-O
+CCFLAGS =	-O -DLAMMPS_MEMALIGN=64 -openmp -restrict
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		icc
-LINKFLAGS =	-O
+LINKFLAGS =	-O -openmp
 LIB =           -lstdc++
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings, OPTIONAL
 # see possible settings in doc/Section_start.html#2_2 (step 4)
 
 LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
 
 # MPI library, REQUIRED
 # see discussion in doc/Section_start.html#2_2 (step 5)
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX 
 MPI_PATH = 
 MPI_LIB =	-lmpich -lmpl -lpthread
 
 # FFT library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 6)
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
 FFT_INC =       -DFFT_FFTW
 FFT_PATH = 
 FFT_LIB =	-lfftw
 
 # JPEG and/or PNG library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 7)
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	-ljpeg
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # no need to edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 # Individual dependencies
 
 DEPENDS = $(OBJ:.o=.d)
 sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.stampede
similarity index 82%
copy from src/MAKE/Makefile.linux
copy to src/MAKE/Makefile.stampede
index c4264dc22..8c9591d11 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.stampede
@@ -1,108 +1,109 @@
-# linux = RedHat Linux box, Intel icc, MPICH2, FFTW
+# Stampede, Intel Compiler, MKL FFT, Offload to Xeon Phi
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		icc
-CCFLAGS =	-O
+CC =		mpicc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64
+MIC_OPT =       -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
+CCFLAGS =	-O3 -xAVX -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		icc
-LINKFLAGS =	-O
-LIB =           -lstdc++
+LINK =		mpicc -openmp
+LINKFLAGS =	-O3 -xAVX
+LIB =           
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings, OPTIONAL
 # see possible settings in doc/Section_start.html#2_2 (step 4)
 
 LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
 
 # MPI library, REQUIRED
 # see discussion in doc/Section_start.html#2_2 (step 5)
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX 
 MPI_PATH = 
-MPI_LIB =	-lmpich -lmpl -lpthread
+MPI_LIB =
 
 # FFT library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 6)
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
-FFT_INC =       -DFFT_FFTW
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE -I$(TACC_MKL_INC)
 FFT_PATH = 
-FFT_LIB =	-lfftw
+FFT_LIB =	-L$(TACC_MKL_LIB) -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
 
 # JPEG and/or PNG library, OPTIONAL
 # see discussion in doc/Section_start.html#2_2 (step 7)
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	-ljpeg
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # no need to edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 # Individual dependencies
 
 DEPENDS = $(OBJ:.o=.d)
 sinclude $(DEPENDS)
diff --git a/src/Makefile b/src/Makefile
index f8e70a94d..2c4bb15fa 100755
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,240 +1,240 @@
 # LAMMPS multiple-machine Makefile
 
 SHELL = /bin/bash
 #.IGNORE:
 
 # Definitions
 
 ROOT =	lmp
 EXE =	$(ROOT)_$@
 SRC =	$(wildcard *.cpp)
 INC =	$(wildcard *.h)
 OBJ = 	$(SRC:.cpp=.o)
 
 # Package variables
 
 PACKAGE = asphere body class2 colloid dipole fld gpu granular kim \
 	  kokkos kspace manybody mc meam misc molecule mpiio opt peri poems \
 	  reax replica rigid shock srd voronoi xtc
 
 PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \
-	   user-cuda user-eff user-fep user-lb user-misc user-molfile \
-	   user-omp user-phonon user-qmmm user-reaxc user-sph
+	   user-cuda user-eff user-fep user-intel user-lb user-misc \
+	   user-molfile user-omp user-phonon user-qmmm user-reaxc user-sph
 
 PACKLIB = gpu kim meam poems reax voronoi \
 	  user-atc user-awpmd user-colvars user-qmmm user-cuda user-molfile
 
 PACKALL = $(PACKAGE) $(PACKUSER)
 
 PACKAGEUC = $(shell echo $(PACKAGE) | tr a-z A-Z)
 PACKUSERUC = $(shell echo $(PACKUSER) | tr a-z A-Z)
 
 YESDIR = $(shell echo $(@:yes-%=%) | tr a-z A-Z)
 NODIR  = $(shell echo $(@:no-%=%) | tr a-z A-Z)
 
 # List of all targets
 
 help:
 	@echo ''
 	@echo 'make clean-all           delete all object files'
 	@echo 'make clean-machine       delete object files for one machine'
 	@echo 'make purge               purge obsolete copies of package sources'
 	@echo 'make tar                 create lmp_src.tar.gz of src dir and packages'
 	@echo 'make makelib             create Makefile.lib for static library build'
 	@echo 'make makeshlib           create Makefile.shlib for shared library build'
 	@echo 'make makelist            create Makefile.list used by old makes'
 	@echo 'make -f Makefile.lib machine      build LAMMPS as static library for machine'
 	@echo 'make -f Makefile.shlib machine    build LAMMPS as shared library for machine'
 	@echo 'make -f Makefile.list machine     build LAMMPS from explicit list of files'
 	@echo 'make stubs               build dummy MPI library in STUBS'
 	@echo 'make install-python      install LAMMPS wrapper in Python'
 	@echo ''
 	@echo 'make package             list available packages'
 	@echo 'make package-status (ps) status of all packages'
 	@echo 'make yes-package         install a single package in src dir'
 	@echo 'make no-package          remove a single package from src dir'
 	@echo 'make yes-all             install all packages in src dir'
 	@echo 'make no-all              remove all packages from src dir'
 	@echo 'make yes-standard        install all standard packages'
 	@echo 'make no-standard         remove all standard packages'
 	@echo 'make yes-user            install all user packages'
 	@echo 'make no-user             remove all user packages'
 	@echo 'make no-lib              remove all packages with external libs'
 	@echo ''
 	@echo 'make package-update (pu) replace src files with updated package files'
 	@echo 'make package-overwrite   replace package files with src files'
 	@echo 'make package-diff (pd)   diff src files against package files'
 	@echo ''
 	@echo 'make machine             build LAMMPS where machine is one of:'
 	@echo ''
 	@files="`ls MAKE/Makefile.*`"; \
 	  for file in $$files; do head -1 $$file; done
 	@echo ''
 
 # Build the code
 
 .DEFAULT:
 	@test -f MAKE/Makefile.$@
 	@if [ ! -d Obj_$@ ]; then mkdir Obj_$@; fi
 	@$(SHELL) Make.sh style
 	@cp MAKE/Makefile.$@ Obj_$@/Makefile
 	@if [ ! -e Makefile.package ]; \
 	  then cp Makefile.package.empty Makefile.package; fi
 	@if [ ! -e Makefile.package.settings ]; \
 	  then cp Makefile.package.settings.empty Makefile.package.settings; fi
 	@cp Makefile.package Makefile.package.settings Obj_$@
 	@cd Obj_$@; \
 	$(MAKE) $(MFLAGS) "OBJ = $(OBJ)" "INC = $(INC)" "SHFLAGS =" \
 	  "EXE = ../$(EXE)" ../$(EXE)
 
 # Remove machine-specific object files
 
 clean:
 	@echo 'make clean-all           delete all object files'
 	@echo 'make clean-machine       delete object files for one machine'
 
 clean-all:
 	rm -rf Obj_*
 
 clean-%:
 	rm -rf Obj_$(@:clean-%=%)
 
 purge: Purge.list
 	@echo 'Purging obsolete and auto-generated source files'
 	@for f in `grep -v '#' Purge.list` ;		\
 	    do test -f $$f && rm $$f && echo $$f || : ;		\
 	done
 
 # Create a tarball of src dir and packages
 
 tar:
 	@cd STUBS; $(MAKE) clean
 	@cd ..; tar cvzf src/$(ROOT)_src.tar.gz \
 	  src/Make* src/Package.sh src/MAKE src/*.cpp src/*.h src/STUBS \
 	  $(patsubst %,src/%,$(PACKAGEUC)) $(patsubst %,src/%,$(PACKUSERUC)) \
           --exclude=*/.svn
 	@cd STUBS; $(MAKE)
 	@echo "Created $(ROOT)_src.tar.gz"
 
 # Make MPI STUBS library
 
 stubs:
 	@cd STUBS; $(MAKE) clean; $(MAKE)
 
 # Create Makefile.lib, Makefile.shlib, and Makefile.list
 
 makelib:
 	@$(SHELL) Make.sh style
 	@$(SHELL) Make.sh Makefile.lib
 
 makeshlib:
 	@$(SHELL) Make.sh style
 	@$(SHELL) Make.sh Makefile.shlib
 
 makelist:
 	@$(SHELL) Make.sh style
 	@$(SHELL) Make.sh Makefile.list
 
 # install LAMMPS shared lib and Python wrapper for Python usage
 
 install-python:
 	@python ../python/install.py
 
 # Package management
 
 package:
 	@echo 'Standard packages:' $(PACKAGE)
 	@echo ''
 	@echo 'User-contributed packages:' $(PACKUSER)
 	@echo ''
 	@echo 'make package              list available packages'
 	@echo 'make package-status (ps)  status of all packages'
 	@echo 'make yes-package          install a single package in src dir'
 	@echo 'make no-package           remove a single package from src dir'
 	@echo 'make yes-all              install all packages in src dir'
 	@echo 'make no-all               remove all packages from src dir'
 	@echo 'make yes-standard         install all standard packages'
 	@echo 'make no-standard          remove all standard packages'
 	@echo 'make yes-user             install all user packages'
 	@echo 'make no-user              remove all user packages'
 	@echo 'make no-lib               remove all packages with external libs'
 	@echo ''
 	@echo 'make package-update (pu)  replace src files with package files'
 	@echo 'make package-overwrite    replace package files with src files'
 	@echo 'make package-diff (pd)    diff src files against package file'
 
 yes-all:
 	@for p in $(PACKALL); do $(MAKE) yes-$$p; done
 
 no-all:
 	@for p in $(PACKALL); do $(MAKE) no-$$p; done
 
 yes-standard:
 	@for p in $(PACKAGE); do $(MAKE) yes-$$p; done
 
 no-standard:
 	@for p in $(PACKAGE); do $(MAKE) no-$$p; done
 
 yes-user:
 	@for p in $(PACKUSER); do $(MAKE) yes-$$p; done
 
 no-user:
 	@for p in $(PACKUSER); do $(MAKE) no-$$p; done
 
 no-lib:
 	@for p in $(PACKLIB); do $(MAKE) no-$$p; done
 
 yes-%:
 	@if [ ! -e Makefile.package ]; \
 	  then cp Makefile.package.empty Makefile.package; fi
 	@if [ ! -e Makefile.package.settings ]; \
 	  then cp Makefile.package.settings.empty Makefile.package.settings; fi
 	@if [ ! -e $(YESDIR) ]; then \
 	  echo "Package $(@:yes-%=%) does not exist"; \
 	elif [ -e $(YESDIR)/Install.sh ]; then \
 	  echo "Installing package $(@:yes-%=%)"; \
 	  cd $(YESDIR); $(SHELL) Install.sh 1; cd ..; \
 		$(SHELL) Depend.sh $(YESDIR) 1; \
 	else \
 	  echo "Installing package $(@:yes-%=%)"; \
 	  cd $(YESDIR); $(SHELL) ../Install.sh 1; cd ..; \
 		$(SHELL) Depend.sh $(YESDIR) 1; \
 	fi;
 
 no-%:
 	@if [ ! -e $(NODIR) ]; then \
 	  echo "Package $(@:no-%=%) does not exist"; \
 	elif [ -e $(NODIR)/Install.sh ]; then \
 	  echo "Uninstalling package $(@:no-%=%)"; \
 	  cd $(NODIR); $(SHELL) Install.sh 0; cd ..; \
 		$(SHELL) Depend.sh $(NODIR) 0; \
 	else \
 	  echo "Uninstalling package $(@:no-%=%)"; \
 	  cd $(NODIR); $(SHELL) ../Install.sh 0; cd ..; \
 		$(SHELL) Depend.sh $(NODIR) 0; \
         fi;
 
 # status = list src files that differ from package files
 # update = replace src files with newer package files
 # overwrite = overwrite package files with newer src files
 # diff = show differences between src and package files
 
 package-status ps:
 	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p status; done
 	@echo ''
 	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p status; done
 
 package-update pu:
 	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p update; done
 	@echo ''
 	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p update; done
 
 package-overwrite:
 	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p overwrite; done
 	@echo ''
 	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p overwrite; done
 
 package-diff pd:
 	@for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p diff; done
 	@echo ''
 	@for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p diff; done
diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
new file mode 100644
index 000000000..70fc48306
--- /dev/null
+++ b/src/USER-INTEL/Install.sh
@@ -0,0 +1,107 @@
+# Install/unInstall package files in LAMMPS
+# mode = 0/1/2 for uninstall/install/update
+
+mode=$1
+
+# arg1 = file, arg2 = file it depends on
+
+action () {
+  if (test $mode = 0) then
+    rm -f ../$1
+  elif (! cmp -s $1 ../$1) then
+    if (test -z "$2" || test -e ../$2) then
+      cp $1 ..
+      if (test $mode = 2) then
+        echo "  updating src/$1"
+      fi
+    fi
+  elif (test -n "$2") then
+    if (test ! -e ../$2) then
+      rm -f ../$1
+    fi
+  fi
+}
+
+# step 1: process all *_intel.cpp and *_intel.h files except thr_intel.*
+# parent = file name with _offload_intel/_intel removed; no parent, no install
+
+for file in *_intel.cpp; do
+  test $file = thr_intel.cpp && continue
+  dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
+      sed 's/_offload_intel//g' | sed 's/_intel//g'`
+  action $file $dep
+done
+
+for file in *_intel.h; do
+  test $file = thr_intel.h && continue
+  dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
+  action $file $dep
+done
+# files passed without a parent are never skipped for a missing dependency
+action intel_preprocess.h
+action intel_buffers.h
+action intel_buffers.cpp
+action math_extra_intel.h
+
+# step 2: edit Makefile.package and touch accelerator_intel.h (modes 0/1 only)
+
+if (test $mode = 1) then
+  # install: drop a stale token containing INTEL, then add -DLMP_USER_INTEL
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package
+    sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_USER_INTEL |' ../Makefile.package
+  fi
+
+  # force rebuild of files with LMP_USER_INTEL switch
+
+  touch ../accelerator_intel.h
+
+elif (test $mode = 0) then
+  # uninstall: remove the token containing INTEL from Makefile.package
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package
+  fi
+
+  # force rebuild of files with LMP_USER_INTEL switch
+
+  touch ../accelerator_intel.h
+
+fi
+
+# step 3: map omp styles that are not in the intel package to intel suffix
+
+#if (test $mode = 0) then
+#
+#  rm -f ../*ompinto_intel*
+#
+#else
+#
+#  echo "  The 'intel' suffix will use the USER-OMP package for all"
+#  echo "  angle, bond, dihedral, kspace, and improper styles:"
+#  stylelist="pair fix angle bond dihedral improper"
+#  for header in $stylelist; do
+#    HEADER=`echo $header | sed 's/\(.*\)/\U\1/'`
+#    outfile=../$header"_ompinto_intel.h"
+#    echo "    Creating $header style map: $outfile"
+#    echo -n "// -- Header to map USER-OMP " > $outfile  
+#    echo "styles to the intel suffix" >> $outfile
+#    echo >> $outfile
+#    echo "#ifdef "$HEADER"_CLASS" >> $outfile
+#    grep -h 'Style(' ../$header*_omp.h | grep -v 'charmm/coul/long' | \
+#	grep -v 'lj/cut' | grep -v 'gayberne' | \
+#	sed 's/\/omp/\/intel/g' >> $outfile
+#    echo "#endif" >> $outfile
+#  done
+#
+#  header="kspace"
+#  HEADER="KSPACE"
+#  outfile=../$header"_ompinto_intel.h"
+#  echo "    Creating $header style map: $outfile"
+#  echo -n "// -- Header to map USER-OMP " > $outfile  
+#  echo "styles to the intel suffix" >> $outfile
+#  echo >> $outfile
+#  echo "#ifdef "$HEADER"_CLASS" >> $outfile
+#  grep -h 'KSpaceStyle(' ../*_omp.h | sed 's/\/omp/\/intel/g' >> $outfile
+#  echo "#endif" >> $outfile
+#
+#fi
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
new file mode 100644
index 000000000..0b38928b2
--- /dev/null
+++ b/src/USER-INTEL/README
@@ -0,0 +1,35 @@
+
+                     --------------------------------
+                          LAMMPS Intel Package
+                     --------------------------------
+                     
+                        W. Michael Brown (Intel)
+                       michael.w.brown at intel.com
+
+-----------------------------------------------------------------------------
+
+This package is based on the USER-OMP package and provides LAMMPS styles that:
+
+   1. include support for single and mixed precision in addition to double.
+   2. include modifications to support vectorization for key routines.
+   3. include modifications to support offload to Xeon Phi coprocessors.
+
+-----------------------------------------------------------------------------
+
+When using the suffix command with "intel", intel styles will be used if they
+exist; if they do not, and an omp version exists, that style will be used.
+This is accomplished through the files *ompinto_intel.h that are created
+in the src directory when the intel package is installed. For example,
+
+      kspace_style     pppm/intel 1e-4
+
+is equivalent to:
+
+      kspace_style     pppm/omp   1e-4
+
+because no pppm style has been implemented for the Intel package.
+
+-----------------------------------------------------------------------------
+
+In order to use offload to Xeon Phi, the flag -DLMP_INTEL_OFFLOAD should be
+set in the Makefile. Offload requires the use of Intel compilers.
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
new file mode 100644
index 000000000..8fd3003b4
--- /dev/null
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -0,0 +1,530 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "pair.h"
+#include "pair_hybrid.h"
+#include "pair_hybrid_overlay.h"
+#include "timer.h"
+#include "universe.h"
+#include "update.h"
+#include "fix_intel.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#ifdef __INTEL_OFFLOAD
+#ifndef _LMP_INTEL_OFFLOAD
+#warning "Not building Intel package with Xeon Phi offload support."
+#endif
+#endif
+
+enum{NSQ,BIN,MULTI};
+
+/* ---------------------------------------------------------------------- */
+
+FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
+{
+  // Sets precision/offload defaults, parses the keywords starting at arg[4]
+  // and allocates the buffer object for the selected precision mode.
+  if (narg < 4)
+    error->all(FLERR, "Illegal package intel command");
+  if (strcmp(arg[1],"all") != 0)
+    error->all(FLERR, "fix Intel has to operate on group 'all'");
+
+  _precision_mode = PREC_MODE_MIXED;
+  _offload_balance = 1.0;
+  _overflow_flag[LMP_OVERFLOW] = 0;
+  _off_overflow_flag[LMP_OVERFLOW] = 0;
+
+  _offload_affinity_balanced = 0;
+  _offload_threads = 1;
+  _offload_tpc = 4;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _offload_affinity_set = 0;
+  _off_force_array_s = 0;
+  _off_force_array_m = 0;
+  _off_force_array_d = 0;
+  _off_ev_array_s = 0;
+  _off_ev_array_d = 0;
+  _balance_fixed = 0.0;
+
+  _cop = 0;
+
+  int max_offload_threads, offload_cores;
+  #pragma offload target(mic:_cop) mandatory \
+    out(max_offload_threads,offload_cores)
+  {
+    offload_cores = omp_get_num_procs();
+    omp_set_num_threads(offload_cores);
+    max_offload_threads = omp_get_max_threads();
+  }
+  _max_offload_threads = max_offload_threads;
+  _offload_cores = offload_cores;
+  _offload_threads = offload_cores;
+  #endif
+  int ncops = 1;
+  _allow_separate_buffers = 1;
+  _offload_ghost = -1;
+
+  // integer values are parsed with force->inumeric() (like force->numeric()
+  // for "balance") so malformed numbers are reported instead of accepted
+  int iarg = 4;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg], "mixed") == 0)
+      _precision_mode = PREC_MODE_MIXED;
+    else if (strcmp(arg[iarg], "double") == 0)
+      _precision_mode = PREC_MODE_DOUBLE;
+    else if (strcmp(arg[iarg], "single") == 0)
+      _precision_mode = PREC_MODE_SINGLE;
+    else if (strcmp(arg[iarg], "offload_affinity_balanced") == 0)
+      _offload_affinity_balanced = 1;
+    else if (strcmp(arg[iarg], "balance") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      _offload_balance = force->numeric(FLERR,arg[++iarg]);
+    } else if (strcmp(arg[iarg], "offload_threads") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      _offload_threads = force->inumeric(FLERR,arg[++iarg]);
+    } else if (strcmp(arg[iarg], "offload_tpc") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      _offload_tpc = force->inumeric(FLERR,arg[++iarg]);
+    } else if (strcmp(arg[iarg], "offload_cards") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ncops = force->inumeric(FLERR,arg[++iarg]);
+    } else if (strcmp(arg[iarg], "buffers") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      _allow_separate_buffers = force->inumeric(FLERR,arg[++iarg]);
+    } else if (strcmp(arg[iarg], "offload_ghost") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      _offload_ghost = force->inumeric(FLERR,arg[++iarg]);
+    } else
+      error->all(FLERR, "Illegal package intel mode requested");
+    ++iarg;
+  }
+
+  // reject out-of-range settings
+  if (_offload_balance > 1.0 || _offload_threads <= 0 ||
+      _offload_tpc <= 0 || _offload_tpc > 4)
+    error->all(FLERR, "Illegal package intel mode requested");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _ncops = ncops;
+  if (_offload_balance < 0.0) {
+    _balance_neighbor = 0.9;
+    _balance_pair = 0.9;
+  } else {
+    _balance_neighbor = _offload_balance;
+    _balance_pair = _offload_balance;
+  }
+
+  _tscreen = screen;
+  zero_timers();
+  _setup_time_cleared = false;
+  _timers_allocated = false;
+  #else
+  _offload_balance = 0.0;
+  #endif
+
+  // only the buffer object for the active precision mode is created
+  if (_precision_mode == PREC_MODE_SINGLE)
+    _single_buffers = new IntelBuffers<float,float>(lmp);
+  else if (_precision_mode == PREC_MODE_MIXED)
+    _mixed_buffers = new IntelBuffers<float,double>(lmp);
+  else
+    _double_buffers = new IntelBuffers<double,double>(lmp);
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixIntel::~FixIntel()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  output_timing_data();
+  if (_timers_allocated) {
+    double *pair_watch = off_watch_pair();
+    double *neigh_watch = off_watch_neighbor();
+    int *overflow_flags = get_off_overflow_flag();
+    if (pair_watch != NULL && neigh_watch != NULL && overflow_flags != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(pair_watch,neigh_watch,overflow_flags:alloc_if(0) free_if(1))
+    }
+  }
+  #endif
+  // delete the one buffer object that matches the precision mode
+  if (_precision_mode == PREC_MODE_SINGLE) {
+    delete _single_buffers;
+  } else if (_precision_mode == PREC_MODE_MIXED) {
+    delete _mixed_buffers;
+  } else {
+    delete _double_buffers;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixIntel::setmask()
+{
+  // no bits are set in the returned mask
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::init()
+{
+  // Checks that the neighbor, pair and run settings are ones the Intel
+  // package supports, registers this fix with the neighbor class and calls
+  // zero_ev() on the buffer object of the active precision mode.
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_offload_balance != 0.0) atom->sortfreq = 1;
+
+  // NOTE(review): with newton_pair on and _offload_ghost != 0 this leaves
+  // _offload_noghost at its previous value - confirm that is intended.
+  if (force->newton_pair == 0)
+    _offload_noghost = 0;
+  else if (_offload_ghost == 0)
+    _offload_noghost = 1;
+
+  set_offload_affinity();
+
+  output_timing_data();
+  // first call only: allocate coprocessor storage for timers/overflow flags
+  if (!_timers_allocated) {
+    double *time1 = off_watch_pair();
+    double *time2 = off_watch_neighbor();
+    int *overflow = get_off_overflow_flag();
+    if (time1 != NULL && time2 != NULL && overflow != NULL) {
+      #pragma offload_transfer target(mic:_cop)  \
+        nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
+        in(overflow:length(5) alloc_if(1) free_if(0))
+    }
+    _timers_allocated = true;
+  }
+
+  // print summary of settings
+  if (comm->me == 0 && screen) {
+    if (_offload_balance != 0.0) {
+      fprintf(screen,"using offload with %d threads per core, ",_offload_tpc);
+      fprintf(screen,"%d threads per task\n",_offload_threads);
+    }
+  }
+
+  // pick _sync_at_pair; if it stays 0 the run style name must contain "intel"
+  if (update->whichflag == 2 && _offload_balance != 0.0) {
+    if (_offload_balance == 1.0 && _offload_noghost == 0)
+      _sync_at_pair = 1;
+    else
+      _sync_at_pair = 2;
+  } else {
+    _sync_at_pair = 0;
+    if (strstr(update->integrate_style,"intel") == 0)
+      error->all(FLERR,
+                 "Specified run_style does not support the Intel package.");
+  }
+  #endif
+
+  if (neighbor->style != BIN)
+    error->all(FLERR,
+            "Currently, neighbor style BIN must be used with Intel package.");
+  if (neighbor->exclude_setting() != 0)
+    error->all(FLERR,
+            "Currently, cannot use neigh_modify exclude with Intel package.");
+
+  // count the hybrid sub-styles that ARE intel styles (keyword contains
+  // "/intel"); the check below allows at most one of them
+  int nstyles = 0;
+  if (force->pair_match("hybrid", 1) != NULL) {
+    PairHybrid *hybrid = (PairHybrid *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i], "/intel") != NULL)
+        nstyles++;
+  } else if (force->pair_match("hybrid/overlay", 1) != NULL) {
+    PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i], "/intel") != NULL) {
+        nstyles++;
+        // NOTE(review): flag is still set for intel sub-styles only - confirm
+        force->pair->no_virial_fdotr_compute = 1;
+      }
+  }
+  if (nstyles > 1)
+    error->all(FLERR,
+               "Currently, cannot use more than one intel style with hybrid.");
+
+  neighbor->fix_intel = (void *)this;
+  _nthreads = comm->nthreads;
+
+  check_neighbor_intel();
+  if (_precision_mode == PREC_MODE_SINGLE)
+    _single_buffers->zero_ev();
+  else if (_precision_mode == PREC_MODE_MIXED)
+    _mixed_buffers->zero_ev();
+  else
+    _double_buffers->zero_ev();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::check_neighbor_intel()
+{
+  // flag non-intel requests (offload builds) and reject any skip request
+  #ifdef _LMP_INTEL_OFFLOAD
+  _full_host_list = 0;
+  const bool offloading = (_offload_balance != 0.0);
+  #endif
+  for (int ireq = 0; ireq < neighbor->nrequest; ++ireq) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (offloading && neighbor->requests[ireq]->intel == 0) {
+      _full_host_list = 1;
+      _offload_noghost = 0;
+    }
+    #endif
+    if (neighbor->requests[ireq]->skip)
+      error->all(FLERR, "Cannot yet use hybrid styles with Intel package.");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::sync_coprocessor()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  // pass the first non-zero offload force array on, then reset its pointer
+  if (_offload_balance == 0.0) return;
+  if (_off_force_array_m != 0) {
+    add_off_results(_off_force_array_m, _off_ev_array_d);
+    _off_force_array_m = 0;
+  } else if (_off_force_array_d != 0) {
+    add_off_results(_off_force_array_d, _off_ev_array_d);
+    _off_force_array_d = 0;
+  } else if (_off_force_array_s != 0) {
+    add_off_results(_off_force_array_s, _off_ev_array_s);
+    _off_force_array_s = 0;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+double FixIntel::memory_usage()
+{
+  // Returns what the buffer object of the active precision mode reports
+  // from its own memory_usage(_nthreads).
+
+  if (_precision_mode == PREC_MODE_SINGLE)
+    return _single_buffers->memory_usage(_nthreads);
+  if (_precision_mode == PREC_MODE_MIXED)
+    return _mixed_buffers->memory_usage(_nthreads);
+
+  return _double_buffers->memory_usage(_nthreads);
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef _LMP_INTEL_OFFLOAD
+
+void FixIntel::output_timing_data() {
+  // Print the offload timers, averaged over the ranks of _real_space_comm,
+  // then reset them. Collective; only rank 0 writes (to _tscreen, if set).
+  if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;
+
+  double timer_total = 0.0;
+  int size, rank;
+  double timers[NUM_ITIMERS];
+  MPI_Comm_size(_real_space_comm, &size);
+  MPI_Comm_rank(_real_space_comm, &rank);
+  MPI_Allreduce(&_timers, &timers, NUM_ITIMERS, MPI_DOUBLE, MPI_SUM,
+                _real_space_comm);
+  for (int i=0; i < NUM_ITIMERS; i++) {
+    timers[i] /= size;
+    timer_total += timers[i];
+  }
+  #ifdef TIME_BALANCE
+  double timers_min[NUM_ITIMERS], timers_max[NUM_ITIMERS];
+  MPI_Allreduce(&_timers, &timers_max, NUM_ITIMERS, MPI_DOUBLE, MPI_MAX,
+                _real_space_comm);
+  MPI_Allreduce(&_timers, &timers_min, NUM_ITIMERS, MPI_DOUBLE, MPI_MIN,
+                _real_space_comm);
+  #endif
+
+  if (timer_total > 0.0) {
+    double balance_out[2], balance_in[2];
+    balance_out[0] = _balance_pair;
+    balance_out[1] = _balance_neighbor;
+    MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
+               0, _real_space_comm);
+
+    if (rank == 0 && _tscreen) {
+      // MPI_Reduce only fills balance_in on the root, so average it here
+      balance_in[0] /= size;
+      balance_in[1] /= size;
+      fprintf(_tscreen, "\n------------------------------------------------\n");
+      fprintf(_tscreen, "               Offload Timing Data\n");
+      fprintf(_tscreen, "------------------------------------------------\n");
+      fprintf(_tscreen, "  Data Pack/Cast Seconds    %f\n", timers[TIME_PACK]);
+      if (_offload_balance != 0.0) {
+        fprintf(_tscreen, "  Host Neighbor Seconds     %f\n",
+                timers[TIME_HOST_NEIGHBOR]);
+        fprintf(_tscreen, "  Host Pair Seconds         %f\n",
+                timers[TIME_HOST_PAIR]);
+        fprintf(_tscreen, "  Offload Neighbor Seconds  %f\n",
+                timers[TIME_OFFLOAD_NEIGHBOR]);
+        fprintf(_tscreen, "  Offload Pair Seconds      %f\n",
+                timers[TIME_OFFLOAD_PAIR]);
+        fprintf(_tscreen, "  Offload Wait Seconds      %f\n",
+                timers[TIME_OFFLOAD_WAIT]);
+        fprintf(_tscreen, "  Offload Latency Seconds   %f\n",
+                timers[TIME_OFFLOAD_LATENCY]);
+        fprintf(_tscreen, "  Offload Neighbor Balance  %f\n", balance_in[1]);
+        fprintf(_tscreen, "  Offload Pair Balance      %f\n", balance_in[0]);
+        fprintf(_tscreen, "  Offload Ghost Atoms       ");
+        if (_offload_noghost) fprintf(_tscreen,"No\n");
+        else fprintf(_tscreen,"Yes\n");
+        #ifdef TIME_BALANCE
+        fprintf(_tscreen, "  Offload Imbalance Seconds %f\n",
+                timers[TIME_IMBALANCE]);
+        fprintf(_tscreen, "  Offload Min/Max Seconds   ");
+        for (int i = 0; i < NUM_ITIMERS; i++)
+          fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
+        fprintf(_tscreen, "\n");
+        #endif
+      }
+      fprintf(_tscreen, "------------------------------------------------\n");
+    }
+    zero_timers();
+    _setup_time_cleared = false;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixIntel::get_ppn(int &node_rank) {
+  // Returns the number of ranks in _real_space_comm whose processor name
+  // matches ours; node_rank is set to how many of them have a lower rank.
+  int nprocs, rank;
+  MPI_Comm_size(_real_space_comm, &nprocs);
+  MPI_Comm_rank(_real_space_comm, &rank);
+
+  int name_length;
+  char node_name[MPI_MAX_PROCESSOR_NAME];
+  MPI_Get_processor_name(node_name,&name_length);
+  node_name[name_length] = '\0';
+  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
+  MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
+                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
+  int ppn = 0;
+  node_rank = 0;
+  for (int i = 0; i < nprocs; i++) {
+    if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
+      ppn++;
+      if (i < rank) node_rank++;
+    }
+  }
+  delete [] node_names;  // gathered names are only needed for the count
+  return ppn;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::set_offload_affinity()
+{  // Pick this task's coprocessor (_cop) and pin its offload threads.
+  _separate_buffers = 0;
+  if (_allow_separate_buffers)
+    if (_offload_balance != 0.0 && _offload_balance < 1.0)
+      _separate_buffers = 1;  // any nonzero balance below 1.0, negative too
+
+  _im_real_space_task = 1;
+  if (strncmp(update->integrate_style,"verlet/split",12) == 0) {
+    _real_space_comm = world;
+    if (universe->iworld != 0) {
+      _im_real_space_task = 0;  // only world 0 counts as real-space here
+      return;
+    }
+  } else
+    _real_space_comm = universe->uworld;
+
+  if (_offload_balance == 0.0) _cop = -1;  // balance of 0: no coprocessor
+  if (_offload_balance == 0.0 || _offload_affinity_set == 1)
+    return;
+
+  _offload_affinity_set = 1;  // the setup below runs at most once
+  int node_rank;
+  int ppn = get_ppn(node_rank);
+
+  if (ppn % _ncops != 0)
+    error->all(FLERR, "MPI tasks per node must be multiple of offload_cards");
+  ppn = ppn / _ncops;  // tasks sharing one coprocessor
+  _cop = node_rank / ppn;  // coprocessor index for this task
+  node_rank = node_rank % ppn;  // this task's rank among those sharing _cop
+
+  int max_threads_per_task = _offload_cores / 4 * _offload_tpc / ppn;
+  if (_offload_threads > max_threads_per_task)
+    _offload_threads = max_threads_per_task;
+  if (_offload_threads > _max_offload_threads)
+    _offload_threads = _max_offload_threads;
+
+  int offload_threads = _offload_threads;  // locals listed in the in() clause
+  int offload_tpc = _offload_tpc;
+  int offload_affinity_balanced = _offload_affinity_balanced;
+  #pragma offload target(mic:_cop) mandatory \
+    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced)
+  {
+    omp_set_num_threads(offload_threads);
+    #pragma omp parallel
+    {
+      int tnum = omp_get_thread_num();
+      kmp_affinity_mask_t mask;
+      kmp_create_affinity_mask(&mask);
+      int proc;
+      if (offload_affinity_balanced) {
+	proc = offload_threads * node_rank + tnum;
+	proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;  // NOTE(review): 4/60/240 presumably threads per core, cores, hw threads - confirm
+      } else {
+	proc = offload_threads * node_rank + tnum;
+	proc += (proc / 4) * (4 - offload_tpc) + 1;
+      }
+      kmp_set_affinity_mask_proc(proc, &mask);
+      if (kmp_set_affinity(&mask) != 0)
+	printf("Could not set affinity on rank %d thread %d to %d\n",
+	       node_rank, tnum, proc);
+    }
+  }
+  if (_precision_mode == PREC_MODE_SINGLE)  // pass the settings to the active buffers
+    _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
+  else if (_precision_mode == PREC_MODE_MIXED)
+    _mixed_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
+  else
+    _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
+}
+
+#endif
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
new file mode 100644
index 000000000..82ebc734a
--- /dev/null
+++ b/src/USER-INTEL/fix_intel.h
@@ -0,0 +1,593 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(Intel,FixIntel)
+
+#else
+
+#ifndef LMP_FIX_INTEL_H
+#define LMP_FIX_INTEL_H
+
+#include "fix.h"
+#include "intel_buffers.h"
+#include "force.h"
+#include "pair.h"
+#include "error.h"
+#include "update.h"
+
+namespace LAMMPS_NS {
+
+class IntelData;
+template <class flt_t, class acc_t> class IntelBuffers;
+ 
+class FixIntel : public Fix {  // holds the precision-specific buffers plus offload balance/timing state
+ public:
+  FixIntel(class LAMMPS *, int, char **);
+  virtual ~FixIntel();
+  virtual int setmask();
+  virtual void init();
+
+  // Get all forces, calculation results from coprocessor
+  void sync_coprocessor();
+
+  double memory_usage();  // usage reported by the active precision's buffers
+
+  typedef struct { double x,y,z; } lmp_ft;  // xyz triple; add_oresults() views atom->f and atom->torque through it
+
+  enum {PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE};
+  
+  inline int precision() { return _precision_mode; }
+  inline IntelBuffers<float,float> * get_single_buffers() 
+    { return _single_buffers; }
+  inline IntelBuffers<float,double> * get_mixed_buffers() 
+    { return _mixed_buffers; }
+  inline IntelBuffers<double,double> * get_double_buffers() 
+    { return _double_buffers; }
+
+ protected:
+  IntelBuffers<float,float> *_single_buffers;
+  IntelBuffers<float,double> *_mixed_buffers;
+  IntelBuffers<double,double> *_double_buffers;
+
+  int _precision_mode, _nthreads;
+
+ public:
+  inline int* get_overflow_flag() { return _overflow_flag; }
+  inline int* get_off_overflow_flag() { return _off_overflow_flag; }
+  inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
+                               double *ev_in, const int offload,
+                               const int eatom = 0, const int vatom = 0);
+  inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
+                               double *ev_in, const int offload,
+                               const int eatom = 0, const int vatom = 0);
+  inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
+                               float *ev_in, const int offload,
+                               const int eatom = 0, const int vatom = 0);
+  inline void get_buffern(const int offload, int &nlocal, int &nall, 
+			  int &minlocal);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  inline int coprocessor_number() { return _cop; }
+  inline int full_host_list() { return _full_host_list; }
+  void set_offload_affinity();
+  inline double offload_balance() { return _offload_balance; }
+  inline int offload_end_neighbor() { return _balance_neighbor * atom->nlocal; }
+  inline int offload_end_pair();
+  inline int host_start_neighbor()
+    { if (_offload_noghost) return 0; else return offload_end_neighbor(); }
+  inline int host_start_pair()
+    { if (_offload_noghost) return 0; else return offload_end_pair(); }
+  inline int offload_nlocal() { return _offload_nlocal; }
+  inline int offload_nall() { return _offload_nall; }
+  inline int offload_min_ghost() { return _offload_min_ghost; }
+  inline int host_min_local() { return _host_min_local; }
+  inline int host_min_ghost() { return _host_min_ghost; }
+  inline int host_used_local() { return _host_used_local; }
+  inline int host_used_ghost() { return _host_used_ghost; }
+  inline int host_nall() { return _host_nall; }
+  inline int separate_buffers() { return _separate_buffers; }
+  inline int offload_noghost() { return _offload_noghost; }
+  inline void set_offload_noghost(const int v) 
+    { if (_offload_ghost < 0) _offload_noghost = v; }  // ignored unless _offload_ghost is negative
+  inline void set_neighbor_host_sizes();
+
+  inline void zero_timers()
+    { memset(_timers, 0, sizeof(double) * NUM_ITIMERS); }
+  inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); }
+  inline double stop_watch(const int which);
+  inline double * off_watch_pair() { return _stopwatch_offload_pair; }
+  inline double * off_watch_neighbor() { return _stopwatch_offload_neighbor; }
+  inline void balance_stamp();
+  inline void acc_timers();
+  #else
+  // stubs used when the package is built without _LMP_INTEL_OFFLOAD
+  inline int offload_end_neighbor() { return 0; }
+  inline int offload_end_pair() { return 0; }
+  inline int host_start_neighbor() { return 0; }
+  inline int host_start_pair() { return 0; }
+  inline void zero_timers() {}
+  inline void start_watch(const int which) {}
+  inline double stop_watch(const int which) { return 0.0; }
+  double * off_watch_pair() { return NULL; }
+  double * off_watch_neighbor() { return NULL; }
+  inline void balance_stamp() {}
+  inline void acc_timers() {}
+  inline int separate_buffers() { return 0; }
+  #endif
+
+ protected:
+  int _overflow_flag[5];  // indexed with LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_GHOST_MIN, LMP_GHOST_MAX, ...
+  __declspec(align(64)) int _off_overflow_flag[5];
+  int _allow_separate_buffers, _offload_ghost;
+  #ifdef _LMP_INTEL_OFFLOAD
+  double _balance_pair_time, _balance_other_time;
+  int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
+  int _host_min_local, _host_min_ghost, _host_nall;
+  int _host_used_local, _host_used_ghost;
+  int _separate_buffers, _offload_noghost, _sync_at_pair;
+  bool _setup_time_cleared, _timers_allocated;
+  void output_timing_data();
+  FILE *_tscreen;
+
+  // result arrays recorded by add_result_array() until sync_coprocessor()
+  IntelBuffers<float,float>::vec3_acc_t *_off_force_array_s;
+  IntelBuffers<float,double>::vec3_acc_t *_off_force_array_m;
+  IntelBuffers<double,double>::vec3_acc_t *_off_force_array_d;
+  float *_off_ev_array_s;
+  double *_off_ev_array_d;
+  int _off_results_eatom, _off_results_vatom;
+  int _full_host_list, _cop, _ncops;
+
+  int get_ppn(int &);
+  #endif
+  void check_neighbor_intel();
+
+  double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
+  double _timers[NUM_ITIMERS];
+  double _stopwatch[NUM_ITIMERS];
+  __declspec(align(64)) double _stopwatch_offload_neighbor[1];
+  __declspec(align(64)) double _stopwatch_offload_pair[1];
+
+  template <class ft, class acc_t>
+  inline void add_results(const ft * restrict const f_in,
+                          const acc_t * restrict const ev_global,
+                          const int eatom, const int vatom,
+			  const int offload);
+
+  template <class ft, class acc_t>
+  inline void add_oresults(const ft * restrict const f_in,
+			   const acc_t * restrict const ev_global,
+			   const int eatom, const int vatom,
+			   const int out_offset, const int nall);
+
+  int _offload_affinity_balanced, _offload_threads, _offload_tpc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _max_offload_threads, _offload_cores, _offload_affinity_set;
+  int _im_real_space_task;
+  MPI_Comm _real_space_comm;
+  template <class ft, class acc_t>
+  inline void add_off_results(const ft * restrict const f_in,
+                              const acc_t * restrict const ev_global);
+  #endif
+};
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
+                           int &minlocal) {
+  // Fill in nlocal, nall and minlocal for the host (offload == 0) or
+  // offload buffer.
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    if (!offload) {
+      nlocal = atom->nlocal;
+      nall = _host_nall;
+      minlocal = _host_min_local;
+      return;
+    }
+    // offload side: the stored counts apply unless neighbor->ago is 0
+    const bool ago_zero = (neighbor->ago == 0);
+    nlocal = ago_zero ? atom->nlocal : _offload_nlocal;
+    nall = ago_zero ? nlocal + atom->nghost : _offload_nall;
+    minlocal = 0;
+    return;
+  }
+  #endif
+  nlocal = atom->nlocal;
+  minlocal = 0;
+  nall = atom->nlocal + atom->nghost;
+  #ifdef _LMP_INTEL_OFFLOAD
+  // with _offload_noghost the offload buffer's nall excludes ghosts
+  if (_offload_noghost && offload) nall = atom->nlocal;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
+                                double *ev_in, const int offload,
+                                const int eatom, const int vatom) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {  // only record the arrays; sync_coprocessor() adds them
+    _off_force_array_d = f_in;
+    _off_ev_array_d = ev_in;
+    _off_results_eatom = eatom;
+    _off_results_vatom = vatom;
+    if (_sync_at_pair == 1) sync_coprocessor();
+    return;
+  }
+  #endif
+  add_results(f_in, ev_in, eatom, vatom, 0);
+  if (_overflow_flag[LMP_OVERFLOW] != 0)
+    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_sync_at_pair != 0) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
+                                double *ev_in, const int offload,
+                                const int eatom, const int vatom) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {  // only record the arrays; sync_coprocessor() adds them
+    _off_force_array_m = f_in;
+    _off_ev_array_d = ev_in;
+    _off_results_eatom = eatom;
+    _off_results_vatom = vatom;
+    if (_sync_at_pair == 1) sync_coprocessor();
+    return;
+  }
+  #endif
+  add_results(f_in, ev_in, eatom, vatom, 0);
+  if (_overflow_flag[LMP_OVERFLOW] != 0)
+    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_sync_at_pair != 0) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
+                                float *ev_in, const int offload,
+                                const int eatom, const int vatom) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {  // only record the arrays; sync_coprocessor() adds them
+    _off_force_array_s = f_in;
+    _off_ev_array_s = ev_in;
+    _off_results_eatom = eatom;
+    _off_results_vatom = vatom;
+    if (_sync_at_pair == 1) sync_coprocessor();
+    return;
+  }
+  #endif
+  add_results(f_in, ev_in, eatom, vatom, 0);
+  if (_overflow_flag[LMP_OVERFLOW] != 0)
+    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_sync_at_pair != 0) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class ft, class acc_t>
+void FixIntel::add_results(const ft * restrict const f_in,
+                           const acc_t * restrict const ev_global,
+                           const int eatom, const int vatom,
+                           const int offload) {
+  // Add the contents of f_in (and ev_global) to the LAMMPS arrays via
+  // add_oresults(); the whole call is timed under TIME_PACK.
+  start_watch(TIME_PACK);
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    // destination windows for the local and ghost parts of this buffer
+    int local_min, local_num, ghost_min, ghost_num;
+    if (offload) {
+      local_min = 0;
+      local_num = _offload_nlocal;
+      ghost_min = _offload_min_ghost;
+      ghost_num = _offload_nghost;
+    } else {
+      local_min = _host_min_local;
+      local_num = _host_used_local;
+      ghost_min = _host_min_ghost;
+      ghost_num = _host_used_ghost;
+    }
+
+    add_oresults(f_in, ev_global, eatom, vatom, local_min, local_num);
+    if (force->newton_pair) {
+      // ghost entries follow the locals (two slots per atom with torque);
+      // ev_global went in with the locals, so a null pointer is passed here
+      const acc_t * restrict const enull = 0;
+      const int skip = atom->torque ? 2 * local_num : local_num;
+      add_oresults(f_in + skip, enull, eatom, vatom, ghost_min, ghost_num);
+    }
+    stop_watch(TIME_PACK);
+    return;
+  }
+  const bool add_ghosts =
+    force->newton_pair && (_offload_noghost == 0 || offload == 0);
+  #else
+  const bool add_ghosts = force->newton_pair;
+  #endif
+
+  const int f_length =
+    add_ghosts ? atom->nlocal + atom->nghost : atom->nlocal;
+  add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
+  stop_watch(TIME_PACK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class ft, class acc_t>
+void FixIntel::add_oresults(const ft * restrict const f_in,
+			    const acc_t * restrict const ev_global,
+			    const int eatom, const int vatom,
+			    const int out_offset, const int nall) {  // vatom is not read in this body
+  lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset;  // force array as xyz triples, shifted by out_offset
+  if (atom->torque) {
+    if (f_in[1].w)  // with torque, a nonzero f_in[1].w is treated as an error code
+      if (f_in[1].w == 1)  // the else below pairs with this inner if
+        error->all(FLERR,"Bad matrix inversion in mldivide3");
+      else
+        error->all(FLERR,
+                   "Sphere particles not yet supported for gayberne/intel");
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none)
+  #endif
+  {
+    const int tid = omp_get_thread_num();
+    int ifrom, ito;
+    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));  // presumably sets this thread's [ifrom, ito) range - confirm in macro
+    if (atom->torque) {
+      int ii = ifrom * 2;  // input holds two entries per atom: force at ii, torque at ii+1
+      lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] +
+	out_offset;
+      if (eatom) {
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[ii].x;
+          f[i].y += f_in[ii].y;
+          f[i].z += f_in[ii].z;
+          force->pair->eatom[i] += f_in[ii].w;  // NOTE(review): eatom is indexed without out_offset, unlike f and tor - confirm
+          tor[i].x += f_in[ii+1].x;
+          tor[i].y += f_in[ii+1].y;
+          tor[i].z += f_in[ii+1].z;
+          ii += 2;
+        }
+      } else {
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[ii].x;
+          f[i].y += f_in[ii].y;
+          f[i].z += f_in[ii].z;
+          tor[i].x += f_in[ii+1].x;
+          tor[i].y += f_in[ii+1].y;
+          tor[i].z += f_in[ii+1].z;
+          ii += 2;
+        }
+      }
+    } else {
+      if (eatom) {
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[i].x;
+          f[i].y += f_in[i].y;
+          f[i].z += f_in[i].z;
+          force->pair->eatom[i] += f_in[i].w;  // per-atom energy is carried in the w component
+        }
+      } else {
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[i].x;
+          f[i].y += f_in[i].y;
+          f[i].z += f_in[i].z;
+        }
+      }
+    }
+  }
+
+  if (ev_global != NULL) {  // layout: [0] eng_vdwl, [1] eng_coul, [2..7] virial
+    force->pair->eng_vdwl += ev_global[0];
+    force->pair->eng_coul += ev_global[1];
+    force->pair->virial[0] += ev_global[2];
+    force->pair->virial[1] += ev_global[3];
+    force->pair->virial[2] += ev_global[4];
+    force->pair->virial[3] += ev_global[5];
+    force->pair->virial[4] += ev_global[6];
+    force->pair->virial[5] += ev_global[7];
+  }
+}
+
+#ifdef _LMP_INTEL_OFFLOAD
+
+/* ---------------------------------------------------------------------- */
+
+int FixIntel::offload_end_pair() {
+  const double frac = (neighbor->ago == 0) ? _balance_neighbor : _balance_pair;
+  return frac * atom->nlocal;  // double product converted to int on return
+}
+
+/* ---------------------------------------------------------------------- */
+
+double FixIntel::stop_watch(const int which) {  // time since start_watch(which)
+  const double dt = MPI_Wtime() - _stopwatch[which];
+  _timers[which] += dt;
+  return dt;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::balance_stamp() {
+  // Does nothing unless _offload_balance is negative.
+  if (!(_offload_balance < 0.0)) return;
+  const double now = MPI_Wtime();
+  _balance_pair_time = now - _stopwatch[TIME_HOST_PAIR];
+  _balance_other_time = now;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::acc_timers() {
+  const bool ago_zero = (neighbor->ago == 0);
+  if (ago_zero) _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
+  if (ago_zero && !_setup_time_cleared) {
+    // wipe everything accumulated so far, once per reset of the flag
+    zero_timers();
+    _setup_time_cleared = true;
+  }
+  _timers[TIME_OFFLOAD_PAIR] += _stopwatch_offload_pair[0];
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixIntel::set_neighbor_host_sizes() {
+  _host_min_local = _overflow_flag[LMP_LOCAL_MIN];
+  _host_min_ghost = _overflow_flag[LMP_GHOST_MIN];
+  _host_used_local = atom->nlocal - _host_min_local;
+  const int ghost_span = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost;
+  _host_used_ghost = (ghost_span < 0) ? 0 : ghost_span;  // clamp at zero
+  _host_nall = atom->nlocal + _host_used_ghost;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class ft, class acc_t>
+void FixIntel::add_off_results(const ft * restrict const f_in,
+                               const acc_t * restrict const ev_global) {
+  // Wait on the offload signalled with f_in, refresh the offload atom
+  // counts and (when _offload_balance < 0) the load-balance fractions,
+  // then add the results through add_results().
+  if (_offload_balance < 0.0)
+    _balance_other_time = MPI_Wtime() - _balance_other_time;
+
+  start_watch(TIME_OFFLOAD_WAIT);
+  #ifdef _LMP_INTEL_OFFLOAD
+  #pragma offload_wait target(mic:_cop) wait(f_in)
+  #endif
+  double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
+
+  if (neighbor->ago == 0) {
+    if (_off_overflow_flag[LMP_OVERFLOW])
+      error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+    _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
+    _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
+    _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
+      _offload_min_ghost;
+    if (_offload_nghost < 0) _offload_nghost = 0;
+    _offload_nall = _offload_nlocal + _offload_nghost;
+  }
+
+  // Load balance?
+  if (_offload_balance < 0.0) {
+    if (neighbor->ago == 0)
+      _balance_pair = _balance_neighbor;
+    double mic_time = *_stopwatch_offload_pair;
+    if (_balance_pair_time + _balance_other_time < mic_time) {
+      double fixed_time = _balance_pair_time + _balance_other_time +
+          wait_time - mic_time;  // named so it does not redeclare ft
+      _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
+          INTEL_LB_MEAN_WEIGHT * fixed_time;
+    }
+
+    double ctps = _balance_pair_time / (1.0-_balance_pair);
+    double otps = mic_time / _balance_pair;
+    double new_balance = (ctps + _balance_other_time - _balance_fixed) /
+        (otps + ctps);
+    if (new_balance < 0.01) new_balance = 0.01;
+    else if (new_balance > 0.99) new_balance = 0.99;
+    _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
+        INTEL_LB_MEAN_WEIGHT * new_balance;
+  }
+
+  #ifdef TIME_BALANCE
+  start_watch(TIME_IMBALANCE);
+  MPI_Barrier(_real_space_comm);
+  stop_watch(TIME_IMBALANCE);
+  #endif
+  acc_timers();
+  if (atom->torque)
+    if (f_in[1].w < 0.0)
+      error->all(FLERR, "Bad matrix inversion in mldivide3");
+  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
+}
+
+#endif
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Neighbor list overflow, boost neigh_modify one
+
+Increase the value for neigh_modify one to allow for larger allocations for
+neighbor list builds. The value required can be different for the Intel
+package in order to support offload to a coprocessor.
+
+E: Bad matrix inversion in mldivide3
+
+This error should not occur unless the matrix is badly formed.
+
+E: Illegal package intel command
+
+The format for the package intel command is incorrect. Please see the
+documentation.
+
+E: fix intel has to operate on group 'all'
+
+Self-explanatory.
+
+E: Illegal package intel mode requested
+
+The format for the package intel command is incorrect. Please see the
+documentation.
+
+E: Specified run_style does not support the Intel package.
+
+When using offload to a coprocessor, the Intel package requires a run style
+with the intel suffix.
+
+E: Currently, neighbor style BIN must be used with Intel package.
+
+This is the only neighbor style that has been implemented for the Intel
+package.
+
+E: Currently, cannot use neigh_modify exclude with Intel package.
+
+This is a current restriction of the Intel package.
+
+E: Currently, cannot use more than one intel style with hybrid.
+
+Currently, hybrid pair styles can only use the intel suffix for one of the
+pair styles.
+
+E: Cannot yet use hybrid styles with Intel package.
+
+The hybrid pair style configuration is not yet supported by the Intel 
+package. Support is limited to hybrid/overlay or a hybrid style that does 
+not require a skip list.
+
+E: MPI tasks per node must be multiple of offload_cards
+
+For offload to multiple coprocessors on a single node, the Intel package
+requires that each coprocessor is used by the same number of MPI tasks.
+
+*/
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
new file mode 100644
index 000000000..a541f0f35
--- /dev/null
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -0,0 +1,432 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "intel_buffers.h"
+#include "force.h"
+#include "memory.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
+    lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
+    _off_map_maxlocal(0), _list_alloc_atoms(0), _ntypes(0), _buf_size(0),
+    _buf_local_size(0) {
+  // Start with null per-atom buffers and zeroed size counters; every free_*
+  // routine checks its counter before releasing anything, so nothing is
+  // freed until the matching grow/set routine has allocated it.
+  #ifdef _LMP_INTEL_OFFLOAD
+  _off_map_maxhead = 0;
+  _off_map_nmax = 0;
+  _off_map_ilist = 0;
+  _off_list_alloc = false;
+  _off_f = 0;
+  _separate_buffers = 0;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+IntelBuffers<flt_t, acc_t>::~IntelBuffers()
+{
+  free_buffers();           // per-atom x/q/quat/f buffers
+  free_all_nbor_buffers();  // neighbor-list storage and its offload mappings
+  set_ntypes(0);            // ntypes == 0 only releases the _cutneighsq table
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_buffers()
+{  // release the per-atom x/q/quat/f buffers (host and coprocessor copies)
+  if (_buf_size > 0) {  // zero means nothing has been allocated yet
+    atom_t * x = get_x();
+    flt_t * q = get_q();
+    quat_t * quat = get_quat();
+    // with the default argument these getters return _x, _q and _quat
+    #ifdef _LMP_INTEL_OFFLOAD
+    vec3_acc_t * f_start = get_off_f();
+    if (f_start != 0) {
+      acc_t * ev_global = get_ev_global();
+      if (ev_global != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x:alloc_if(0) free_if(1)) \
+	  nocopy(f_start:alloc_if(0) free_if(1)) \
+	  nocopy(ev_global:alloc_if(0) free_if(1))
+      }
+      // q and quat are optional buffers, so each is released separately
+      if (q != 0) {
+        #pragma offload_transfer target (mic:_cop) \
+          nocopy(q:alloc_if(0) free_if(1))
+      }
+      if (quat != 0) {
+        #pragma offload_transfer target (mic:_cop) \
+          nocopy(quat:alloc_if(0) free_if(1))
+      }
+      lmp->memory->destroy(f_start);
+    }
+    // host-side duplicates exist only when separate buffers are enabled
+    if (_separate_buffers) {
+      lmp->memory->destroy(_host_x);
+      if (q != 0) lmp->memory->destroy(_host_q);
+      if (quat != 0) lmp->memory->destroy(_host_quat);
+    }
+    #endif
+    // NOTE(review): x/q/quat are local copies, so _x/_q/_quat are not reset here
+    lmp->memory->destroy(x);
+    if (q != 0) lmp->memory->destroy(q);
+    if (quat != 0) lmp->memory->destroy(quat);
+    lmp->memory->destroy(_f);
+    _buf_size = _buf_local_size = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, 
+				       const int nthreads,
+				       const int offload_end)
+{  // reallocate every per-atom buffer for nall atoms plus ~10% headroom
+  free_buffers();
+  _buf_size = static_cast<double>(nall) * 1.1 + 1;
+  if (lmp->force->newton_pair)
+    _buf_local_size = _buf_size;  // newton on: size forces like positions
+  else
+    _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
+  if (lmp->atom->torque)
+    _buf_local_size *= 2;  // NOTE(review): get_stride() doubles again for torque - confirm
+  const int f_stride = get_stride(_buf_local_size);
+  lmp->memory->create(_x, _buf_size,"intel_x");
+  if (lmp->atom->q != NULL)
+    lmp->memory->create(_q, _buf_size, "intel_q");
+  if (lmp->atom->ellipsoid != NULL)
+    lmp->memory->create(_quat, _buf_size, "intel_quat");
+  lmp->memory->create(_f, f_stride * nthreads, "intel_f");  // one stride per thread
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    lmp->memory->create(_host_x, _buf_size,"intel_host_x");
+    if (lmp->atom->q != NULL)
+      lmp->memory->create(_host_q, _buf_size, "intel_host_q");
+    if (lmp->atom->ellipsoid != NULL)
+      lmp->memory->create(_host_quat, _buf_size, "intel_host_quat");
+  }
+  // allocate matching coprocessor storage; the nocopy clauses move no data
+  if (offload_end > 0) {
+    lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f");
+    const atom_t * const x = get_x();
+    const flt_t * const q = get_q();
+    const vec3_acc_t * f_start = get_off_f();
+    acc_t * ev_global = get_ev_global();
+    if (lmp->atom->q != NULL) {
+      if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
+	  nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+      }
+    } else {
+      if (x != NULL && f_start != NULL && ev_global != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
+          nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+      }
+    }
+    if (lmp->atom->ellipsoid != NULL) {
+      const quat_t * const quat = get_quat();
+      if (quat != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(quat:length(_buf_size) alloc_if(1) free_if(0))
+      }
+    }
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_nmax()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  // nothing is mapped on the coprocessor while _off_map_nmax is zero
+  if (_off_map_nmax <= 0) return;
+  const int * bins = _off_map_bins;
+  const int * tag = _off_map_tag;
+  const int * nspecial = _off_map_nspecial;
+  const int * special = _off_map_special;
+  if (bins && tag && nspecial && special) {
+    #pragma offload_transfer target(mic:_cop) \
+      nocopy(tag:alloc_if(0) free_if(1)) \
+      nocopy(special,nspecial:alloc_if(0) free_if(1)) \
+      nocopy(bins:alloc_if(0) free_if(1))
+  }
+  _off_map_nmax = 0;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_nmax()
+{  // (re)allocate coprocessor storage for the nmax-sized atom/neighbor arrays
+  #ifdef _LMP_INTEL_OFFLOAD
+  free_nmax();
+  int *special, *nspecial;
+  int tag_length, special_length, nspecial_length;
+  int size = lmp->atom->nmax;
+  if (lmp->atom->molecular) {
+    special = lmp->atom->special[0];
+    nspecial = lmp->atom->nspecial[0];
+    special_length = size * lmp->atom->maxspecial;
+    nspecial_length = size * 3;
+    tag_length = size;
+  } else {  // non-molecular: one-int placeholders stand in for special/nspecial
+    special = &_special_holder;
+    nspecial = &_nspecial_holder;
+    special_length = 1;
+    nspecial_length = 1;
+    tag_length = 1;
+  }
+  int *tag = lmp->atom->tag;
+  int *bins = lmp->neighbor->bins;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(bins:length(size) alloc_if(1) free_if(0)) \
+    nocopy(tag:length(tag_length) alloc_if(1) free_if(0)) \
+    nocopy(special:length(special_length) alloc_if(1) free_if(0)) \
+    nocopy(nspecial:length(nspecial_length) alloc_if(1) free_if(0))
+  _off_map_tag = tag;  // remember the mapped host pointers for free_nmax()
+  _off_map_special = special;
+  _off_map_nspecial = nspecial;
+  _off_map_nmax = size;
+  _off_map_bins = bins;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_local()
+{
+  // _off_map_maxlocal doubles as the "cnumneigh is allocated" flag
+  if (_off_map_maxlocal <= 0) return;
+  int * cnumneigh = _cnumneigh;
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * ilist = _off_map_ilist;
+  if (ilist != NULL) {
+    const int * numneigh = _off_map_numneigh;
+    _off_map_ilist = NULL;
+    if (numneigh != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
+    }
+  }
+  #endif
+  lmp->memory->destroy(cnumneigh);
+  _off_map_maxlocal = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_local(NeighList *list, 
+					     const int offload_end)
+{  // reallocate _cnumneigh to list->get_maxlocal() entries
+  free_local();
+  int size = list->get_maxlocal();
+  lmp->memory->create(_cnumneigh, size, "_cnumneigh");
+  _off_map_maxlocal = size;
+  // with offload, also allocate ilist/numneigh/cnumneigh on the coprocessor
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_end > 0) {
+    int * numneigh = list->numneigh;
+    int * ilist = list->ilist;
+    int * cnumneigh = _cnumneigh;
+    if (cnumneigh != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
+	nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
+	nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
+    }
+    _off_map_ilist = ilist;  // remembered so free_local() can release them
+    _off_map_numneigh = numneigh;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_binhead()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  // a zero _off_map_maxhead means no bin-head array is mapped
+  if (_off_map_maxhead <= 0) return;
+  const int * binhead = _off_map_binhead;
+  if (binhead != 0) {
+    #pragma offload_transfer target(mic:_cop) \
+      nocopy(binhead:alloc_if(0) free_if(1))
+  }
+  _off_map_maxhead = 0;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_binhead()
+{  // (re)allocate coprocessor storage for neighbor->binhead
+  #ifdef _LMP_INTEL_OFFLOAD
+  free_binhead();  // drop any previous mapping first
+  int * binhead = lmp->neighbor->binhead;
+  const int maxhead = lmp->neighbor->maxhead;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(binhead:length(maxhead) alloc_if(1) free_if(0))
+  _off_map_binhead = binhead;  // remembered for free_binhead()
+  _off_map_maxhead = maxhead;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_nbor_list()
+{  // release the neighbor-list storage: coprocessor copy first, then host
+  if (_list_alloc_atoms > 0) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_off_list_alloc) {
+      // free the coprocessor copy while _list_alloc is still a live pointer
+      int * list_alloc = _list_alloc;
+      int * special_flag = lmp->neighbor->special_flag_alloc();
+      int * stencil = _off_map_stencil;
+      if (list_alloc != 0 && special_flag != 0 && stencil != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(special_flag,stencil:alloc_if(0) free_if(1)) \
+          nocopy(list_alloc:alloc_if(0) free_if(1))
+      }
+      _off_list_alloc = false;
+    }
+    #endif
+    // host release comes last: destroy() acts directly on the _list_alloc member
+    lmp->memory->destroy(_list_alloc);
+    _list_alloc_atoms = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list, 
+						 const int nlocal,
+						 const int offload_end)
+{  // reallocate the packed neighbor-list storage for nlocal atoms
+  free_nbor_list();
+  _list_alloc_atoms = 1.10 * nlocal;  // ~10% headroom, truncated to int
+  int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
+  lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_end > 0) {
+    int * list_alloc =_list_alloc;
+    int * special_flag = lmp->neighbor->special_flag;
+    int * stencil = list->stencil;
+    // special_flag and stencil are copied in; list_alloc is only allocated
+    if (special_flag != NULL && list_alloc != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        in(special_flag:length(4) alloc_if(1) free_if(0)) \
+	in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
+	nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
+      _off_map_stencil = stencil;  // remembered for free_nbor_list()
+      _off_list_alloc = true;
+    }
+  }
+  #endif
+}
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_stencil(NeighList *list)
+{  // swap the coprocessor stencil mapping over to the list's current stencil
+  #ifdef _LMP_INTEL_OFFLOAD
+  int * stencil = _off_map_stencil;  // previously mapped host pointer
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(stencil:alloc_if(0) free_if(1))
+  stencil = list->stencil;  // new stencil: allocate and copy in
+  #pragma offload_transfer target(mic:_cop) \
+    in(stencil:length(list->maxstencil) alloc_if(1) free_if(0))
+  _off_map_stencil = stencil;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+{
+  // resize the ntypes x ntypes cutoff table; same size means nothing to do
+  if (ntypes == _ntypes) return;
+  if (_ntypes > 0) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * cutneighsqo = _cutneighsq[0];
+    if (cutneighsqo != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(cutneighsqo:alloc_if(0) free_if(1))
+    }
+    #endif
+    lmp->memory->destroy(_cutneighsq);
+  }
+  if (ntypes > 0) {
+    lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * cutneighsqo = _cutneighsq[0];
+    if (cutneighsqo != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+    }
+    #endif
+  }
+  _ntypes = ntypes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
+{
+  // returns an estimate, in bytes, of the buffer storage held by this object
+  double tmem = sizeof(atom_t);
+  if (lmp->atom->q) tmem += sizeof(flt_t);
+  if (lmp->atom->torque) tmem += sizeof(quat_t);  // NOTE(review): _grow keys _quat on ellipsoid
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) tmem *= 2;  // separate host copies of x/q/quat
+  #endif
+  tmem *= _buf_size;
+  const int fstride = get_stride(_buf_local_size);
+  tmem += fstride * nthreads * sizeof(vec3_acc_t);  // per-thread force copies
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_f) tmem += fstride*_off_threads * sizeof(vec3_acc_t);
+  #endif
+  tmem += _off_map_maxlocal * sizeof(int);  // _cnumneigh
+  tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
+  tmem += _ntypes * _ntypes * sizeof(int);  // NOTE(review): _cutneighsq holds flt_t
+  return tmem;  // the total was previously computed but never returned
+}
+
+/* ---------------------------------------------------------------------- */
+
+template class IntelBuffers<float,float>;    // flt_t = float,  acc_t = float
+template class IntelBuffers<float,double>;   // flt_t = float,  acc_t = double
+template class IntelBuffers<double,double>;  // flt_t = double, acc_t = double
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
new file mode 100644
index 000000000..bc1ca9e3b
--- /dev/null
+++ b/src/USER-INTEL/intel_buffers.h
@@ -0,0 +1,284 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_INTEL_BUFFERS_H
+#define LMP_INTEL_BUFFERS_H
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#include "atom.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "intel_preprocess.h"
+#include <cstring>
+
+namespace LAMMPS_NS {
+// shorthand for the member typedefs of IntelBuffers<flt_t,acc_t>
+#define ATOM_T typename IntelBuffers<flt_t,acc_t>::atom_t
+#define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t
+#define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
+
+// May not need a separate force array for mixed/double
+template <class flt_t, class acc_t>
+class IntelBuffers {  // per-atom and neighbor buffers, with optional coprocessor mirrors
+ public:
+  typedef struct { flt_t x,y,z; int w; } atom_t;  // position; w holds the atom type (see thr_pack)
+  typedef struct { flt_t w,i,j,k; } quat_t;
+  typedef struct { flt_t x,y,z,w; } vec3_t;  
+  typedef struct { flt_t x,y,z,w; } vec4_t;
+  typedef struct { acc_t x,y,z,w; } vec3_acc_t;  // element type of the force buffers
+    
+  IntelBuffers(class LAMMPS *lmp_in);
+  ~IntelBuffers();
+
+  inline int get_stride(int nall) {  // padded length of one thread's force copy
+    int stride;
+    IP_PRE_get_stride(stride, nall, sizeof(vec3_acc_t), 
+			 lmp->atom->torque);
+    return stride;
+  }
+
+  void free_buffers();
+
+  inline void grow(const int nall, const int nlocal, const int nthreads,
+                   const int offload_end) {
+    if (nall >= _buf_size || nlocal >= _buf_local_size)  // reallocate only when too small
+      _grow(nall, nlocal, nthreads, offload_end);
+  }
+
+  inline void free_all_nbor_buffers() {
+    free_nbor_list();
+    free_nmax();
+    free_binhead();
+    free_local();
+  }
+
+  inline void grow_nbor(NeighList *list, const int nlocal,
+                        const int offload_end) {
+    grow_local(list, offload_end);
+    if (offload_end) {  // nmax and binhead mappings are needed only for offload
+      grow_nmax();
+      grow_binhead();
+    }
+    grow_nbor_list(list, nlocal, offload_end);
+  }
+
+  void free_nmax();
+
+  inline void grow_nmax() {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (lmp->atom->nmax > _off_map_nmax)
+      _grow_nmax();
+    #endif
+  }
+
+  void free_local();
+
+  inline void grow_local(NeighList *list, const int offload_end) {
+    if (list->get_maxlocal() > _off_map_maxlocal)
+      _grow_local(list, offload_end);
+  }
+
+  void free_binhead();
+  
+  inline void grow_binhead() {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (lmp->neighbor->maxhead > _off_map_maxhead)
+      _grow_binhead();
+    #endif
+  }
+
+  // about oneatom / INTEL_ONEATOM_FACTOR ints, rounded down to whole INTEL_DATA_ALIGN-byte groups
+  inline int get_max_nbors() {
+    int mn = lmp->neighbor->oneatom * sizeof(int) /
+        (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
+    return mn * INTEL_DATA_ALIGN / sizeof(int);
+  }
+  
+  void free_nbor_list();
+
+  inline void grow_nbor_list(NeighList *list, const int nlocal,
+                             const int offload_end) {
+    if (nlocal > _list_alloc_atoms)
+      _grow_nbor_list(list, nlocal, offload_end);
+    #ifdef _LMP_INTEL_OFFLOAD
+    else if (offload_end > 0 && _off_map_stencil != list->stencil)  // stencil pointer changed
+      _grow_stencil(list);
+    #endif
+  }
+
+  void set_ntypes(const int ntypes);
+
+  // the list argument is not used by either accessor
+  inline int * firstneigh(const NeighList *list) { return _list_alloc; }
+  inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }
+
+  // getters: offload == 0 selects the host copy when separate buffers are in use
+  inline atom_t * get_x(const int offload = 1) { 
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_x;
+    #endif
+    return _x; 
+  }
+  inline flt_t * get_q(const int offload = 1) { 
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_q;
+    #endif
+    return _q; 
+  }
+  inline quat_t * get_quat(const int offload = 1) { 
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_quat;
+    #endif
+    return _quat; 
+  }
+  inline vec3_acc_t * get_f() { return _f; }
+  inline acc_t * get_ev_global() { return _ev_global; }
+  inline acc_t * get_ev_global_host() { return _ev_global_host; }
+  inline void zero_ev() 
+    { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
+  inline flt_t ** get_cutneighsq() { return _cutneighsq; }
+  inline int get_off_threads() { return _off_threads; }
+  #ifdef _LMP_INTEL_OFFLOAD
+  inline void set_off_params(const int n, const int cop, 
+			     const int separate_buffers) 
+    { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; } 
+  inline vec3_acc_t * get_off_f() { return _off_f; }
+  #endif
+
+  // copy atoms [ifrom, ito) into _x; types and charges are refreshed only when ago == 0
+  inline void thr_pack(const int ifrom, const int ito, const int ago) {
+    if (ago == 0) {
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = lmp->atom->x[i][0];
+        _x[i].y = lmp->atom->x[i][1];
+        _x[i].z = lmp->atom->x[i][2];
+        _x[i].w = lmp->atom->type[i];
+      }
+      if (lmp->atom->q != NULL)
+        for (int i = ifrom; i < ito; i++)
+          _q[i] = lmp->atom->q[i];
+    } else {
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = lmp->atom->x[i][0];
+        _x[i].y = lmp->atom->x[i][1];
+        _x[i].z = lmp->atom->x[i][2];
+      }
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  // pack _x[ifrom, ito) from atom->x shifted by offset; dotype also copies the type
+  inline void thr_pack_cop(const int ifrom, const int ito, 
+			   const int offset, const bool dotype = false) {
+    double ** x = lmp->atom->x + offset;
+    if (dotype == false) {
+      #pragma vector nontemporal
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = x[i][0];
+        _x[i].y = x[i][1];
+        _x[i].z = x[i][2];
+      }
+    } else {
+      int *type = lmp->atom->type + offset;
+      #pragma vector nontemporal
+      for (int i = ifrom; i < ito; i++) {
+	_x[i].x = x[i][0];
+	_x[i].y = x[i][1];
+	_x[i].z = x[i][2];
+	_x[i].w = type[i];
+      }
+    }
+  }
+
+  // same as thr_pack_cop without types, but writes the host copy _host_x
+  inline void thr_pack_host(const int ifrom, const int ito, 
+			    const int offset) {
+    double ** x = lmp->atom->x + offset;
+    for (int i = ifrom; i < ito; i++) {
+      _host_x[i].x = x[i][0];
+      _host_x[i].y = x[i][1];
+      _host_x[i].z = x[i][2];
+    }
+  }
+
+  // copy the used local and ghost ranges of _x/_q into contiguous _host_x/_host_q
+  inline void pack_sep_from_single(const int host_min_local, 
+				   const int used_local,
+				   const int host_min_ghost,
+				   const int used_ghost) {
+    memcpy(_host_x + host_min_local, _x + host_min_local,
+	   used_local * sizeof(atom_t));
+    memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
+	   used_ghost * sizeof(atom_t));
+    int nall = used_local + used_ghost + host_min_local;
+    _host_x[nall].x = INTEL_BIGP;  // slot just past the packed atoms gets INTEL_BIGP coordinates
+    _host_x[nall].y = INTEL_BIGP;
+    _host_x[nall].z = INTEL_BIGP;
+    _host_x[nall].w = 1;
+    if (lmp->atom->q != NULL) {
+      memcpy(_host_q + host_min_local, _q + host_min_local,
+	     used_local * sizeof(flt_t));
+      memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
+	     used_ghost * sizeof(flt_t));
+    }
+  }
+  #endif
+
+  double memory_usage(const int nthreads);
+
+  int _special_holder, _nspecial_holder;  // placeholders mapped by _grow_nmax() for non-molecular systems
+
+ protected:
+  LAMMPS *lmp;
+  atom_t *_x;
+  flt_t *_q;
+  quat_t *_quat;
+  vec3_acc_t * _f;
+  int _off_threads, _off_map_maxlocal;
+
+  int _list_alloc_atoms;
+  int * _list_alloc;
+  int * _cnumneigh;
+
+  flt_t **_cutneighsq;
+  int _ntypes;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _separate_buffers;
+  atom_t *_host_x;
+  flt_t *_host_q;
+  quat_t *_host_quat;
+  vec3_acc_t *_off_f;
+  int _off_map_nmax, _off_map_maxhead, _cop;
+  int *_off_map_ilist;
+  int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag;
+  int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh;
+  bool _off_list_alloc;
+  #endif
+  
+  int _buf_size, _buf_local_size;
+  __declspec(align(64)) acc_t _ev_global[8];
+  __declspec(align(64)) acc_t _ev_global_host[8];
+
+  void _grow(const int nall, const int nlocal, const int nthreads,
+	     const int offload_end);
+  void _grow_nmax();
+  void _grow_local(NeighList *list, const int offload_end);
+  void _grow_binhead();
+  void _grow_nbor_list(NeighList *list, const int nlocal,
+                       const int offload_end);
+  void _grow_stencil(NeighList *list);
+};
+
+}
+
+#endif
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
new file mode 100644
index 000000000..49e3413e0
--- /dev/null
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -0,0 +1,391 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef __INTEL_OFFLOAD
+#ifdef LMP_INTEL_OFFLOAD
+#define _LMP_INTEL_OFFLOAD
+#endif
+#endif
+
+#ifndef LMP_INTEL_PREPROCESS_H
+#define LMP_INTEL_PREPROCESS_H
+
+#ifndef LAMMPS_MEMALIGN
+#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
+#endif
+
+namespace LAMMPS_NS {
+
+enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
+      LMP_GHOST_MAX};  // some are used as indices into overflow[] by IP_PRE_repack_for_offload
+enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
+      TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
+      TIME_IMBALANCE};  // timer ids passed to fix->start_watch()/stop_watch()
+#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
+
+#define INTEL_DATA_ALIGN 64  // byte alignment used by the stride/range macros
+#define INTEL_ONEATOM_FACTOR 2
+#define INTEL_MIC_VECTOR_WIDTH 16
+#define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
+#define INTEL_VECTOR_WIDTH 8
+#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
+#define INTEL_LB_MEAN_WEIGHT 0.1
+#define INTEL_BIGP 1e15  // coordinate value written into the slot after the last packed atom
+
+#define IP_PRE_get_stride(stride, n, datasize, torque)	\
+  { /* stride = n elements (2n with torque) plus alignment padding */ \
+    int blength = n;						\
+    if (torque) blength *= 2;					\
+    const int bytes = blength * datasize;			\
+    /* pad is never zero: an aligned length gains a full INTEL_DATA_ALIGN */ \
+    stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN);     \
+    stride = blength + stride / datasize;			\
+  }
+
+#if defined(_OPENMP)
+
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) 	\
+  { /* thread tid gets [ifrom, ito) out of inum items */	\
+    const int idelta = 1 + inum/nthreads;			\
+    ifrom = tid * idelta;					\
+    ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; /* clamp end */ \
+  }
+
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
+  { /* like IP_PRE_omp_range, but also sets tid to the OpenMP thread id */ \
+    tid = omp_get_thread_num();         			\
+    IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);		\
+  }
+
+#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
+                             datasize)                          \
+{ /* per-thread range whose length is a multiple of chunk_size */ \
+  int chunk_size = INTEL_DATA_ALIGN / datasize; /* elements per aligned unit */ \
+  int idelta = static_cast<int>(static_cast<float>(inum)	\
+				/chunk_size/nthreads) + 1;	\
+  idelta *= chunk_size;						\
+  ifrom = tid*idelta;                                           \
+  ito = ifrom + idelta;                                         \
+  if (ito > inum) ito = inum; /* only the end is clamped */     \
+}
+
+#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
+				nthreads, datasize)		\
+  { /* IP_PRE_omp_range_align, with tid set to the OpenMP thread id */ \
+    tid = omp_get_thread_num();         			\
+    IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads,     \
+			   datasize);				\
+  }
+
+#else
+
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)	\
+  { /* no OpenMP: one range covering all inum items */		\
+    ifrom = 0;							\
+    ito = inum;						        \
+  }
+
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
+  { /* no OpenMP: thread id is always 0 */			\
+    tid = 0;							\
+    ifrom = 0;							\
+    ito = inum;							\
+  }
+
+#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
+                             datasize)                          \
+{ /* no OpenMP: datasize is ignored */                          \
+    ifrom = 0;							\
+    ito = inum;						        \
+}
+
+#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
+				nthreads, datasize)		\
+{ /* no OpenMP: thread 0 owns the whole range */		\
+  tid = 0;							\
+  ifrom = 0;							\
+  ito = inum;							\
+}
+
+#endif
+
+#ifdef _LMP_INTEL_OFFLOAD
+#include <sys/time.h>
+
+__declspec( target (mic))
+inline double MIC_Wtime() {
+  // wall-clock time in seconds, read with gettimeofday(); the
+  // target(mic) declspec above is kept exactly as in the original
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  const double seconds = 1.0 * now.tv_sec;
+  return seconds + 1.0e-6 * now.tv_usec;
+}
+
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,	\
+				     nlocal, nall)			\
+{ /* repack positions on non-rebuild steps when separate buffers are on; uses comm and flt_t from the caller */ \
+    if (fix->separate_buffers() && ago != 0) {				\
+    fix->start_watch(TIME_PACK);					\
+    if (offload) {							\
+      _Pragma("omp parallel default(none) shared(buffers,nlocal,nall)")	\
+      {									\
+        int ifrom, ito, tid;						\
+	int nthreads = comm->nthreads;					\
+	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,		\
+				nthreads, sizeof(flt_t));		\
+	buffers->thr_pack_cop(ifrom, ito, 0);	/* local atoms */	\
+	int nghost = nall - nlocal;					\
+	if (nghost) {							\
+	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,	\
+				 nthreads, sizeof(flt_t));		\
+	  buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,		\
+				fix->offload_min_ghost() - nlocal,	\
+				ago == 1); /* types too when ago == 1 */ \
+	}								\
+      }									\
+    } else { /* host path: serial pack into the host copy */		\
+      buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);		\
+      buffers->thr_pack_host(nlocal, nall,				\
+			     fix->host_min_ghost()-nlocal);		\
+    }									\
+    fix->stop_watch(TIME_PACK);						\
+  }									\
+}
+
+#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, 	\
+			     buffers, offload, fix, separate_flag,	\
+			     x_size, q_size, ev_size, f_stride)		\
+{ /* sets the transfer sizes and f_stride; uses nall, nlocal, minlocal, lmp from the caller */ \
+  separate_flag = 0;							\
+  if (ago == 0) {							\
+    x_size = 0;								\
+    q_size = nall;							\
+    if (offload) {							\
+      if (fix->separate_buffers()) {					\
+	if (lmp->atom->torque)						\
+	  separate_flag = 2;						\
+	else								\
+	  separate_flag = 1;						\
+      } else								\
+	separate_flag = 3;						\
+    }									\
+  } else {								\
+    x_size = nall;							\
+    q_size = 0;								\
+  }									\
+  ev_size = 0;								\
+  if (evflag) {								\
+    if (eflag) ev_size = 2;						\
+    if (vflag) ev_size = 8; /* vflag overrides the eflag size */	\
+  }									\
+  int f_length;								\
+  if (newton)								\
+    f_length = nall;							\
+  else									\
+    f_length = nlocal;							\
+  f_length -= minlocal;							\
+  f_stride = buffers->get_stride(f_length);				\
+}
+
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
+			   ev_global)					\
+{ /* select thread count, force buffer and ev array for the offload or host pass */ \
+  if (offload) {							\
+    tc = buffers->get_off_threads();					\
+    f_start = buffers->get_off_f();					\
+    ev_global = buffers->get_ev_global();				\
+  } else {								\
+    tc = comm->nthreads;						\
+    f_start = buffers->get_f();						\
+    fix->start_watch(TIME_HOST_PAIR); /* side effect: starts the host pair timer */ \
+    ev_global = buffers->get_ev_global_host();				\
+  }									\
+}
+
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
+				  f_stride, x, q)			\
+{ /* compact x/q to the atoms in overflow[]; uses overflow, NEWTON_PAIR, flt_t from the caller */ \
+  if (separate_flag) {							\
+    if (separate_flag < 3) {						\
+      int all_local = nlocal;						\
+      int ghost_min = overflow[LMP_GHOST_MIN];				\
+      nlocal = overflow[LMP_LOCAL_MAX] + 1;				\
+      int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;		\
+      if (nghost < 0) nghost = 0;					\
+      nall = nlocal + nghost;						\
+      separate_flag--;							\
+      int flength;							\
+      if (NEWTON_PAIR) flength = nall; /* NOTE(review): newton argument is unused */ \
+      else flength = nlocal;						\
+      IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),		\
+			   separate_flag);				\
+      if (nghost) {							\
+	if (nlocal < all_local || ghost_min > all_local) {		\
+	  memmove(x + nlocal, x + ghost_min,				\
+		  (nall - nlocal) * sizeof(ATOM_T));			\
+	  if (q != 0)							\
+	    memmove((void *)(q + nlocal), (void *)(q + ghost_min),	\
+		    (nall - nlocal) * sizeof(flt_t));			\
+	}								\
+      }									\
+    }									\
+    x[nall].x = INTEL_BIGP; /* slot after the last atom gets INTEL_BIGP */ \
+    x[nall].y = INTEL_BIGP;						\
+    x[nall].z = INTEL_BIGP;						\
+  }									\
+}
+
+
+#else
+
+#define MIC_Wtime MPI_Wtime
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
+                                     nlocal, nall)			
+
+#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, 	\
+			     buffers, offload, fix, separate_flag,	\
+			     x_size, q_size, ev_size, f_stride)		\
+{ /* no offload: only separate_flag and f_stride are set; uses nall, nlocal from the caller */ \
+  separate_flag = 0;							\
+  int f_length;                                                         \
+  if (newton)                                                           \
+    f_length = nall;                                                    \
+  else                                                                  \
+    f_length = nlocal;                                                  \
+  f_stride = buffers->get_stride(f_length);				\
+}
+
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
+			   ev_global)					\
+{ /* no offload: always the host buffers; starts the host pair timer */ \
+  tc = comm->nthreads;							\
+  f_start = buffers->get_f();						\
+  fix->start_watch(TIME_HOST_PAIR);					\
+  ev_global = buffers->get_ev_global_host();				\
+}
+
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
+				  f_stride, x, q)			
+
+
+#endif
+
+#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz)	\
+{ /* add one pair's terms to sv0..sv5 (caller scope) when vflag == 1 */ \
+  if (vflag == 1) {							\
+    sv0 += ev_pre * delx * delx * fpair;				\
+    sv1 += ev_pre * dely * dely * fpair;				\
+    sv2 += ev_pre * delz * delz * fpair;				\
+    sv3 += ev_pre * delx * dely * fpair;				\
+    sv4 += ev_pre * delx * delz * fpair;				\
+    sv5 += ev_pre * dely * delz * fpair;				\
+  }									\
+}
+
+#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp)    	\
+{ /* fold atom i's sums into the o* totals; i, sevdwl, sv*, oevdwl, ov* come from the caller */ \
+  if (evflag) {								\
+    if (eflag) {							\
+      f[i].w += fwtmp;							\
+      oevdwl += sevdwl;							\
+    }									\
+    if (vflag == 1) {							\
+      ov0 += sv0;							\
+      ov1 += sv1;							\
+      ov2 += sv2;							\
+      ov3 += sv3;							\
+      ov4 += sv4;							\
+      ov5 += sv5;							\
+    }									\
+  }									\
+}
+
+#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp)    	\
+{ /* IP_PRE_ev_tally_atom plus the secoul -> oecoul accumulation */	\
+  if (evflag) {								\
+    if (eflag) {							\
+      f[i].w += fwtmp;							\
+      oevdwl += sevdwl;							\
+      oecoul += secoul;							\
+    }									\
+    if (vflag == 1) {							\
+      ov0 += sv0;							\
+      ov1 += sv1;							\
+      ov2 += sv2;							\
+      ov3 += sv3;							\
+      ov4 += sv4;							\
+      ov5 += sv5;							\
+    }									\
+  }									\
+}
+
+#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom,	\
+			       nall, nlocal, minlocal, nthreads,	\
+			       f_start, f_stride, x)			\
+{ /* sum the per-thread force copies into copy 0, then tally f*x */	\
+  int o_range; /* uses offload, iifrom, iito, tid, ov0-ov5 from the caller */ \
+  if (newton)								\
+    o_range = nall;							\
+  else									\
+    o_range = nlocal;							\
+  if (offload == 0) o_range -= minlocal;				\
+  IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,		\
+			 sizeof(acc_t));				\
+  /* copy t of the force array starts at f_start + t * f_stride */	\
+  int t_off = f_stride;							\
+  if (eflag && eatom) { /* the w component is summed only here */	\
+    for (int t = 1; t < nthreads; t++) {				\
+      _Pragma("vector nontemporal")					\
+      for (int n = iifrom; n < iito; n++) {				\
+        f_start[n].x += f_start[n + t_off].x;				\
+        f_start[n].y += f_start[n + t_off].y;				\
+	f_start[n].z += f_start[n + t_off].z;				\
+	f_start[n].w += f_start[n + t_off].w;				\
+      }									\
+      t_off += f_stride;						\
+    }									\
+  } else {								\
+    for (int t = 1; t < nthreads; t++) {				\
+      _Pragma("vector nontemporal")   					\
+      for (int n = iifrom; n < iito; n++) {				\
+	f_start[n].x += f_start[n + t_off].x;				\
+        f_start[n].y += f_start[n + t_off].y;				\
+        f_start[n].z += f_start[n + t_off].z;				\
+      }									\
+      t_off += f_stride;						\
+    }									\
+  }									\
+  /* vflag == 2: accumulate force-times-position products into ov0-ov5 */ \
+  if (evflag) {								\
+    if (vflag == 2) {							\
+      const ATOM_T * restrict const xo = x + minlocal;			\
+      _Pragma("vector nontemporal")   					\
+      for (int n = iifrom; n < iito; n++) {				\
+	ov0 += f_start[n].x * xo[n].x;					\
+	ov1 += f_start[n].y * xo[n].y;					\
+	ov2 += f_start[n].z * xo[n].z;					\
+	ov3 += f_start[n].y * xo[n].x;					\
+	ov4 += f_start[n].z * xo[n].x;					\
+	ov5 += f_start[n].z * xo[n].y;					\
+      }									\
+    }									\
+  }									\
+}
+
+}
+
+#endif
diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h
new file mode 100644
index 000000000..62163b3f6
--- /dev/null
+++ b/src/USER-INTEL/math_extra_intel.h
@@ -0,0 +1,354 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_MATH_EXTRA_INTEL_H
+#define LMP_MATH_EXTRA_INTEL_H
+
+#define ME_quat_to_mat_trans(quat, mat)		\
+{						\
+  flt_t quat_w = quat.w;			\
+  flt_t quat_i = quat.i;			\
+  flt_t quat_j = quat.j;			\
+  flt_t quat_k = quat.k;			\
+  flt_t w2 = quat_w * quat_w;			\
+  flt_t i2 = quat_i * quat_i;			\
+  flt_t j2 = quat_j * quat_j;			\
+  flt_t k2 = quat_k * quat_k;			\
+  flt_t twoij = (flt_t)2.0 * quat_i * quat_j;	\
+  flt_t twoik = (flt_t)2.0 * quat_i * quat_k;	\
+  flt_t twojk = (flt_t)2.0 * quat_j * quat_k;	\
+  flt_t twoiw = (flt_t)2.0 * quat_i * quat_w;	\
+  flt_t twojw = (flt_t)2.0 * quat_j * quat_w;	\
+  flt_t twokw = (flt_t)2.0 * quat_k * quat_w;	\
+  						\
+  mat##_0 = w2 + i2 - j2 - k2;			\
+  mat##_3 = twoij - twokw;			\
+  mat##_6 = twojw + twoik;			\
+  						\
+  mat##_1 = twoij + twokw;			\
+  mat##_4 = w2 - i2 + j2 - k2;			\
+  mat##_7 = twojk - twoiw;			\
+  						\
+  mat##_2 = twoik - twojw;			\
+  mat##_5 = twojk + twoiw;			\
+  mat##_8 = w2 - i2 - j2 + k2;			\
+}
+
+/* ----------------------------------------------------------------------
+   diagonal matrix times a full matrix
+------------------------------------------------------------------------- */
+
+#define ME_diag_times3(d, m, ans)			\
+  {							\
+  ans##_0 = d[0] * m##_0;				\
+  ans##_1 = d[0] * m##_1;				\
+  ans##_2 = d[0] * m##_2;				\
+  ans##_3 = d[1] * m##_3;				\
+  ans##_4 = d[1] * m##_4;				\
+  ans##_5 = d[1] * m##_5;				\
+  ans##_6 = d[2] * m##_6;				\
+  ans##_7 = d[2] * m##_7;				\
+  ans##_8 = d[2] * m##_8;				\
+}
+
+#define ME_diag_times3a(d, m, ans)			\
+  {							\
+  ans##_0 = d##_0 * m##_0;				\
+  ans##_1 = d##_0 * m##_1;				\
+  ans##_2 = d##_0 * m##_2;				\
+  ans##_3 = d##_1 * m##_3;				\
+  ans##_4 = d##_1 * m##_4;				\
+  ans##_5 = d##_1 * m##_5;				\
+  ans##_6 = d##_2 * m##_6;				\
+  ans##_7 = d##_2 * m##_7;				\
+  ans##_8 = d##_2 * m##_8;				\
+}
+
+/* ----------------------------------------------------------------------
+   multiply the transpose of mat1 times mat2
+------------------------------------------------------------------------- */
+
+#define ME_transpose_times3(m1, m2, ans)                	\
+{								\
+  ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6;	\
+  ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7;	\
+  ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8;	\
+  ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6;	\
+  ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7;	\
+  ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8;	\
+  ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6;	\
+  ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7;	\
+  ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8;	\
+}
+
+/* ----------------------------------------------------------------------
+   normalize a vector, return in ans
+------------------------------------------------------------------------- */
+
+#define ME_normalize3(v0, v1, v2, ans)			\
+{ /* ans_0..2 = (v0,v1,v2) scaled by 1/length; args may be expressions */ \
+  flt_t me_inv_len = (flt_t)1.0 / sqrt((v0)*(v0)+(v1)*(v1)+(v2)*(v2));	\
+  ans##_0 = (v0) * me_inv_len;				\
+  ans##_1 = (v1) * me_inv_len;				\
+  ans##_2 = (v2) * me_inv_len;				\
+}
+
+/* ----------------------------------------------------------------------
+   add two matrices
+------------------------------------------------------------------------- */
+
+#define ME_plus3(m1, m2, ans)			\
+{						\
+  ans##_0 = m1##_0 + m2##_0;			\
+  ans##_1 = m1##_1 + m2##_1;			\
+  ans##_2 = m1##_2 + m2##_2;			\
+  ans##_3 = m1##_3 + m2##_3;			\
+  ans##_4 = m1##_4 + m2##_4;			\
+  ans##_5 = m1##_5 + m2##_5;			\
+  ans##_6 = m1##_6 + m2##_6;			\
+  ans##_7 = m1##_7 + m2##_7;			\
+  ans##_8 = m1##_8 + m2##_8;			\
+}
+
+/* ----------------------------------------------------------------------
+   dot product of 2 vectors
+------------------------------------------------------------------------- */
+
+#define ME_dot3(v1, v2) \
+  (v1##_0*v2##_0 + v1##_1 * v2##_1 + v1##_2 * v2##_2)
+
+/* ----------------------------------------------------------------------
+   determinant of a matrix
+------------------------------------------------------------------------- */
+
+#define ME_det3(m)				    \
+  ( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \
+    m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \
+    m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 )
+
+/* ----------------------------------------------------------------------
+   row vector times matrix
+------------------------------------------------------------------------- */
+
+#define ME_vecmat(v, m, ans)				    \
+{							    \
+  ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6;  \
+  ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7;  \
+  ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8;  \
+}
+
+/* ----------------------------------------------------------------------
+   cross product of 2 vectors
+------------------------------------------------------------------------- */
+
+#define ME_cross3(v1, v2, ans)			\
+{						\
+  ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1;	\
+  ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2;	\
+  ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0;	\
+}
+
+/* ----------------------------------------------------------------------
+   cross product of row 0, 1 or 2 of matrix m1 (mv0/mv1/mv2) with vector v2
+------------------------------------------------------------------------- */
+
+#define ME_mv0_cross3(m1, v2, ans)		\
+{						\
+  ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1;	\
+  ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2;	\
+  ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0;	\
+}
+
+#define ME_mv1_cross3(m1, v2, ans)		\
+{						\
+  ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1;	\
+  ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2;	\
+  ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0;	\
+}
+
+#define ME_mv2_cross3(m1, v2, ans)		\
+{						\
+  ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1;	\
+  ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2;	\
+  ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0;	\
+}
+
+
+#define ME_compute_eta_torque(m1, m2, s1, ans)                              \
+{								            \
+  flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7-		    \
+    m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5-				    \
+    m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8;				    \
+  den = (flt_t)1.0 / den;						    \
+									    \
+  ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0-   \
+ 		   m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+    \
+		   m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8-	 	    \
+		   m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+		    \
+		   m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den;	    \
+									    \
+  ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+		    \
+		   (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5-    \
+		   (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2-    \
+		   m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+		    \
+		   m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den;	    \
+  									    \
+  ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4-              \
+		   m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1-		    \
+		   m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+		    \
+		   (flt_t)2.0*m1##_4*m1##_0*m2##_2-                         \
+		   (flt_t)2.0*m1##_3*m2##_2*m1##_1+			    \
+		   m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den;	    \
+									    \
+  ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+  \
+		   m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+    \
+		   m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8-		    \
+		   m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- 		    \
+		   m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den;	    \
+									    \
+  ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+		    \
+		   (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5-    \
+		   (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+    \
+		   m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2-		    \
+		   m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den;	    \
+									    \
+  ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4-		    \
+		   m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+		    \
+		   (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+    \
+		   m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4-		    \
+		   (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)*   \
+    den;							   	    \
+									    \
+  ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+  	    \
+		   (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+    \
+		   m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5-    \
+		   m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7-		    \
+		   m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den;	    \
+									    \
+  ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7-		    \
+		    (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+   \
+		    (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8-   \
+		    m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+		    \
+		    m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den;	    \
+									    \
+  ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4-		    \
+		   m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7-		    \
+		   m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+		    \
+		   (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+    \
+                    m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)*  \
+    den;								    \
+}
+
+#define ME_vcopy4(dst,src)			\
+{ /* copy src_0..src_3 into dst_0..dst_3 */	\
+  dst##_0 = src##_0; dst##_1 = src##_1;		\
+  dst##_2 = src##_2; dst##_3 = src##_3;		\
+}
+
+#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error)	\
+{ /* solve m1 * ans = (v_0,v_1,v_2): Gaussian elimination, row pivoting */ \
+  flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5;	\
+  flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t;	\
+  /* error is set to 1 for a singular m1 and is never cleared here */	\
+  aug_3 = v_0;						\
+  aug_0 = m1##_0;					\
+  aug_1 = m1##_1;					\
+  aug_2 = m1##_2;					\
+  aug_7 = v_1;						\
+  aug_4 = m1##_3;					\
+  aug_5 = m1##_4;					\
+  aug_6 = m1##_5;					\
+  aug_11 = v_2;						\
+  aug_8 = m1##_6;					\
+  aug_9 = m1##_7;					\
+  aug_10 = m1##_8;					\
+  /* column 0: bring the row with the largest |entry| up to row 0 */	\
+  if (fabs(aug_4) > fabs(aug_0)) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;	\
+    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;	\
+    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;	\
+    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;	\
+  }							\
+  if (fabs(aug_8) > fabs(aug_0)) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
+    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;	\
+    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;	\
+    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;	\
+  }							\
+  /* fall back to any nonzero pivot in column 0; none means singular */	\
+  if (aug_0 != (flt_t)0.0) {				\
+  } else if (aug_4 != (flt_t)0.0) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;	\
+    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;	\
+    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;	\
+    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;	\
+  } else if (aug_8 != (flt_t)0.0) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
+    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;	\
+    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;	\
+    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;	\
+  } else						\
+    error = 1;						\
+  /* eliminate column 0 from rows 1 and 2 */		\
+  t = aug_4 / aug_0;					\
+  aug_5 -= t * aug_1;					\
+  aug_6 -= t * aug_2;					\
+  aug_7 -= t * aug_3;					\
+  t = aug_8 / aug_0;					\
+  aug_9 -= t * aug_1;					\
+  aug_10 -= t * aug_2;					\
+  aug_11 -= t * aug_3;					\
+  /* column 1: bring the larger |entry| of rows 1 and 2 up to row 1 */	\
+  if (fabs(aug_9) > fabs(aug_5)) {			\
+    flt_t swapt;					\
+    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;	\
+    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;	\
+    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;	\
+    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;	\
+  }							\
+  /* no nonzero pivot left in column 1 also means singular */	\
+  if (aug_5 != (flt_t)0.0) {				\
+  } else if (aug_9 != (flt_t)0.0) {			\
+    flt_t swapt;					\
+    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;	\
+    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;	\
+    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;	\
+    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;	\
+  } else error = 1;					\
+  /* eliminate column 1 from row 2 */			\
+  t = aug_9 / aug_5;					\
+  aug_10 -= t * aug_6;					\
+  aug_11 -= t * aug_7;					\
+  /* zero last pivot means singular; the divisions below still run */	\
+  if (aug_10 == (flt_t)0.0)				\
+    error = 1;						\
+  /* back substitution, last unknown first */		\
+  ans##_2 = aug_11/aug_10;				\
+  t = (flt_t)0.0;					\
+  t += aug_6 * ans##_2;					\
+  ans##_1 = (aug_7-t) / aug_5;				\
+  t = (flt_t)0.0;					\
+  t += aug_1 * ans##_1;					\
+  t += aug_2 * ans##_2;					\
+  ans##_0 = (aug_3 - t) / aug_0;			\
+}
+
+#endif
diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp
new file mode 100644
index 000000000..a5f12a56f
--- /dev/null
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@@ -0,0 +1,1453 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "group.h"
+#include "fix_intel.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+
+template <class flt_t>
+inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,
+		      const flt_t bboxlo0, const flt_t bboxlo1,
+		      const flt_t bboxlo2, const flt_t bboxhi0,
+		      const flt_t bboxhi1, const flt_t bboxhi2,
+		      const flt_t bininvx, const flt_t bininvy,
+		      const flt_t bininvz, const int nbinx, const int nbiny,
+		      const int nbinz, const int mbinx, const int mbiny,
+		      const int mbinz, const int mbinxlo, const int mbinylo,
+		      const int mbinzlo)
+{
+  int ix, iy, iz;
+
+  if (x0 >= bboxhi0)
+    ix = static_cast<int> ((x0 - bboxhi0) * bininvx) + nbinx;
+  else if (x0 >= bboxlo0) {
+    ix = static_cast<int> ((x0 - bboxlo0) * bininvx);
+    ix = MIN(ix, nbinx-1);
+  } else
+    ix = static_cast<int> ((x0 - bboxlo0) * bininvx) - 1;
+
+  if (x1 >= bboxhi1)
+    iy = static_cast<int> ((x1 - bboxhi1) * bininvy) + nbiny;
+  else if (x1 >= bboxlo1) {
+    iy = static_cast<int> ((x1 - bboxlo1) * bininvy);
+    iy = MIN(iy, nbiny-1);
+  } else
+    iy = static_cast<int> ((x1 - bboxlo1) * bininvy) - 1;
+
+  if (x2 >= bboxhi2)
+    iz = static_cast<int> ((x2 - bboxhi2) * bininvz) + nbinz;
+  else if (x2 >= bboxlo2) {
+    iz = static_cast<int> ((x2 - bboxlo2) * bininvz);
+    iz = MIN(iz, nbinz - 1);
+  } else
+    iz = static_cast<int> ((x2 - bboxlo2) * bininvz) - 1;
+
+  return (iz - mbinzlo) * mbiny * mbinx + (iy - mbinylo) * mbinx +
+    (ix - mbinxlo);
+}
+
+#define ofind_special(which, special, nspecial, i, tag, special_flag) \
+{                                                                     \
+  which = 0;                                                          \
+  const int n1 = nspecial[i * 3];                                     \
+  const int n2 = nspecial[i * 3 + 1];                                 \
+  const int n3 = nspecial[i * 3 + 2];                                 \
+  const int *sptr = special + i * maxspecial;                         \
+  for (int s = 0; s < n3; s++) {                                      \
+    if (sptr[s] == tag) {                                             \
+      if (s < n1) {                                                   \
+        if (special_flag[1] == 0) which = -1;                         \
+        else if (special_flag[1] == 1) which = 0;                     \
+        else which = 1;                                               \
+      } else if (s < n2) {                                            \
+        if (special_flag[2] == 0) which = -1;                         \
+        else if (special_flag[2] == 1) which = 0;                     \
+        else which = 2;                                               \
+      } else {                                                        \
+        if (special_flag[3] == 0) which = -1;                         \
+        else if (special_flag[3] == 1) which = 0;                     \
+        else which = 3;                                               \
+      }                                                               \
+    }                                                                 \
+  }                                                                   \
+}
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+template <class flt_t, class acc_t>
+void Neighbor::bin_atoms(void * xin) {
+  const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin;
+  int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+
+  int i, ibin;
+
+  for (i = 0; i < mbins; i++) binhead[i] = -1;
+
+  int *mask = atom->mask;
+
+  if (includegroup) {
+    int bitmask = group->bitmask[includegroup];
+    for (i = nall-1; i >= nlocal; i--) {
+      if (mask[i] & bitmask) {
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+            bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny,
+            nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo);
+        bins[i] = binhead[ibin];
+        binhead[ibin] = i;
+      }
+    }
+    for (i = atom->nfirst-1; i >= 0; i--) {
+      ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny,
+          nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo);
+      bins[i] = binhead[ibin];
+      binhead[ibin] = i;
+    }
+  } else {
+    for (i = nall-1; i >= 0; i--) {
+      ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny,
+          nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo);
+      bins[i] = binhead[ibin];
+      binhead[ibin] = i;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with partial Newton's 3rd law
+   each owned atom i checks own bin and other bins in stencil
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_no_newton_intel(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = nlocal;
+
+  // Typed handle to the Intel fix (fix_intel is cast to FixIntel here)
+  FixIntel *fix = static_cast<FixIntel *>(fix_intel);
+  // Atoms [0, off_end) go to the offload pass, [host_start, nlocal) to the host
+  const int off_end = fix->offload_end_neighbor();
+  int host_start = off_end;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->full_host_list()) host_start = 0;
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+  // Per precision mode: offload pass (flag 1) first, then host pass (flag 0)
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    hbnni<float,double>(1, list, fix->get_mixed_buffers(),
+                        0, off_end, fix);
+    hbnni<float,double>(0, list, fix->get_mixed_buffers(),
+                        host_start, nlocal, fix);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    hbnni<double,double>(1, list, fix->get_double_buffers(),
+                         0, off_end, fix);
+    hbnni<double,double>(0, list, fix->get_double_buffers(),
+                         host_start, nlocal, fix);
+  } else {
+    hbnni<float,float>(1, list, fix->get_single_buffers(),
+                       0, off_end, fix);
+    hbnni<float,float>(0, list, fix->get_single_buffers(),
+                       host_start, nlocal, fix);
+  }
+}
+
+template <class flt_t, class acc_t>
+void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
+                     const int astart, const int aend, void *fix_in) {
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;
+
+  if (offload) {
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+
+    ATOM_T biga;
+    biga.x = INTEL_BIGP;
+    biga.y = INTEL_BIGP;
+    biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall] = biga;
+
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x());
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }
+  const int pad_width = pad;
+
+  if (aend-astart == 0) {
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * restrict const x = buffers->get_x();
+  int * restrict const firstneigh = buffers->firstneigh(list);
+
+  const int molecular = atom->molecular;
+  int *ns = NULL, *s = NULL;
+  int tag_size, special_size;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    tag_size = nall;
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    tag_size = 0;
+    special_size = 0;
+  }
+  const int * restrict const special = s;
+  const int * restrict const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const int * restrict const tag = atom->tag;
+
+  int * restrict const ilist = list->ilist;
+  int * restrict numneigh = list->numneigh;
+  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * restrict const stencil = list->stencil;
+  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  int * const molecule = atom->molecule;
+  #endif
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  
+  // Make sure dummy coordinates to eliminate loop remainder not within cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi0);
+    const flt_t dy = (INTEL_BIGP - bboxhi1);
+    const flt_t dz = (INTEL_BIGP - bboxhi2);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * restrict const binhead = this->binhead;
+  const int * restrict const special_flag = this->special_flag;
+  const int nbinx = this->nbinx;
+  const int nbiny = this->nbiny;
+  const int nbinz = this->nbinz;
+  const int mbinxlo = this->mbinxlo;
+  const int mbinylo = this->mbinylo;
+  const int mbinzlo = this->mbinzlo;
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * restrict const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz) \
+    in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload) \
+    in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
+    in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(numneigh)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh,overflow)
+    #endif
+    {
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = nall, lmax = -1, gmin = nall, gmax = -1;
+      #endif
+
+      const int num = aend - astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+
+      const int list_size = (ito + tid + 1) * maxnbors;
+      int ct = (ifrom + tid) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+
+        n = 0;
+        n2 = maxnbors;
+
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        const int ioffset = ntypes*itype;
+
+        // loop over all atoms in other bins in stencil including self
+        // only store pair if i < j
+        // stores own/own pairs only once
+        // stores own/ghost pairs on both procs
+
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+                          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz,
+                          nbinx, nbiny, nbinz, mbinx, mbiny, mbinz,
+                          mbinxlo, mbinylo, mbinzlo);
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+            if (j <= i) continue;
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+              if (j < nlocal) {
+                neighptr[n++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+		if (j < lmin) lmin = j;
+		if (j > lmax) lmax = j;
+                #endif
+              } else {
+                neighptr[n2++] = j;
+	        #ifdef _LMP_INTEL_OFFLOAD
+		if (j < gmin) gmin = j;
+		if (j > gmax) gmax = j;
+                #endif
+	      }
+	    }
+          }
+        }
+        ilist[i] = i;
+
+        cnumneigh[i] = ct;
+        if (n > maxnbors) *overflow = 1;
+        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
+        while( (n % pad_width) != 0 ) neighptr[n++] = nall;
+        numneigh[i] = n;
+        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
+        ct += n;
+        neighptr += n;
+        if (ct + n + maxnbors > list_size) {
+          *overflow = 1;
+	  ct = (ifrom + tid) * maxnbors;
+        }
+      }
+
+      if (*overflow == 1)
+	for (int i = ifrom; i < ito; i++)
+	  numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+        #pragma omp barrier
+      }
+
+      int ghost_offset = 0, nall_offset = nall;
+      if (separate_buffers) {
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+	} else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      }
+      #endif
+
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            ofind_special(which, special, nspecial, i, tag[j], special_flag);
+            #ifdef _LMP_INTEL_OFFLOAD
+	    if (j >= nlocal) {
+	      if (j == nall) 
+		jlist[jj] = nall_offset;
+	      else if (which > 0) 
+		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	      else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  int jj = 0;
+	  for (jj = 0; jj < jnum; jj++)
+	    if (jlist[jj] >= nlocal) break;
+	  while (jj < jnum) {
+	    if (jlist[jj] == nall) jlist[jj] = nall_offset;
+	    else jlist[jj] -= ghost_offset;
+	    jj++;
+	  }
+	}
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+				    fix->host_used_local(),
+				    fix->host_min_ghost(),
+				    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with full Newton's 3rd law
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+   this routine only picks the hbni<> instantiation that matches the
+     precision mode of the Intel fix and the offload_noghost setting
+   hbni<> is always invoked twice: first with offload=1 for atoms
+     [0,off_end), then with offload=0 for atoms [host_start,nlocal)
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_newton_intel(NeighList *list)
+{
+  const int n_owned = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = n_owned;
+
+  // the Intel fix provides the precision mode, buffers, and atom split
+  FixIntel *intel = static_cast<FixIntel *>(fix_intel);
+
+  const int split_end = intel->offload_end_neighbor();
+  int first_host = intel->host_start_neighbor();
+  int noghost = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (intel->full_host_list()) first_host = 0;
+  noghost = intel->offload_noghost();
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  // mixed precision: float data, double accumulation
+  if (intel->precision() == FixIntel::PREC_MODE_MIXED) {
+    if (noghost) {
+      hbni<float,double,1>(1, list, intel->get_mixed_buffers(),
+                           0, split_end, intel);
+      hbni<float,double,1>(0, list, intel->get_mixed_buffers(),
+                           first_host, n_owned, intel, split_end);
+    } else {
+      hbni<float,double,0>(1, list, intel->get_mixed_buffers(),
+                           0, split_end, intel);
+      hbni<float,double,0>(0, list, intel->get_mixed_buffers(),
+                           first_host, n_owned, intel);
+    }
+    return;
+  }
+
+  // double precision throughout
+  if (intel->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    if (noghost) {
+      hbni<double,double,1>(1, list, intel->get_double_buffers(),
+                            0, split_end, intel);
+      hbni<double,double,1>(0, list, intel->get_double_buffers(),
+                            first_host, n_owned, intel, split_end);
+    } else {
+      hbni<double,double,0>(1, list, intel->get_double_buffers(),
+                            0, split_end, intel);
+      hbni<double,double,0>(0, list, intel->get_double_buffers(),
+                            first_host, n_owned, intel);
+    }
+    return;
+  }
+
+  // any other mode: single precision throughout
+  if (noghost) {
+    hbni<float,float,1>(1, list, intel->get_single_buffers(),
+                        0, split_end, intel);
+    hbni<float,float,1>(0, list, intel->get_single_buffers(),
+                        first_host, n_owned, intel, split_end);
+  } else {
+    hbni<float,float,0>(1, list, intel->get_single_buffers(),
+                        0, split_end, intel);
+    hbni<float,float,0>(0, list, intel->get_single_buffers(),
+                        first_host, n_owned, intel);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   worker called by half_bin_newton_intel() to build the neighbor lists
+     of atoms astart <= i < aend
+   flt_t, acc_t = template types of the IntelBuffers object in buffers_in
+   offload_noghost = if set, ghost neighbors are skipped when offload is
+     set and owned neighbors are skipped for atoms i < offload_end
+   offload = if set, atoms are packed and binned first and the build is
+     wrapped in the offload pragma (_LMP_INTEL_OFFLOAD builds only);
+     if not set, list->firstneigh[] is filled in for the atoms on return
+   fix_in = FixIntel object supplying timers and overflow flags
+   layout of each atom's list: owned neighbors, then ghost neighbors,
+     then padding entries (index e_nall) up to a multiple of pad_width
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t, int offload_noghost>
+void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
+                    const int astart, const int aend, void *fix_in,
+                    const int offload_end) {
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  // each atom's neighbor count is padded to a multiple of pad (set below)
+  int pad = 1;
+
+  if (offload) {
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+
+    // store a dummy atom with INTEL_BIGP coordinates at index nall
+    ATOM_T biga;
+    biga.x = INTEL_BIGP;
+    biga.y = INTEL_BIGP;
+    biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall]=biga;
+
+    // every thread calls thr_pack() on its own [ifrom,ito) slice of nall
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x());
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }    
+  const int pad_width = pad;
+
+  // empty atom range: stop the timer and leave
+  if (aend-astart == 0) {
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * restrict const x = buffers->get_x();
+  int * restrict const firstneigh = buffers->firstneigh(list);
+  // e_nall = atom count used for transfer lengths and as the padding index
+  // it is limited to owned atoms when ghosts are left out of the offload pass
+  int nall_t = nall;
+  if (offload_noghost && offload) nall_t = atom->nlocal;
+  const int e_nall = nall_t;
+
+  // special-bond arrays are only used for molecular systems
+  // otherwise point at placeholder members and use zero transfer lengths
+  const int molecular = atom->molecular;
+  int *ns = NULL, *s = NULL;
+  int tag_size, special_size;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    tag_size = e_nall;
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    tag_size = 0;
+    special_size = 0;
+  }
+  const int * restrict const special = s;
+  const int * restrict const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const int * restrict const tag = atom->tag;
+
+  int * restrict const ilist = list->ilist;
+  int * restrict numneigh = list->numneigh;
+  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * restrict const stencil = list->stencil;
+  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  // mask and molecule are only passed to exclusion(), which is compiled
+  // out when _LMP_INTEL_OFFLOAD is defined
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  int * const molecule = atom->molecule;
+  #endif
+
+  // thread count, overflow flags, and timer depend on where the build runs
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+
+  // local copies of the bin geometry, converted to flt_t
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  // Make sure dummy coordinates to eliminate loop remainder not within cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi0);
+    const flt_t dy = (INTEL_BIGP - bboxhi1);
+    const flt_t dz = (INTEL_BIGP - bboxhi2);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  // local copies of class members that are named in the offload clauses
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * restrict const binhead = this->binhead;
+  const int * restrict const special_flag = this->special_flag;
+  const int nbinx = this->nbinx;
+  const int nbiny = this->nbiny;
+  const int nbinz = this->nbinz;
+  const int mbinxlo = this->mbinxlo;
+  const int mbinylo = this->mbinylo;
+  const int mbinzlo = this->mbinzlo;
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * restrict const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,e_nall,offload)\
+    in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload_end) \
+    in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
+    in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(numneigh)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    // starting values for the owned/ghost index ranges kept in overflow[]
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh, overflow)
+    #endif
+    {
+      // per-thread min/max of the owned and ghost neighbor indices stored
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
+      #endif
+      
+      // IP_PRE_omp_range_id presumably yields tid and this thread's slice
+      // of num atoms (TODO confirm); the slice is then shifted by astart
+      const int num = aend - astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+
+      // this thread writes into firstneigh starting at offset ct;
+      // list_size is the bound used by the overflow check below
+      const int list_size = (ito + tid + 1) * maxnbors;
+      int ct = (ifrom + tid) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+
+        // owned neighbors fill slots from 0, ghost neighbors from maxnbors
+        n = 0;
+	n2 = maxnbors;
+
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        // row offset of itype in the flattened cutneighsq table
+        const int ioffset = ntypes * itype;
+
+        // loop over rest of atoms in i's bin, ghosts are at end of linked list
+        // if j is owned atom, store it, since j is beyond i in linked list
+        // if j is ghost, only store if j coords are "above/to the right" of i
+
+        for (j = bins[i]; j >= 0; j = bins[j]) {
+          if (j >= nlocal) {
+            if (offload_noghost && offload) continue;
+            if (x[j].z < ztmp) continue;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) continue;
+              if (x[j].y == ytmp && x[j].x < xtmp) continue;
+            }
+          } else if (offload_noghost && i < offload_end) continue;
+
+          jtype = x[j].w;
+          #ifndef _LMP_INTEL_OFFLOAD
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	  #endif
+
+          delx = xtmp - x[j].x;
+          dely = ytmp - x[j].y;
+          delz = ztmp - x[j].z;
+          rsq = delx * delx + dely * dely + delz * delz;
+
+          if (rsq <= cutneighsq[ioffset + jtype]) {
+	    if (j < nlocal) {
+	      neighptr[n++] = j;
+	      #ifdef _LMP_INTEL_OFFLOAD
+	      if (j < lmin) lmin = j;
+	      if (j > lmax) lmax = j;
+              #endif
+	    } else {
+	      neighptr[n2++] = j;
+	      #ifdef _LMP_INTEL_OFFLOAD
+	      if (j < gmin) gmin = j;
+	      if (j > gmax) gmax = j;
+              #endif
+            }
+	  }
+        }
+        // loop over all atoms in other bins in stencil, store every pair
+
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+                          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz,
+                          nbinx, nbiny, nbinz, mbinx, mbiny, mbinz,
+                          mbinxlo, mbinylo, mbinzlo);
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+            // same owned/ghost skipping rules as in the own-bin loop above
+            if (offload_noghost) {
+              if (j < nlocal) {
+                if (i < offload_end) continue;
+              } else if (offload) continue;
+            }
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	    #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+	      if (j < nlocal) {
+		neighptr[n++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+		if (j < lmin) lmin = j;
+		if (j > lmax) lmax = j;
+                #endif
+	      } else {
+		neighptr[n2++] = j;
+	        #ifdef _LMP_INTEL_OFFLOAD
+		if (j < gmin) gmin = j;
+		if (j > gmax) gmax = j;
+                #endif
+	      }
+	    }
+          }
+        }
+        ilist[i] = i;
+
+        // cnumneigh[i] = offset of atom i's list within firstneigh
+        cnumneigh[i] = ct;
+        // owned neighbors have run into the slots used for ghost neighbors
+        if (n > maxnbors) *overflow = 1;
+        // move the ghost neighbors so they directly follow the owned ones
+	for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
+        // pad the list with index e_nall up to a multiple of pad_width
+        while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
+        numneigh[i] = n;
+        // round n up to a multiple of INTEL_DATA_ALIGN / sizeof(int) entries
+        // before advancing to the next atom's list
+	while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
+        ct += n;
+        neighptr += n;
+        // flag overflow and restart the offset at the beginning of this
+        // thread's region when the bound check against list_size fails
+        // NOTE(review): ct is reset here but neighptr is not - confirm that
+        // later writes through neighptr cannot run past the buffer
+	if (ct + n + maxnbors > list_size) {
+	  *overflow = 1;
+	  ct = (ifrom + tid) * maxnbors;
+	}
+      }
+
+      // on overflow, zero the neighbor counts of this thread's atoms
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      // merge the per-thread index ranges into the shared overflow[] entries
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+  	  if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+	  if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+	  if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+	  if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+	#pragma omp barrier
+      }
+
+      // with separate_buffers, stored ghost indices are shifted down by
+      // ghost_offset and the padding index is replaced by nall_offset
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+	int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+	if (nghost < 0) nghost = 0;
+	if (offload) {
+	  ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+	  nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+	} else {
+	  ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+	  nall_offset = nlocal + nghost;
+	}
+      }
+      #endif
+
+      // second pass over the finished lists of this thread's atoms
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            // ofind_special presumably sets which for the i-j pair
+            // (TODO confirm); a positive which is XORed into the stored
+            // index after shifting it left by SBBITS
+            ofind_special(which, special, nspecial, i, tag[j],
+                          special_flag);
+	    #ifdef _LMP_INTEL_OFFLOAD
+	    if (j >= nlocal) {
+	      if (j == e_nall)
+		jlist[jj] = nall_offset;
+	      else if (which > 0) 
+		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	      else jlist[jj]-=ghost_offset;
+            } else
+	    #endif
+            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  // skip the leading owned entries, then remap everything after them
+	  int jj = 0;
+	  for (jj = 0; jj < jnum; jj++)
+	    if (jlist[jj] >= nlocal) break;
+	  while (jj < jnum) {
+	    if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+	    else jlist[jj] -= ghost_offset;
+	    jj++;
+	  }
+	}
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    // host copies for atoms 0..aend-1: identity ilist and zero counts
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    // point each atom's list->firstneigh entry at its list in the buffer
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+				    fix->host_used_local(),
+				    fix->host_min_ghost(),
+				    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with Newton's 3rd law for triclinic
+   each owned atom i checks its own bin and other bins in triclinic stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_newton_tri_intel(NeighList *list)
+{
+  // dispatcher only: chooses the hbnti<> instantiation that matches the
+  // precision mode of the Intel fix and the offload_noghost setting, then
+  // runs it with offload=1 on [0,off_end) and offload=0 on [host_start,nlocal)
+
+  const int n_owned = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = n_owned;
+
+  // the Intel fix provides the precision mode, buffers, and atom split
+  FixIntel *intel = static_cast<FixIntel *>(fix_intel);
+
+  const int split_end = intel->offload_end_neighbor();
+  int first_host = intel->host_start_neighbor();
+  int noghost = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (intel->full_host_list()) first_host = 0;
+  noghost = intel->offload_noghost();
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  // mixed precision: float data, double accumulation
+  if (intel->precision() == FixIntel::PREC_MODE_MIXED) {
+    if (noghost) {
+      hbnti<float,double,1>(1, list, intel->get_mixed_buffers(),
+                            0, split_end, intel);
+      hbnti<float,double,1>(0, list, intel->get_mixed_buffers(),
+                            first_host, n_owned, intel, split_end);
+    } else {
+      hbnti<float,double,0>(1, list, intel->get_mixed_buffers(),
+                            0, split_end, intel);
+      hbnti<float,double,0>(0, list, intel->get_mixed_buffers(),
+                            first_host, n_owned, intel);
+    }
+    return;
+  }
+
+  // double precision throughout
+  if (intel->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    if (noghost) {
+      hbnti<double,double,1>(1, list, intel->get_double_buffers(),
+                             0, split_end, intel);
+      hbnti<double,double,1>(0, list, intel->get_double_buffers(),
+                             first_host, n_owned, intel, split_end);
+    } else {
+      hbnti<double,double,0>(1, list, intel->get_double_buffers(),
+                             0, split_end, intel);
+      hbnti<double,double,0>(0, list, intel->get_double_buffers(),
+                             first_host, n_owned, intel);
+    }
+    return;
+  }
+
+  // any other mode: single precision throughout
+  if (noghost) {
+    hbnti<float,float,1>(1, list, intel->get_single_buffers(),
+                         0, split_end, intel);
+    hbnti<float,float,1>(0, list, intel->get_single_buffers(),
+                         first_host, n_owned, intel, split_end);
+  } else {
+    hbnti<float,float,0>(1, list, intel->get_single_buffers(),
+                         0, split_end, intel);
+    hbnti<float,float,0>(0, list, intel->get_single_buffers(),
+                         first_host, n_owned, intel);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   worker called by half_bin_newton_tri_intel() to build the neighbor
+     lists of atoms astart <= i < aend
+   flt_t, acc_t = template types of the IntelBuffers object in buffers_in
+   offload_noghost = if set, ghost neighbors are skipped when offload is
+     set and owned neighbors are skipped for atoms i < offload_end
+   offload = if set, atoms are packed and binned first and the build is
+     wrapped in the offload pragma (_LMP_INTEL_OFFLOAD builds only);
+     if not set, list->firstneigh[] is filled in for the atoms on return
+   fix_in = FixIntel object supplying timers and overflow flags
+   layout of each atom's list: owned neighbors, then ghost neighbors,
+     then padding entries (index e_nall) up to a multiple of pad_width
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t, int offload_noghost>
+void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
+                     const int astart, const int aend, void *fix_in,
+		     const int offload_end) {
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  // each atom's neighbor count is padded to a multiple of pad (set below)
+  int pad = 1;
+
+  if (offload) {
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+
+    // store a dummy atom with INTEL_BIGP coordinates at index nall
+    ATOM_T biga;
+    biga.x = INTEL_BIGP;
+    biga.y = INTEL_BIGP;
+    biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall]=biga;
+
+    // every thread calls thr_pack() on its own [ifrom,ito) slice of nall
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x());
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }
+  const int pad_width = pad;
+
+  // empty atom range: stop the timer and leave
+  if (aend-astart == 0) {
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * restrict const x = buffers->get_x();
+  int * restrict const firstneigh = buffers->firstneigh(list);
+  // e_nall = atom count used for transfer lengths and as the padding index
+  // it is limited to owned atoms when ghosts are left out of the offload pass
+  int nall_t = nall;
+  if (offload_noghost && offload) nall_t = atom->nlocal;
+  const int e_nall = nall_t;
+
+  // special-bond arrays are only used for molecular systems
+  // otherwise point at placeholder members and use zero transfer lengths
+  const int molecular = atom->molecular;
+  int *ns = NULL, *s = NULL;
+  int tag_size, special_size;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    tag_size = e_nall;
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    tag_size = 0;
+    special_size = 0;
+  }
+  const int * restrict const special = s;
+  const int * restrict const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const int * restrict const tag = atom->tag;
+
+  int * restrict const ilist = list->ilist;
+  int * restrict numneigh = list->numneigh;
+  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * restrict const stencil = list->stencil;
+  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  // mask and molecule are only passed to exclusion(), which is compiled
+  // out when _LMP_INTEL_OFFLOAD is defined
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  int * const molecule = atom->molecule;
+  #endif
+
+  // thread count, overflow flags, and timer depend on where the build runs
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+
+  // local copies of the bin geometry, converted to flt_t
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  // Make sure dummy coordinates to eliminate loop remainder not within cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi0);
+    const flt_t dy = (INTEL_BIGP - bboxhi1);
+    const flt_t dz = (INTEL_BIGP - bboxhi2);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  // local copies of class members that are named in the offload clauses
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * restrict const binhead = this->binhead;
+  const int * restrict const special_flag = this->special_flag;
+  const int nbinx = this->nbinx;
+  const int nbiny = this->nbiny;
+  const int nbinz = this->nbinz;
+  const int mbinxlo = this->mbinxlo;
+  const int mbinylo = this->mbinylo;
+  const int mbinzlo = this->mbinzlo;
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * restrict const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,offload_end) \
+    in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,e_nall,offload) \
+    in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
+    in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(numneigh)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    // starting values for the owned/ghost index ranges kept in overflow[]
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh, overflow)
+    #endif
+    {
+      // per-thread min/max of the owned and ghost neighbor indices stored
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
+      #endif
+
+      // IP_PRE_omp_range_id presumably yields tid and this thread's slice
+      // of num atoms (TODO confirm); the slice is then shifted by astart
+      const int num = aend-astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom,ito,tid,num,nthreads);
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+
+      // this thread writes into firstneigh starting at offset ct;
+      // list_size is the bound used by the overflow check below
+      const int list_size = (ito + tid + 1) * maxnbors;
+      int ct = (ifrom + tid) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+
+        // owned neighbors fill slots from 0, ghost neighbors from maxnbors
+        n = 0;
+        n2 = maxnbors;
+
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        // row offset of itype in the flattened cutneighsq table
+        const int ioffset = ntypes * itype;
+
+        // loop over all atoms in bins in stencil
+        // pairs for atoms j "below" i are excluded
+        // below = lower z or (equal z and lower y) or (equal zy and lower x)
+        //         (equal zyx and j <= i)
+        // latter excludes self-self interaction but allows superposed atoms
+
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+                          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz,
+                          nbinx, nbiny, nbinz, mbinx, mbiny, mbinz,
+                          mbinxlo, mbinylo, mbinzlo);
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+            // with offload_noghost: skip owned j for i < offload_end and
+            // skip ghost j when offload is set
+	    if (offload_noghost) {
+              if (j < nlocal) {
+                if (i < offload_end) continue;
+              } else if (offload) continue;
+            }
+
+            if (x[j].z < ztmp) continue;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) continue;
+              if (x[j].y == ytmp) {
+                if (x[j].x < xtmp) continue;
+                if (x[j].x == xtmp && j <= i) continue;
+              }
+            }
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	    #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+              if (j < nlocal) {
+                neighptr[n++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+		if (j < lmin) lmin = j;
+		if (j > lmax) lmax = j;
+                #endif
+	      }  else {
+                neighptr[n2++] = j;
+  	        #ifdef _LMP_INTEL_OFFLOAD
+		if (j < gmin) gmin = j;
+		if (j > gmax) gmax = j;
+                #endif
+	      }
+            }
+          }
+        }
+        ilist[i] = i;
+
+        // cnumneigh[i] = offset of atom i's list within firstneigh
+        cnumneigh[i] = ct;
+        // owned neighbors have run into the slots used for ghost neighbors
+        if (n > maxnbors) *overflow = 1;
+        // move the ghost neighbors so they directly follow the owned ones
+        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
+        // pad the list with index e_nall up to a multiple of pad_width
+        while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
+        numneigh[i] = n;
+        // round n up to a multiple of INTEL_DATA_ALIGN / sizeof(int) entries
+        // before advancing to the next atom's list
+        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
+        ct += n;
+        neighptr += n;
+        // flag overflow and restart the offset at the beginning of this
+        // thread's region when the bound check against list_size fails
+        // NOTE(review): ct is reset here but neighptr is not - confirm that
+        // later writes through neighptr cannot run past the buffer
+        if (ct + n + maxnbors > list_size) {
+          *overflow = 1;
+          ct = (ifrom + tid) * maxnbors;
+        }
+      }
+
+      // on overflow, zero the neighbor counts of this thread's atoms
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      // merge the per-thread index ranges into the shared overflow[] entries
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+        #pragma omp barrier
+      }
+
+      // with separate_buffers, stored ghost indices are shifted down by
+      // ghost_offset and the padding index is replaced by nall_offset
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+	} else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      }
+      #endif
+
+      // second pass over the finished lists of this thread's atoms
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            // ofind_special presumably sets which for the i-j pair
+            // (TODO confirm); a positive which is XORed into the stored
+            // index after shifting it left by SBBITS
+            ofind_special(which, special, nspecial, i, tag[j], special_flag);
+            #ifdef _LMP_INTEL_OFFLOAD
+	    if (j >= nlocal) {
+	      if (j == e_nall) 
+		jlist[jj] = nall_offset;
+	      else if (which > 0) 
+		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	      else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  // skip the leading owned entries, then remap everything after them
+	  int jj = 0;
+	  for (jj = 0; jj < jnum; jj++)
+	    if (jlist[jj] >= nlocal) break;
+	  while (jj < jnum) {
+	    if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+	    else jlist[jj] -= ghost_offset;
+	    jj++;
+	  }
+	}
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    // host copies for atoms 0..aend-1: identity ilist and zero counts
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    // point each atom's list->firstneigh entry at its list in the buffer
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+				    fix->host_used_local(),
+				    fix->host_min_ghost(),
+				    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
new file mode 100644
index 000000000..46e608c92
--- /dev/null
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -0,0 +1,1075 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gayberne_intel.h"
+#include "math_extra_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "atom_vec_ellipsoid.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1  // shorthands for the packed constant structs;
+#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2  // each expands to a dependent type name, so they
+#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3  // are usable only where flt_t is a type in scope
+
+/* ---------------------------------------------------------------------- */
+
+PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) : PairGayBerne(lmp)
+{
+  // build on the base Gay-Berne style, clear respa_enable and
+  // add the INTEL bit to the suffix flags
+  respa_enable = 0;
+  suffix_flag = suffix_flag | Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGayBerneIntel::compute(int eflag, int vflag)
+{
+  // pick the templated kernel that matches the precision mode of the
+  // intel fix; mixed and single precision both use force_const_single
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  } else {
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+
+  fix->balance_stamp();
+
+  // always leave the fdotr virial flag cleared on exit
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairGayBerneIntel::compute(int eflag, int vflag,
+                                IntelBuffers<flt_t,acc_t> *buffers,
+                                const ForceConst<flt_t> &fc)
+{
+  // call ev_setup when energy or virial output is requested,
+  // otherwise clear evflag and vflag_fdotr
+  if (eflag || vflag) ev_setup(eflag, vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // when the fix does not use separate buffers, run the threaded pack
+  // and copy the quaternions of all ellipsoidal atoms right here
+  if (fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
+    const int * const ellipsoid = atom->ellipsoid;
+    QUAT_T * restrict const quat = buffers->get_quat();
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
+
+      for (int n = ifrom; n < ito; n++) {
+        // a negative ellipsoid index means no bonus data for this atom
+        const int ibonus = ellipsoid[n];
+        if (ibonus < 0) continue;
+        const double * const q = bonus[ibonus].quat;
+        quat[n].w = q[0];
+        quat[n].i = q[1];
+        quat[n].j = q[2];
+        quat[n].k = q[3];
+      }
+    }
+    // slot nall gets the quaternion (1,0,0,0)
+    quat[nall].w = (flt_t)1.0;
+    quat[nall].i = (flt_t)0.0;
+    quat[nall].j = (flt_t)0.0;
+    quat[nall].k = (flt_t)0.0;
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // each case runs the offload range [0,offload_end) first and the
+  // host range [host_start,inum) second
+
+  // no energy/virial tally requested
+  if (!evflag && !vflag_fdotr) {
+    if (force->newton_pair) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+    return;
+  }
+
+  // virial mode handed to eval: 2 = fdotr, 1 = per-pair, 0 = none
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+
+  if (eflag && force->newton_pair) {
+    eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (eflag) {
+    eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (force->newton_pair) {
+    eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else {
+    eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairGayBerneIntel::eval(const int offload, const int vflag,
+                             IntelBuffers<flt_t,acc_t> *buffers,
+                             const ForceConst<flt_t> &fc,
+                             const int astart, const int aend)
+{ /* Pair kernel for the atoms with indices astart <= i < aend.
+     offload != 0 selects the coprocessor buffers and timers, 0 the host.
+     EVFLAG/EFLAG enable the energy/virial tallies; vflag == 1 accumulates
+     the per-pair virial, vflag == 2 sums f*x after the thread reduction.
+     NEWTON_PAIR != 0 applies the reaction force and torque to every
+     neighbor j, otherwise only to neighbors with j < nlocal.
+     Only pairs whose form is ELLIPSE_ELLIPSE are evaluated; if a neighbor
+     with any other form is met, error code 2 is stored in f_start[1].w.
+     Results are handed to the fix through add_result_array(). */
+  const int inum = aend - astart;
+  if (inum == 0) return;  // empty atom range: nothing to do
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);  // sets nlocal, nall, minlocal for this target
+
+  const int ago = neighbor->ago;
+  ATOM_T * restrict const x = buffers->get_x(offload);
+  QUAT_T * restrict const quat = buffers->get_quat(offload);
+  const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
+  const int *ellipsoid = atom->ellipsoid;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->separate_buffers()) {  // pack and copy quaternions into the buffer chosen by offload
+    fix->start_watch(TIME_PACK);					
+    if (offload) {
+      #pragma omp parallel default(none) \
+	shared(buffers,nlocal,nall,bonus,ellipsoid)
+      {									
+        int ifrom, ito, tid;						
+	int nthreads = comm->nthreads;					
+	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,		
+				  nthreads, sizeof(ATOM_T));		
+	if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
+	for (int i = ifrom; i < ito; i++) {  // local atoms
+	  int qi = ellipsoid[i];
+	  if (qi > -1) {  // qi indexes bonus; negative values are skipped
+	    quat[i].w = bonus[qi].quat[0];
+	    quat[i].i = bonus[qi].quat[1];
+	    quat[i].j = bonus[qi].quat[2];
+	    quat[i].k = bonus[qi].quat[3];
+	  }
+	}
+	int nghost = nall - nlocal;
+	if (nghost) {
+	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,		
+				 nthreads, sizeof(ATOM_T));			
+	  int offset = 0;
+	  ifrom += nlocal;  // shift the ghost sub-range so it starts after the locals
+	  ito += nlocal;
+	  if (ago != 0) {
+	    offset = fix->offload_min_ghost() - nlocal;
+	    buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
+	  }
+	  for (int i = ifrom; i < ito; i++) {
+	    int qi = ellipsoid[i + offset];  // source atom index is the buffer index plus offset
+	    if (qi > -1) {
+	      quat[i].w = bonus[qi].quat[0];
+	      quat[i].i = bonus[qi].quat[1];
+	      quat[i].j = bonus[qi].quat[2];
+	      quat[i].k = bonus[qi].quat[3];
+	    }
+	  }
+	}
+      }									
+    } else {
+      if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
+      for (int i = fix->host_min_local(); i < nlocal; i++) {  // host path: serial copy, locals first
+	int qi = ellipsoid[i];
+	if (qi > -1) {
+	  quat[i].w = bonus[qi].quat[0];
+	  quat[i].i = bonus[qi].quat[1];
+	  quat[i].j = bonus[qi].quat[2];
+	  quat[i].k = bonus[qi].quat[3];
+	}
+      }
+      int offset = fix->host_min_ghost() - nlocal;
+      if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
+      for (int i = nlocal; i < nall; i++) {  // then ghosts, read at index i + offset
+	int qi = ellipsoid[i + offset];
+	if (qi > -1) {
+	  quat[i].w = bonus[qi].quat[0];
+	  quat[i].i = bonus[qi].quat[1];
+	  quat[i].j = bonus[qi].quat[2];
+	  quat[i].k = bonus[qi].quat[3];
+	}
+      }
+    }									
+    fix->stop_watch(TIME_PACK);						
+  }									
+  #endif
+
+  //  const int * restrict const ilist = list->ilist;
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const flt_t * restrict const special_lj = fc.special_lj;
+
+  const FC_PACKED1_T * restrict const ijc = fc.ijc[0];  // row 0 pointer; indexed below as itype*ntypes + jtype
+  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
+  const FC_PACKED3_T * restrict const ic = fc.ic;
+  const flt_t mu = fc.mu;
+  const flt_t gamma = fc.gamma;
+  const flt_t upsilon = fc.upsilon;
+
+  flt_t * const rsq_formi = fc.rsq_form[0];  // scratch arrays; each thread uses the slice at tid * max_nbors
+  flt_t * const delx_formi = fc.delx_form[0];
+  flt_t * const dely_formi = fc.dely_form[0];
+  flt_t * const delz_formi = fc.delz_form[0];
+  int * const jtype_formi = fc.jtype_form[0];
+  int * const jlist_formi = fc.jlist_form[0];
+
+  const int ntypes = atom->ntypes + 1;  // note: one more than atom->ntypes, used as the table row stride
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int max_nbors = _max_nbors;
+  const int nthreads = tc;
+
+  int pad = 1;  // packed neighbor count is rounded up to a multiple of this
+  if (offload) {
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);  // pad constant scaled by sizeof(float)/sizeof(flt_t)
+  } else {
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }    
+  const int pad_width = pad;
+  
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj:length(0) alloc_if(0) free_if(0)) \
+    in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
+    in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
+    in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(quat:length(nall+1) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
+    in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute=MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_flag) {							
+      if (separate_flag < 3) {  // recompute nlocal/nall from the overflow[] bounds and compact ghosts behind the locals
+	int all_local = nlocal;						
+	int ghost_min = overflow[LMP_GHOST_MIN];				
+	nlocal = overflow[LMP_LOCAL_MAX] + 1;				
+	int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
+	if (nghost < 0) nghost = 0;
+	nall = nlocal + nghost;
+	separate_flag--;							
+	int flength;							
+	if (NEWTON_PAIR) flength = nall;					
+	else flength = nlocal;						
+	IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),		
+			     separate_flag);				
+	if (nghost) {
+	  if (nlocal < all_local || ghost_min > all_local) {			
+	    memmove(x + nlocal, x + ghost_min,
+		    (nall - nlocal) * sizeof(ATOM_T));			
+	    memmove(quat + nlocal, quat + ghost_min,
+		    (nall - nlocal) * sizeof(QUAT_T));			
+	  }
+	}
+      }	
+      x[nall].x = (flt_t)INTEL_BIGP;  // slot nall: position INTEL_BIGP and quaternion (1,0,0,0); nall is the pad index below
+      x[nall].y = (flt_t)INTEL_BIGP;
+      x[nall].z = (flt_t)INTEL_BIGP;
+      quat[nall].w = (flt_t)1.0;
+      quat[nall].i = (flt_t)0.0;
+      quat[nall].j = (flt_t)0.0;
+      quat[nall].k = (flt_t)0.0;
+    }				
+    #endif
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;  // NOTE(review): left unset when EVFLAG is 0 (ov* also when vflag is 0) yet named in the reduction below - confirm intended
+    if (EVFLAG) {
+      oevdwl = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;  // convert the thread's sub-range to absolute atom indices
+      iito += astart;
+
+      FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride);  // this thread's slice; two FORCE_T entries per atom, biased by minlocal
+      memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));  // zero the whole slice before accumulating
+
+      flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors;
+      flt_t * restrict const delx_form = delx_formi + tid * max_nbors;
+      flt_t * restrict const dely_form = dely_formi + tid * max_nbors;
+      flt_t * restrict const delz_form = delz_formi + tid * max_nbors;
+      int * restrict const jtype_form = jtype_formi + tid * max_nbors;
+      int * restrict const jlist_form = jlist_formi + tid * max_nbors;
+
+      int ierror = 0;
+      for (int i = iifrom; i < iito; ++i) {
+        // const int i = ilist[ii];
+        const int itype = x[i].w;  // atom type is read from the w component of x
+        const int ptr_off = itype * ntypes;
+        const FC_PACKED1_T * restrict const ijci = ijc + ptr_off;
+        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+
+        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+
+        flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
+        flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
+        flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
+
+        if (ijci[itype].form == ELLIPSE_ELLIPSE) {  // a1 from quat[i]; b1 and g1 built from a1 with well / shape2 of itype
+          flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
+          ME_quat_to_mat_trans(quat[i],a1);
+          ME_diag_times3(ic[itype].well,a1,temp);
+          ME_transpose_times3(a1,temp,b1);
+          ME_diag_times3(ic[itype].shape2,a1,temp);
+          ME_transpose_times3(a1,temp,g1);
+        }
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+        fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
+
+        if (EVFLAG) {
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        }
+
+        bool multiple_forms = false;  // set when a neighbor pair is not ELLIPSE_ELLIPSE
+        int packed_j = 0;
+        for (int jj = 0; jj < jnum; jj++) {  // pass 1: pack in-cutoff ELLIPSE_ELLIPSE neighbors into the scratch arrays
+          int jm = jlist[jj];
+          int j = jm & NEIGHMASK;
+          const int jtype = x[j].w;
+
+          if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
+            flt_t delx = x[j].x-xtmp;
+            flt_t dely = x[j].y-ytmp;
+            flt_t delz = x[j].z-ztmp;
+            flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+            if (rsq < ijci[jtype].cutsq) {
+              rsq_form[packed_j] = rsq;
+              delx_form[packed_j] = delx;
+              dely_form[packed_j] = dely;
+              delz_form[packed_j] = delz;
+              jtype_form[packed_j] = jtype;
+              jlist_form[packed_j] = jm;  // keeps the unmasked value so the special-bond bits survive
+              packed_j++;
+            }
+          } else
+            multiple_forms = true;
+        }
+	while( (packed_j % pad_width) != 0 )  // pad with index nall up to a multiple of pad_width
+	  jlist_form[packed_j++] = nall;
+
+        // -------------------------------------------------------------
+
+	#ifdef __MIC__
+	__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
+	__assume(packed_j % 8 == 0);
+	__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
+	#endif
+        #pragma vector aligned
+	#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
+	                         sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
+        for (int jj = 0; jj < packed_j; jj++) {  // pass 2: vectorized evaluation of the packed pairs
+          flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
+          flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
+          flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
+          flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
+          flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
+          flt_t rtor_0, rtor_1, rtor_2;
+
+	  const int sbindex = jlist_form[jj] >> SBBITS & 3;  // 2-bit special-bond index taken from above SBBITS
+	  const int j = jlist_form[jj] & NEIGHMASK;
+          flt_t factor_lj = special_lj[sbindex];
+          const int jtype = jtype_form[jj];
+	  const flt_t sigma = ijci[jtype].sigma;
+	  const flt_t epsilon = ijci[jtype].epsilon;
+	  const flt_t shape2_0 = ic[jtype].shape2[0];
+	  const flt_t shape2_1 = ic[jtype].shape2[1];
+	  const flt_t shape2_2 = ic[jtype].shape2[2];
+          flt_t one_eng, evdwl;
+
+          ME_quat_to_mat_trans(quat[j], a2);
+          ME_diag_times3(ic[jtype].well, a2, temp);
+          ME_transpose_times3(a2, temp, b2);
+          ME_diag_times3a(shape2, a2, temp);
+          ME_transpose_times3(a2, temp, g2);
+	  
+          flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
+          flt_t temp1, temp2, temp3;
+
+          flt_t r12hat_0, r12hat_1, r12hat_2;
+          ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
+          flt_t r = sqrt(rsq_form[jj]);
+
+          // compute distance of closest approach
+	  
+          flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
+          ME_plus3(g1, g2, g12);
+          flt_t kappa_0, kappa_1, kappa_2;
+          ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
+		       kappa, ierror);
+	  
+          // tempv = G12^-1*r12hat
+
+          flt_t inv_r = (flt_t)1.0 / r;
+          tempv_0 = kappa_0 * inv_r;
+          tempv_1 = kappa_1 * inv_r;
+          tempv_2 = kappa_2 * inv_r;
+          flt_t sigma12 = ME_dot3(r12hat, tempv);
+          sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
+          flt_t h12 = r - sigma12;
+
+          // energy
+          // compute u_r
+
+          flt_t varrho = sigma / (h12 + gamma * sigma);
+          flt_t varrho6 = pow(varrho, (flt_t)6.0);
+          flt_t varrho12 = varrho6 * varrho6;
+          flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
+
+          // compute eta_12
+
+          flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
+          flt_t det_g12 = ME_det3(g12);
+          eta = pow(eta / det_g12, upsilon);
+
+          // compute chi_12
+
+          flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
+          flt_t iota_0, iota_1, iota_2;
+          ME_plus3(b1, b2, b12);
+          ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
+		       iota, ierror);
+
+          // tempv = B12^-1*r12hat
+
+          tempv_0 = iota_0 * inv_r;
+          tempv_1 = iota_1 * inv_r;
+          tempv_2 = iota_2 * inv_r;
+          flt_t chi = ME_dot3(r12hat, tempv);
+          chi = pow(chi * (flt_t)2.0, mu);
+
+          // force
+          // compute dUr/dr
+
+          temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) / 
+	    sigma;
+          temp1 = temp1 * (flt_t)24.0 * epsilon;
+          flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
+          flt_t dUr_0, dUr_1, dUr_2;
+          temp2 = ME_dot3(kappa, r12hat);
+          flt_t uslj_rsq = u_slj / rsq_form[jj];
+          dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
+          dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
+          dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
+
+          // compute dChi_12/dr
+
+          flt_t dchi_0, dchi_1, dchi_2;
+          temp1 = ME_dot3(iota, r12hat);
+          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * 
+	    pow(chi, (mu - (flt_t)1.0) / mu);
+          dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
+          dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
+          dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
+
+          temp1 = -eta * u_r;
+          temp2 = eta * chi;
+          fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
+          fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
+          fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
+
+          // torque for particle 1 and 2
+          // compute dUr
+
+          tempv_0 = -uslj_rsq * kappa_0;
+          tempv_1 = -uslj_rsq * kappa_1;
+          tempv_2 = -uslj_rsq * kappa_2;
+          ME_vecmat(kappa, g1, tempv2);
+          ME_cross3(tempv, tempv2, dUr);
+          flt_t dUr2_0, dUr2_1, dUr2_2;
+
+          if (NEWTON_PAIR || j < nlocal) {  // particle-2 terms only when j will receive a reaction
+            ME_vecmat(kappa, g2, tempv2);
+            ME_cross3(tempv, tempv2, dUr2);
+          }
+
+          // compute d_chi
+
+          ME_vecmat(iota, b1, tempv);
+          ME_cross3(tempv, iota, dchi);
+          temp1 = (flt_t)-4.0 / rsq_form[jj];
+          dchi_0 *= temp1;
+          dchi_1 *= temp1;
+          dchi_2 *= temp1;
+          flt_t dchi2_0, dchi2_1, dchi2_2;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            ME_vecmat(iota, b2, tempv);
+            ME_cross3(tempv, iota, dchi2);
+            dchi2_0 *= temp1;
+            dchi2_1 *= temp1;
+            dchi2_2 *= temp1;
+          }
+
+          // compute d_eta
+
+          flt_t deta_0, deta_1, deta_2;
+          deta_0 = deta_1 = deta_2 = (flt_t)0.0;
+          ME_compute_eta_torque(g12, a1, shape2, temp);
+          temp1 = -eta * upsilon;
+
+          tempv_0 = temp1 * temp_0;
+          tempv_1 = temp1 * temp_1;
+          tempv_2 = temp1 * temp_2;
+          ME_mv0_cross3(a1, tempv, tempv2);
+          deta_0 += tempv2_0;
+          deta_1 += tempv2_1;
+          deta_2 += tempv2_2;
+
+          tempv_0 = temp1 * temp_3;
+          tempv_1 = temp1 * temp_4;
+          tempv_2 = temp1 * temp_5;
+          ME_mv1_cross3(a1, tempv, tempv2);
+          deta_0 += tempv2_0;
+          deta_1 += tempv2_1;
+          deta_2 += tempv2_2;
+
+          tempv_0 = temp1 * temp_6;
+          tempv_1 = temp1 * temp_7;
+          tempv_2 = temp1 * temp_8;
+          ME_mv2_cross3(a1, tempv, tempv2);
+          deta_0 += tempv2_0;
+          deta_1 += tempv2_1;
+          deta_2 += tempv2_2;
+
+          // compute d_eta for particle 2
+
+          flt_t deta2_0, deta2_1, deta2_2;
+          if (NEWTON_PAIR || j < nlocal) {
+            deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
+            ME_compute_eta_torque(g12, a2, shape2, temp);
+
+            tempv_0 = temp1 * temp_0;
+            tempv_1 = temp1 * temp_1;
+            tempv_2 = temp1 * temp_2;
+            ME_mv0_cross3(a2, tempv, tempv2);
+            deta2_0 += tempv2_0;
+            deta2_1 += tempv2_1;
+            deta2_2 += tempv2_2;
+
+            tempv_0 = temp1 * temp_3;
+            tempv_1 = temp1 * temp_4;
+            tempv_2 = temp1 * temp_5;
+            ME_mv1_cross3(a2, tempv, tempv2);
+            deta2_0 += tempv2_0;
+            deta2_1 += tempv2_1;
+            deta2_2 += tempv2_2;
+
+            tempv_0 = temp1 * temp_6;
+            tempv_1 = temp1 * temp_7;
+            tempv_2 = temp1 * temp_8;
+            ME_mv2_cross3(a2, tempv, tempv2);
+            deta2_0 += tempv2_0;
+            deta2_1 += tempv2_1;
+            deta2_2 += tempv2_2;
+          }
+
+          // torque
+
+          temp1 = u_r * eta;
+          temp2 = u_r * chi;
+          temp3 = chi * eta;
+
+          ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) * 
+	    (flt_t)-1.0;
+          ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) * 
+	    (flt_t)-1.0;
+          ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * 
+	    (flt_t)-1.0;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * 
+	      (flt_t)-1.0;
+            rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * 
+	      (flt_t)-1.0;
+            rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) * 
+	      (flt_t)-1.0;
+          }
+
+          one_eng = temp1 * chi;  // u_r * eta * chi
+	  #ifndef __MIC__
+	  if (jlist_form[jj] == nall) {  // padding entry: zero its contribution instead of branching around the tally
+	    one_eng = (flt_t)0.0;
+	    fforce_0 = 0.0;
+	    fforce_1 = 0.0;
+	    fforce_2 = 0.0;
+	    ttor_0 = 0.0;
+	    ttor_1 = 0.0;
+	    ttor_2 = 0.0;
+	    rtor_0 = 0.0;
+	    rtor_1 = 0.0;
+	    rtor_2 = 0.0;
+	  }
+	  #endif
+
+          fforce_0 *= factor_lj;
+          fforce_1 *= factor_lj;
+          fforce_2 *= factor_lj;
+          ttor_0 *= factor_lj;
+          ttor_1 *= factor_lj;
+          ttor_2 *= factor_lj;
+
+	  #ifdef __MIC__
+	  if (jlist_form[jj] < nall) {  // on MIC the padding entries are skipped by this test instead
+	  #endif
+	    fxtmp += fforce_0;
+	    fytmp += fforce_1;
+	    fztmp += fforce_2;
+	    t1tmp += ttor_0;
+	    t2tmp += ttor_1;
+	    t3tmp += ttor_2;
+
+	    if (NEWTON_PAIR || j < nlocal) {
+	      rtor_0 *= factor_lj;
+	      rtor_1 *= factor_lj;
+	      rtor_2 *= factor_lj;
+	      int jp = j * 2;  // entry 2*j receives the force, entry 2*j+1 the torque
+	      f[jp].x -= fforce_0;
+	      f[jp].y -= fforce_1;
+	      f[jp].z -= fforce_2;
+	      jp++;
+	      f[jp].x += rtor_0;
+	      f[jp].y += rtor_1;
+	      f[jp].z += rtor_2;
+	    }
+	  
+	    if (EVFLAG) {
+	      flt_t ev_pre = (flt_t)0;  // tally weight: 0.5 for each of i and j that qualifies
+	      if (NEWTON_PAIR || i < nlocal)
+		ev_pre += (flt_t)0.5;
+	      if (NEWTON_PAIR || j < nlocal)
+		ev_pre += (flt_t)0.5;
+
+	      if (EFLAG) {
+		evdwl = factor_lj * one_eng;
+		sevdwl += ev_pre * evdwl;
+		if (eatom) {  // per-atom energy is accumulated in the w component of the force entry
+		  if (NEWTON_PAIR || i < nlocal)
+		    fwtmp += (flt_t)0.5 * evdwl;
+		  if (NEWTON_PAIR || j < nlocal)
+		    f[j*2].w += (flt_t)0.5 * evdwl;
+		}
+	      }
+	      
+	      if (vflag == 1) {
+		ev_pre *= (flt_t)-1.0;
+		sv0 += ev_pre * delx_form[jj] * fforce_0;
+		sv1 += ev_pre * dely_form[jj] * fforce_1;
+		sv2 += ev_pre * delz_form[jj] * fforce_2;
+		sv3 += ev_pre * delx_form[jj] * fforce_1;
+		sv4 += ev_pre * delx_form[jj] * fforce_2;
+		sv5 += ev_pre * dely_form[jj] * fforce_2;
+	      }
+	    } // EVFLAG
+	  #ifdef __MIC__
+	  }
+	  #endif
+        } // for jj
+
+        // -------------------------------------------------------------
+
+        if (multiple_forms)
+          ierror = 2;  // a non ELLIPSE_ELLIPSE neighbor was seen for this atom
+
+        int ip = i * 2;
+        f[ip].x += fxtmp;
+        f[ip].y += fytmp;
+        f[ip].z += fztmp;
+        ip++;
+        f[ip].x += t1tmp;
+        f[ip].y += t2tmp;
+        f[ip].z += t3tmp;
+
+        if (EVFLAG) {
+          if (EFLAG) {
+            if (eatom) f[i * 2].w += fwtmp;
+            oevdwl += sevdwl;
+          }
+          if (vflag == 1) {
+            ov0 += sv0;
+            ov1 += sv1;
+            ov2 += sv2;
+            ov3 += sv3;
+            ov4 += sv4;
+            ov5 += sv5;
+          }
+        }
+      } // for i
+      int o_range;  // number of atoms whose per-thread entries are summed below
+      if (NEWTON_PAIR)
+        o_range = nall;
+      else
+        o_range = nlocal;
+      if (offload == 0) o_range -= minlocal;
+      IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, 
+			     sizeof(FORCE_T));
+      const int two_iito = iito * 2;
+
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+
+      acc_t *facc = &(f_start[0].x);  // NOTE(review): facc, sto and fst4 are not referenced again in this function
+      const int sto = two_iito * 4;
+      const int fst4 = f_stride * 4;
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      int t_off = f_stride;  // NOTE(review): two barriers precede this with only local declarations between them - second looks redundant, confirm
+      if (EFLAG && eatom) {
+        for (int t = 1; t < nthreads; t++) {  // add the slices of threads 1..nthreads-1 into slice 0, including w
+	  #pragma vector nontemporal
+          for (int n = iifrom * 2; n < two_iito; n++) {
+            f_start[n].x += f_start[n + t_off].x;
+            f_start[n].y += f_start[n + t_off].y;
+            f_start[n].z += f_start[n + t_off].z;
+            f_start[n].w += f_start[n + t_off].w;
+          }
+          t_off += f_stride;
+        }
+      } else {
+        for (int t = 1; t < nthreads; t++) {  // same reduction without the w (per-atom energy) component
+	  #pragma vector nontemporal
+          for (int n = iifrom * 2; n < two_iito; n++) {
+            f_start[n].x += f_start[n + t_off].x;
+            f_start[n].y += f_start[n + t_off].y;
+            f_start[n].z += f_start[n + t_off].z;
+          }
+          t_off += f_stride;
+        }
+      }
+
+      if (EVFLAG) {
+        if (vflag==2) {  // virial from the reduced forces: sums of f*x over this thread's atom range
+          const ATOM_T * restrict const xo = x + minlocal;
+	  #pragma vector nontemporal
+          for (int n = iifrom; n < iito; n++) {
+            const int nt2 = n * 2;
+            ov0 += f_start[nt2].x * xo[n].x;
+            ov1 += f_start[nt2].y * xo[n].y;
+            ov2 += f_start[nt2].z * xo[n].z;
+            ov3 += f_start[nt2].y * xo[n].x;
+            ov4 += f_start[nt2].z * xo[n].x;
+            ov5 += f_start[nt2].z * xo[n].y;
+          }
+        }
+      }
+
+      if (ierror)
+        f_start[1].w = ierror;  // error code is returned in the w component of entry 1
+    } // omp
+
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;  // layout: [0] evdwl, [1] set to zero, [2..7] six virial sums
+        ev_global[1] = (acc_t)0.0;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // offload
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload,eatom);  // pass forces plus the energy/virial array to the fix
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style:
+   run the base class init, mark the most recent neighbor request as an
+   intel request, look up the fix with id "package_intel" (error out if
+   absent) and pack the force constants for the active precision mode
+------------------------------------------------------------------------- */
+
+void PairGayBerneIntel::init_style()
+{
+  PairGayBerne::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  fix->set_offload_affinity();
+  if (force->newton_pair) fix->set_offload_noghost(1);
+  _cop = fix->coprocessor_number();
+  #else
+  // _cop is passed to ForceConst::set_ntypes() by pack_force_const() in
+  // every build, so give it a defined value when offload is compiled out;
+  // a negative number is what the offload code treats as "no coprocessor"
+  _cop = -1;
+  #endif
+
+  // mixed and single precision share force_const_single
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                         IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // tables hold ntypes+1 types; the thread count is the larger of the
+  // host thread count and the offload thread count of the buffers
+  int tp1 = atom->ntypes + 1;
+  _max_nbors = buffers->get_max_nbors();
+  int mthreads = buffers->get_off_threads();
+  if (mthreads < comm->nthreads) mthreads = comm->nthreads;
+  fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  const int ntypes = atom->ntypes;
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = itype; jtype <= ntypes; jtype++) {
+      const bool pair_set = (setflag[itype][jtype] != 0);
+      const bool both_diag_set =
+        (setflag[itype][itype] != 0) && (setflag[jtype][jtype] != 0);
+      if (!pair_set && !both_diag_set) continue;
+
+      const double cut = init_one(itype,jtype);
+      const double cutneigh = cut + neighbor->skin;
+      cutsq[jtype][itype] = cut*cut;
+      cutsq[itype][jtype] = cutsq[jtype][itype];
+      cutneighsq[jtype][itype] = cutneigh * cutneigh;
+      cutneighsq[itype][jtype] = cutneighsq[jtype][itype];
+    }
+  }
+
+  // slot 0 is always 1.0; slots 1-3 come from the force settings
+  fc.special_lj[0] = 1.0;
+  for (int k = 1; k < 4; k++)
+    fc.special_lj[k] = force->special_lj[k];
+
+  fc.mu = mu;
+  fc.gamma = gamma;
+  fc.upsilon = upsilon;
+
+  for (int itype = 0; itype < tp1; itype++) {
+    // per type-pair constants
+    for (int jtype = 0; jtype < tp1; jtype++) {
+      FC_PACKED1_T &p1 = fc.ijc[itype][jtype];
+      p1.lj1 = lj1[itype][jtype];
+      p1.lj2 = lj2[itype][jtype];
+      p1.cutsq = cutsq[itype][jtype];
+      p1.offset = offset[itype][jtype];
+      p1.sigma = sigma[itype][jtype];
+      p1.epsilon = epsilon[itype][jtype];
+      p1.form = form[itype][jtype];
+      p1.lshape = lshape[itype] * lshape[jtype];
+
+      FC_PACKED2_T &p2 = fc.lj34[itype][jtype];
+      p2.lj3 = lj3[itype][jtype];
+      p2.lj4 = lj4[itype][jtype];
+    }
+
+    // per type constants
+    // NOTE(review): four entries are copied from shape2[itype] and
+    // well[itype]; confirm those rows really hold four values
+    FC_PACKED3_T &p3 = fc.ic[itype];
+    for (int k = 0; k < 4; k++) {
+      p3.shape2[k] = shape2[itype][k];
+      p3.well[k] = well[itype][k];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  FC_PACKED1_T *oijc = fc.ijc[0];
+  FC_PACKED2_T *olj34 = fc.lj34[0];
+  FC_PACKED3_T *oic = fc.ic;
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  if (oijc != NULL && oic != NULL) {
+    #pragma offload_transfer target(mic:_cop) \
+      in(special_lj: length(4) alloc_if(0) free_if(0)) \
+      in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
+      in(oic: length(tp1) alloc_if(0) free_if(0)) \
+      in(ocutneighsq: length(tp1sq))
+  }
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   ForceConst::set_ntypes: (re)allocate the packed coefficient tables and
+   the per-thread scratch arrays when the number of types changes
+   ntypes     = row/column count of the per-type tables (0 frees all)
+   one_length = entries per row of the *_form scratch arrays
+   nthreads   = number of rows of the *_form scratch arrays
+   memory     = Memory instance used for the new allocations
+   cop        = coprocessor index for offload builds, negative for none
+   NOTE(review): reallocation is keyed on ntypes only; a change of
+   one_length or nthreads alone does not resize the scratch arrays
+------------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                      const int one_length,
+                                                      const int nthreads,
+                                                      Memory *memory,
+                                                      const int cop) {
+  if (ntypes != _ntypes) {
+    // release the previous allocation, device copies first
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      fc_packed1 *oijc = ijc[0];
+      fc_packed2 *olj34 = lj34[0];
+      fc_packed3 *oic = ic;
+      flt_t * orsq_form = rsq_form[0];
+      flt_t * odelx_form = delx_form[0];
+      flt_t * odely_form = dely_form[0];
+      flt_t * odelz_form = delz_form[0];
+      int * ojtype_form = jtype_form[0];
+      int * ojlist_form = jlist_form[0];
+
+      if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
+          orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
+          odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
+          _cop >= 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
+          nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
+          nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
+      }
+      #endif
+
+      // destroy the members themselves (not local aliases) so that each
+      // one is reset by Memory::destroy and none is left dangling
+      _memory->destroy(ic);
+      _memory->destroy(ijc);
+      _memory->destroy(lj34);
+      _memory->destroy(rsq_form);
+      _memory->destroy(delx_form);
+      _memory->destroy(dely_form);
+      _memory->destroy(delz_form);
+      _memory->destroy(jtype_form);
+      _memory->destroy(jlist_form);
+    }
+
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(ijc, ntypes, ntypes, "fc.ijc");
+      memory->create(lj34, ntypes, ntypes, "fc.lj34");
+      memory->create(ic, ntypes, "fc.ic");
+      memory->create(rsq_form, nthreads, one_length, "rsq_form");
+      memory->create(delx_form, nthreads, one_length, "delx_form");
+      memory->create(dely_form, nthreads, one_length, "dely_form");
+      memory->create(delz_form, nthreads, one_length, "delz_form");
+      memory->create(jtype_form, nthreads, one_length, "jtype_form");
+      memory->create(jlist_form, nthreads, one_length, "jlist_form");
+
+      // give every scratch entry a defined starting value
+      for (int zn = 0; zn < nthreads; zn++)
+        for (int zo = 0; zo < one_length; zo++) {
+          rsq_form[zn][zo] = 10.0;
+          delx_form[zn][zo] = 10.0;
+          dely_form[zn][zo] = 10.0;
+          delz_form[zn][zo] = 10.0;
+          jtype_form[zn][zo] = 1;
+          jlist_form[zn][zo] = 0;
+        }
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      fc_packed1 *oijc = ijc[0];
+      fc_packed2 *olj34 = lj34[0];
+      fc_packed3 *oic = ic;
+      flt_t * orsq_form = rsq_form[0];
+      flt_t * odelx_form = delx_form[0];
+      flt_t * odely_form = dely_form[0];
+      flt_t * odelz_form = delz_form[0];
+      int * ojtype_form = jtype_form[0];
+      int * ojlist_form = jlist_form[0];
+      int off_onel = one_length * nthreads;
+
+      int tp1sq = ntypes*ntypes;
+      if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
+          oic != NULL && orsq_form != NULL && odelx_form != NULL &&
+          odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
+          ojlist_form !=NULL && cop >= 0) {
+        // coefficient tables are only allocated here (nocopy); the
+        // initialised scratch arrays are allocated and copied (in)
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
+          in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
+          in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
+          in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
+          in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
+          in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
+          in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h
new file mode 100644
index 000000000..eb055e151
--- /dev/null
+++ b/src/USER-INTEL/pair_gayberne_intel.h
@@ -0,0 +1,99 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gayberne/intel,PairGayBerneIntel)
+
+#else
+
+#ifndef LMP_PAIR_GAYBERNE_INTEL_H
+#define LMP_PAIR_GAYBERNE_INTEL_H
+
+#include "pair_gayberne.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+// Variant of PairGayBerne for the USER-INTEL package: compute() and
+// init_style() are overridden and the coefficients are kept in packed,
+// precision-specific ForceConst tables.
+class PairGayBerneIntel : public PairGayBerne {
+
+ public:
+  PairGayBerneIntel(class LAMMPS *);
+
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  template <class flt_t> class ForceConst;
+
+  // precision-specific driver called by compute(int, int)
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  // force kernel for atoms astart <= i < aend
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  // fill fc from the coefficients of the parent pair style
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // Packed coefficient and scratch storage for one floating-point type.
+  template <class flt_t>
+  class ForceConst {
+   public:
+    // per type-pair coefficients
+    typedef struct {
+      flt_t cutsq, lj1, lj2, offset, sigma, epsilon, lshape;
+      int form;
+    } fc_packed1;
+    typedef struct { flt_t lj3, lj4; } fc_packed2;
+    // per-type coefficients
+    typedef struct { flt_t shape2[4], well[4]; } fc_packed3;
+
+    __declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu;
+    fc_packed1 **ijc;
+    fc_packed2 **lj34;
+    fc_packed3 *ic;
+
+    // scratch arrays sized nthreads x one_length by set_ntypes()
+    flt_t **rsq_form, **delx_form, **dely_form, **delz_form;
+    int **jtype_form, **jlist_form;
+
+    // Start with no tables, no coprocessor (negative index) and no Memory
+    // instance, so the destructor's set_ntypes() call never reads an
+    // indeterminate _cop and no pointer member is left uninitialized.
+    ForceConst() : ijc(NULL), lj34(NULL), ic(NULL),
+                   rsq_form(NULL), delx_form(NULL), dely_form(NULL),
+                   delz_form(NULL), jtype_form(NULL), jlist_form(NULL),
+                   _ntypes(0), _cop(-1), _memory(NULL) {}
+    // ntypes = 0 releases whatever set_ntypes() allocated earlier
+    ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, const int one_length,
+                    const int nthreads, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+  int _max_nbors;
+
+  double gayberne_lj(const int i, const int j, double a1[3][3],
+                     double b1[3][3], double g1[3][3], double *r12,
+                     const double rsq, double *fforce, double *ttor);
+
+  FixIntel *fix;
+  int _cop;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
new file mode 100644
index 000000000..576d5b21c
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -0,0 +1,675 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_charmm_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+/* ----------------------------------------------------------------------
+   constructor: mark the style with the INTEL suffix, disable the rRESPA
+   support of the parent class and give the members that init_style()
+   fills in later a defined initial value
+------------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongIntel::PairLJCharmmCoulLongIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulLong(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;
+
+  // init_style() assigns fix always but _cop only in offload builds, while
+  // pack_force_const() passes _cop on unconditionally; a negative value
+  // is what the offload code treats as "no coprocessor"
+  fix = NULL;
+  _cop = -1;
+}
+
+/* ----------------------------------------------------------------------
+   destructor: empty body, nothing is released here
+------------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongIntel::~PairLJCharmmCoulLongIntel() {}
+
+/* ----------------------------------------------------------------------
+   compute: forward to the templated compute() matching the precision
+   mode reported by the fix, then call fix->balance_stamp() and clear
+   vflag_fdotr
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag)
+{
+  const int prec = fix->precision();
+
+  if (prec == FixIntel::PREC_MODE_DOUBLE) {
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else if (prec == FixIntel::PREC_MODE_MIXED) {
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  } else {
+    // any other mode is handled in single precision
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+/* ----------------------------------------------------------------------
+   precision-specific compute: set up energy/virial flags, repack the
+   atom data when required and run eval() with the template flags that
+   match eflag, the virial flags and force->newton_pair
+   every case calls eval() twice: with offload=1 for atoms
+   [0, offload_end) and with offload=0 for atoms [host_start, inum)
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
+                                        IntelBuffers<flt_t,acc_t> *buffers,
+                                        const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // threaded repack, skipped on steps where ago == 0 and whenever the fix
+  // reports separate buffers
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
+                                nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  const int newton = force->newton_pair;
+
+  // no energy/virial tally requested
+  if (!(evflag || vflag_fdotr)) {
+    if (newton) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+    return;
+  }
+
+  // virial mode handed to eval: 2 when vflag_fdotr is set, else 1 when
+  // vflag is set, else 0
+  const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
+
+  if (eflag && newton) {
+    eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (eflag) {
+    eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (newton) {
+    eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else {
+    eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   eval: force kernel for the atoms astart <= i < aend
+   EVFLAG      = compile in the energy/virial tally code
+   EFLAG       = compile in the energy accumulation
+   NEWTON_PAIR = apply the pair force to every neighbor j; otherwise only
+                 to neighbors with j < nlocal
+   offload     = selects the buffer set and, in offload builds, whether
+                 the main block runs under the offload pragma
+   vflag       = virial mode chosen by the caller
+   the results are handed to fix->add_result_array() at the end
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
+				     IntelBuffers<flt_t,acc_t> *buffers,
+				     const ForceConst<flt_t> &fc,
+				     const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return; // empty atom range: nothing to compute
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+  // position/type (x[i].w is used as the type) and charge arrays
+  ATOM_T * restrict const x = buffers->get_x(offload);
+  flt_t * restrict const q = buffers->get_q(offload);
+  // neighbor data: the list of atom i starts at firstneigh + cnumneigh[i]
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+  // constants and tables filled by pack_force_const()
+  const flt_t * restrict const special_coul = fc.special_coul;
+  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
+
+  const flt_t * restrict const cutsq = fc.cutsq[0];
+  const LJ_T * restrict const lj = fc.lj[0];
+  const TABLE_T * restrict const table = fc.table;
+  const flt_t * restrict const etable = fc.etable;
+  const flt_t * restrict const detable = fc.detable;
+  const flt_t * restrict const ctable = fc.ctable;
+  const flt_t * restrict const dctable = fc.dctable;
+  const flt_t cut_ljsq = fc.cut_ljsq;
+  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
+  const flt_t cut_coulsq = fc.cut_coulsq;
+  const flt_t g_ewald = fc.g_ewald;
+  const flt_t tabinnersq = fc.tabinnersq;
+
+  const int ntypes = atom->ntypes + 1; // row length of the flattened per-type tables
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+		       
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+  // Redeclare as local variables for offload
+  const int ncoultablebits = this->ncoultablebits;
+  const int ncoulmask = this->ncoulmask;
+  const int ncoulshiftbits = this->ncoulshiftbits;
+  #ifdef INTEL_ALLOW_TABLE
+  #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
+                    in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
+                    in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
+  #else
+  #define ITABLE_IN
+  #endif
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(nthreads,qqrd2e,g_ewald,inum,nall,ntypes,cut_coulsq,vflag,eatom) \
+    in(f_stride,separate_flag,offload) \
+    in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    ITABLE_IN signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
+			      f_stride, x, q);
+
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = oecoul = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+      // per-thread force slice of f_start (offset tid * f_stride), zeroed below
+      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      flt_t cutboth = cut_coulsq; // NOTE(review): not referenced again in this function
+
+      for (int i = iifrom; i < iito; ++i) {
+	//        const int i = ilist[ii];
+        const int itype = x[i].w;
+        // rows of the flattened per-type tables that belong to itype
+        const int ptr_off = itype * ntypes;
+        const flt_t * restrict const cutsqi = cutsq + ptr_off;
+        const LJ_T * restrict const lji = lj + ptr_off;
+
+        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp;
+	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EVFLAG) {
+	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+	}
+        // vectorised loop over the neighbors of atom i
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcecoul, forcelj, evdwl, ecoul;
+          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
+          // each list entry packs a 2-bit special index (at bit SBBITS) and the atom index
+          const int sbindex = jlist[jj] >> SBBITS & 3;
+          const int j = jlist[jj] & NEIGHMASK;
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+          // Coulomb part; the cutoff test is a branch only when compiled for __MIC__
+	  #ifdef __MIC__
+	  if (rsq < cut_coulsq) {
+          #endif
+            #ifdef INTEL_ALLOW_TABLE
+            if (!ncoultablebits || rsq <= tabinnersq) {
+            #endif
+              const flt_t A1 =  0.254829592;
+              const flt_t A2 = -0.284496736;
+              const flt_t A3 =  1.421413741;
+              const flt_t A4 = -1.453152027;
+              const flt_t A5 =  1.061405429;
+              const flt_t EWALD_F = 1.12837917;
+              const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+
+              const flt_t r = sqrt(rsq);
+              const flt_t grij = g_ewald * r;
+              const flt_t expm2 = exp(-grij * grij);
+              const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+              const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; // polynomial in t times exp(-grij^2)
+              const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+              forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+              if (EFLAG) ecoul = prefactor * erfc;
+              if (sbindex) { // nonzero special index: subtract the excluded fraction
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            #ifdef INTEL_ALLOW_TABLE
+            } else { // tabulated branch: linear interpolation within table entry itable
+              float rsq_lookup = rsq;
+              const int itable = (__intel_castf32_u32(rsq_lookup) &
+                  ncoulmask) >> ncoulshiftbits;
+              const flt_t fraction = (rsq_lookup - table[itable].r) *
+                  table[itable].dr;
+
+              const flt_t tablet = table[itable].f +
+                  fraction * table[itable].df;
+              forcecoul = qtmp * q[j] * tablet;
+              if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+                  fraction * detable[itable]);
+              if (sbindex) {
+                const flt_t table2 = ctable[itable] +
+                    fraction * dctable[itable];
+                const flt_t prefactor = qtmp * q[j] * table2;
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            }
+            #endif
+	  #ifdef __MIC__
+	  }
+	  #endif
+          // LJ part, switched between cut_lj_innersq and cut_ljsq
+	  #ifdef __MIC__
+	  if (rsq < cut_ljsq) {
+	  #endif
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
+            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
+
+	    #ifdef __MIC__
+	    if (rsq > cut_lj_innersq) {
+	    #endif
+              const flt_t drsq = cut_ljsq - rsq;
+              const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
+              const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
+                  inv_denom_lj;
+              const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
+              if (EFLAG) {
+		#ifndef __MIC__
+		if (rsq > cut_lj_innersq) {
+		#endif
+                  forcelj = forcelj * switch1 + evdwl * switch2;
+                  evdwl *= switch1;
+		#ifndef __MIC__
+		}
+		#endif
+              } else {
+                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
+                    lji[jtype].w);
+		#ifndef __MIC__
+		if (rsq > cut_lj_innersq)
+		#endif
+                  forcelj =  forcelj * switch1 + philj * switch2;
+              }
+	    #ifdef __MIC__
+	    }
+	    #endif
+
+            if (sbindex) {
+              const flt_t factor_lj = special_lj[sbindex];
+              forcelj *= factor_lj;
+              if (EFLAG) evdwl *= factor_lj;
+            }
+	  #ifdef __MIC__
+	  }
+	  #else // non-MIC: both parts were computed unconditionally, zero them beyond their cutoffs
+	  if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
+	  if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+	  #endif
+          // apply the pair force and tallies (guarded by the cutoff only on __MIC__)
+	  #ifdef __MIC__
+	  if (rsq < cut_coulsq) {
+	  #endif
+            const flt_t fpair = (forcecoul + forcelj) * r2inv;
+            fxtmp += delx * fpair;
+            fytmp += dely * fpair;
+            fztmp += delz * fpair;
+            if (NEWTON_PAIR || j < nlocal) {
+              f[j].x -= delx * fpair;
+              f[j].y -= dely * fpair;
+              f[j].z -= delz * fpair;
+            }
+
+            if (EVFLAG) {
+              flt_t ev_pre = (flt_t)0; // 0.5 for each of i and j that passes the NEWTON_PAIR / nlocal test
+              if (NEWTON_PAIR || i < nlocal)
+                ev_pre += (flt_t)0.5;
+              if (NEWTON_PAIR || j < nlocal)
+                ev_pre += (flt_t)0.5;
+
+              if (EFLAG) {
+                sevdwl += ev_pre * evdwl;
+                secoul += ev_pre * ecoul;
+                if (eatom) { // per-atom energy is carried in the w component of the force entries
+                  if (NEWTON_PAIR || i < nlocal)
+                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                  if (NEWTON_PAIR || j < nlocal) 
+                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                }
+              }
+
+	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
+				   delx, dely, delz);
+            }
+	  #ifdef __MIC__
+	  }
+	  #endif
+        } // for jj
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+
+	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+      // presumably the macro below combines the per-thread force slices - confirm in its definition
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			     nlocal, minlocal, nthreads, f_start, f_stride, 
+			     x);
+    } // end of omp parallel region
+    if (EVFLAG) {
+      if (EFLAG) { // slots 0-1: the two energy sums
+        ev_global[0] = oevdwl;
+        ev_global[1] = oecoul;
+      }
+      if (vflag) { // slots 2-7: the six virial sums
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+  // hand the forces (and, when EVFLAG, the energy/virial sums) to the fix
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ----------------------------------------------------------------------
+   init_style: run the parent initialisation, set the intel flag on the
+   most recent neighbor request, look up the "package_intel" fix and pack
+   the coefficients for the active precision mode
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongIntel::init_style()
+{
+  PairLJCharmmCoulLong::init_style();
+
+  // the flag goes on the last entry of neighbor->requests
+  const int last_request = neighbor->nrequest - 1;
+  neighbor->requests[last_request]->intel = 1;
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) {
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  }
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  fix->set_offload_affinity();
+  _cop = fix->coprocessor_number();
+  #endif
+
+  // free the neighbor buffers of the active buffer set, then repack
+  const int prec = fix->precision();
+  if (prec == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else if (prec == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack_force_const: size the ForceConst storage, recompute the cutoffs
+   and copy cutoffs, special factors, LJ coefficients and (when
+   ncoultablebits is nonzero) the Coulomb tables into fc; in offload
+   builds the packed data is then transferred to the coprocessor
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int tp1 = atom->ntypes + 1;
+
+  // table length is 2^ncoultablebits, or 1 when tabulation is off
+  int ntable = 1;
+  for (int bit = 0; bit < ncoultablebits; bit++) ntable *= 2;
+
+  fc.set_ntypes(tp1, ntable, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+	 "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
+
+  // Repeat cutsq calculation because done after call to init_style
+  const int ntypes = atom->ntypes;
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) {
+      const bool both_diagonals_set = setflag[i][i] != 0 && setflag[j][j] != 0;
+      if (setflag[i][j] == 0 && !both_diagonals_set) continue;
+
+      const double cut = init_one(i, j);
+      const double cutneigh = cut + neighbor->skin;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
+      cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+    }
+  }
+
+  // squared cutoffs, stored in the pair style and mirrored into fc
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.g_ewald = force->kspace->g_ewald;
+  fc.tabinnersq = tabinnersq;
+  fc.cut_coulsq = cut_coulsq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_lj_innersq = cut_lj_innersq;
+
+  // special factors; index 0 ends up as 1.0 for both tables
+  for (int k = 0; k < 4; k++) {
+    fc.special_lj[k] = force->special_lj[k];
+    fc.special_coul[k] = force->special_coul[k];
+  }
+  fc.special_coul[0] = 1.0;
+  fc.special_lj[0] = 1.0;
+
+  // lj1..lj4 go into the x,y,z,w components of one packed entry
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      LJ_T &coeff = fc.lj[i][j];
+      coeff.x = lj1[i][j];
+      coeff.y = lj2[i][j];
+      coeff.z = lj3[i][j];
+      coeff.w = lj4[i][j];
+      fc.cutsq[i][j] = cutsq[i][j];
+    }
+  }
+
+  if (ncoultablebits) {
+    for (int k = 0; k < ntable; k++) {
+      TABLE_T &entry = fc.table[k];
+      entry.r = rtable[k];
+      entry.dr = drtable[k];
+      entry.f = ftable[k];
+      entry.df = dftable[k];
+      fc.etable[k] = etable[k];
+      fc.detable[k] = detable[k];
+      fc.ctable[k] = ctable[k];
+      fc.dctable[k] = dctable[k];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  // local aliases of the packed arrays for the transfer clauses below
+  // (they shadow the pair style members of the same name from here on)
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  TABLE_T * table = fc.table;
+  flt_t * etable = fc.etable;
+  flt_t * detable = fc.detable;
+  flt_t * ctable = fc.ctable;
+  flt_t * dctable = fc.dctable;
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(table: length(ntable) alloc_if(0) free_if(0)) \
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   ForceConst::set_ntypes: (re)allocate the per-type and Coulomb tables
+   whenever ntypes or ntable differs from the stored sizes
+   ntypes = row/column count of cutsq and lj (0 releases everything)
+   ntable = length of table, etable, detable, ctable and dctable
+   memory = Memory instance used for the new allocations
+   cop    = coprocessor index for offload builds, negative for none
+------------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                              const int ntable,
+                                                              Memory *memory,
+                                                              const int cop) {
+  if ( (ntypes != _ntypes || ntable != _ntable) ) {
+    // release the previous allocation, device copies first
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      table_t * otable = table;
+      flt_t * oetable = etable;
+      flt_t * odetable = detable;
+      flt_t * octable = ctable;
+      flt_t * odctable = dctable;
+      // free on _cop, the coprocessor recorded when these buffers were
+      // allocated, rather than on the incoming cop argument
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          otable != NULL && oetable != NULL && odetable != NULL &&
+          octable != NULL && odctable != NULL && ospecial_coul != NULL &&
+          _cop >= 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+          nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
+          nocopy(otable: alloc_if(0) free_if(1)) \
+          nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+      }
+      #endif
+
+      _memory->destroy(cutsq);
+      _memory->destroy(lj);
+      _memory->destroy(table);
+      _memory->destroy(etable);
+      _memory->destroy(detable);
+      _memory->destroy(ctable);
+      _memory->destroy(dctable);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+      memory->create(lj,ntypes,ntypes,"fc.lj");
+      memory->create(table,ntable,"pair:fc.table");
+      memory->create(etable,ntable,"pair:fc.etable");
+      memory->create(detable,ntable,"pair:fc.detable");
+      memory->create(ctable,ntable,"pair:fc.ctable");
+      memory->create(dctable,ntable,"pair:fc.dctable");
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      table_t * otable = table;
+      flt_t * oetable = etable;
+      flt_t * odetable = detable;
+      flt_t * octable = ctable;
+      flt_t * odctable = dctable;
+      int tp1sq = ntypes*ntypes;
+      // allocate (without copying) persistent device buffers of the new
+      // sizes; the contents are sent later by pack_force_const()
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          otable !=NULL && oetable != NULL && odetable != NULL &&
+          octable != NULL && odctable != NULL && ospecial_coul != NULL &&
+          cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
+          nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
+          nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;
+  _ntable=ntable;
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
new file mode 100644
index 000000000..ad66c786b
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@@ -0,0 +1,104 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/long/intel,PairLJCharmmCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_LONG_INTEL_H
+
+#include "pair_lj_charmm_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
+  // Intel-package version of PairLJCharmmCoulLong (lj/charmm/coul/long/intel)
+ public:
+  PairLJCharmmCoulLongIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulLongIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;
+  int _cop;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+	    IntelBuffers<flt_t,acc_t> * buffers,
+	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ForceConst: cutoffs, LJ coefficients and Coulomb tables held as flt_t
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t r, dr, f, df; } table_t;
+    __declspec(align(64)) flt_t special_coul[4];
+    __declspec(align(64)) flt_t special_lj[4];
+    flt_t **cutsq, g_ewald, tabinnersq;
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_lj_innersq;
+    table_t *table;
+    flt_t *etable, *detable, *ctable, *dctable;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
+    // the destructor reads _cop even if set_ntypes() never ran, so give every
+    // private member a defined value; set_ntypes() only offloads for cop >= 0
+    ForceConst() : _ntypes(0), _ntable(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, const int ntable, Memory *memory,
+		    const int cop);
+
+   private:
+    int _ntypes, _ntable, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic
+
+The intel accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
new file mode 100644
index 000000000..4163a1f7d
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -0,0 +1,634 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
+#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+/* constructor: sets the INTEL suffix flag and clears the rRESPA members;
+   fix and _cop get defined values because init_style() assigns them later */
+PairLJCutCoulLongIntel::PairLJCutCoulLongIntel(LAMMPS *lmp) :
+  PairLJCutCoulLong(lmp), fix(NULL), _cop(-1)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;  // _cop stays -1 unless init_style() sets it (offload builds)
+}
+
+/* destructor: empty body, no cleanup is performed at this level */
+
+PairLJCutCoulLongIntel::~PairLJCutCoulLongIntel()
+{
+}
+
+/* compute(): forward to the templated compute for the precision mode that
+   the intel fix reports, then call balance_stamp() and clear vflag_fdotr */
+void PairLJCutCoulLongIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  } else {
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
+				     IntelBuffers<flt_t,acc_t> *buffers,
+				     const ForceConst<flt_t> &fc)
+{
+  // run ev_setup when anything is to be tallied, otherwise clear the flags
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+  // pack here when ago is nonzero and the fix is not using separate buffers
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+				nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // every case below calls eval twice: offload=1 over [0,offload_end) and
+  // offload=0 over [host_start,inum); template args are EVFLAG,EFLAG,NEWTON
+  const int newton = force->newton_pair;
+  if (!(evflag || vflag_fdotr)) {
+    // nothing to tally: EVFLAG=0 kernels, vflag argument fixed at 0
+    if (newton) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+    return;
+  }
+
+  // ovflag: 2 if vflag_fdotr is set, else 1 if vflag is set, else 0
+  const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
+  if (eflag && newton) {
+    eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (eflag) {
+    eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (newton) {
+    eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else {
+    eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+  }
+}
+
+/* eval(): LJ + long-range Coulomb kernel for list entries [astart,aend);
+   under _LMP_INTEL_OFFLOAD the body is offloaded to mic:_cop if(offload) */
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
+				     IntelBuffers<flt_t,acc_t> *buffers,
+				     const ForceConst<flt_t> &fc,
+				     const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;  // empty range: nothing to compute
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+  // packed atom data; x[i].w is read as the atom type further down
+  ATOM_T * restrict const x = buffers->get_x(offload);
+  flt_t * restrict const q = buffers->get_q(offload);
+
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+
+  const flt_t * restrict const special_coul = fc.special_coul;
+  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  // per-type-pair coefficients and Coulomb tables filled by pack_force_const
+  const C_FORCE_T * restrict const c_force = fc.c_force[0];
+  const C_ENERGY_T * restrict const c_energy = fc.c_energy[0];
+  const TABLE_T * restrict const table = fc.table;
+  const flt_t * restrict const etable = fc.etable;
+  const flt_t * restrict const detable = fc.detable;
+  const flt_t * restrict const ctable = fc.ctable;
+  const flt_t * restrict const dctable = fc.dctable;
+  const flt_t g_ewald = fc.g_ewald;
+  const flt_t tabinnersq = fc.tabinnersq;
+  // ntypes is the row stride used to index the flattened c_force/c_energy
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+  // Redeclare as local variables for offload
+  const int ncoultablebits = this->ncoultablebits;
+  const int ncoulmask = this->ncoulmask;
+  const int ncoulshiftbits = this->ncoulshiftbits;
+  #ifdef INTEL_ALLOW_TABLE
+  #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
+                    in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
+                    in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
+  #else
+  #define ITABLE_IN
+  #endif
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
+    in(f_stride,nlocal,minlocal,separate_flag,offload) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    ITABLE_IN signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
+			      f_stride, x, q);
+    // totals over all threads, combined by the omp reduction clause below
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = oecoul = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal)	\
+      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+      // thread tid writes into its own f_stride-sized slice, zeroed first
+      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      for (int i = iifrom; i < iito; ++i) {
+        const int itype = x[i].w;
+        // coefficient rows for this atom's type
+        const int ptr_off = itype * ntypes;
+        const C_FORCE_T * restrict const c_forcei = c_force + ptr_off;
+        const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off;
+
+        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp;
+	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EVFLAG) {
+	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+	}
+
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcecoul, forcelj, evdwl, ecoul;
+          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
+
+          const int sbindex = jlist[jj] >> SBBITS & 3;
+          const int j = jlist[jj] & NEIGHMASK;
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+          // Coulomb: cutoff branch under __MIC__, else computed and zeroed later
+	  #ifdef __MIC__
+	  if (rsq < c_forcei[jtype].cutsq) {
+          #endif
+            #ifdef INTEL_ALLOW_TABLE
+            if (!ncoultablebits || rsq <= tabinnersq) {
+            #endif
+              const flt_t A1 =  0.254829592;
+              const flt_t A2 = -0.284496736;
+              const flt_t A3 =  1.421413741;
+              const flt_t A4 = -1.453152027;
+              const flt_t A5 =  1.061405429;
+              const flt_t EWALD_F = 1.12837917;
+              const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+              // erfc value from a 5-term polynomial in t times exp(-grij^2)
+              const flt_t r = sqrt(rsq);
+              const flt_t grij = g_ewald * r;
+              const flt_t expm2 = exp(-grij * grij);
+              const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+              const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+              const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+              forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+              if (EFLAG) ecoul = prefactor * erfc;
+              if (sbindex) {
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            #ifdef INTEL_ALLOW_TABLE
+            } else {
+              float rsq_lookup = rsq;  // table index comes from the float bits
+              const int itable = (__intel_castf32_u32(rsq_lookup) &
+                  ncoulmask) >> ncoulshiftbits;
+              const flt_t fraction = (rsq_lookup - table[itable].r) *
+                  table[itable].dr;
+              // linear interpolation within table bin itable
+              const flt_t tablet = table[itable].f +
+                  fraction * table[itable].df;
+              forcecoul = qtmp * q[j] * tablet;
+              if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+                  fraction * detable[itable]);
+              if (sbindex) {
+                const flt_t table2 = ctable[itable] +
+                    fraction * dctable[itable];
+                const flt_t prefactor = qtmp * q[j] * table2;
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            }
+            #endif
+	  #ifdef __MIC__
+	  }
+	  #endif
+          // Lennard-Jones: same cutoff treatment, using cut_ljsq
+	  #ifdef __MIC__
+	  if (rsq < c_forcei[jtype].cut_ljsq) {
+	  #endif
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
+			       c_forcei[jtype].lj2);
+            if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
+                                      c_energyi[jtype].lj4) -
+                               c_energyi[jtype].offset;
+
+            if (sbindex) {
+              const flt_t factor_lj = special_lj[sbindex];
+              forcelj *= factor_lj;
+              if (EFLAG) evdwl *= factor_lj;
+            }
+	  #ifdef __MIC__
+	  }
+	  #else
+	  if (rsq > c_forcei[jtype].cutsq)
+	    { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
+	  if (rsq > c_forcei[jtype].cut_ljsq)
+	    { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+	  #endif
+          // apply the pair force; f[j] changes only if NEWTON_PAIR or j < nlocal
+	  #ifdef __MIC__
+	  if (rsq < c_forcei[jtype].cutsq) {
+	  #endif
+            const flt_t fpair = (forcecoul + forcelj) * r2inv;
+            fxtmp += delx * fpair;
+            fytmp += dely * fpair;
+            fztmp += delz * fpair;
+            if (NEWTON_PAIR || j < nlocal) {
+              f[j].x -= delx * fpair;
+              f[j].y -= dely * fpair;
+              f[j].z -= delz * fpair;
+            }
+            // ev_pre gains 0.5 for each of i and j that passes the test below
+            if (EVFLAG) {
+              flt_t ev_pre = (flt_t)0;
+              if (NEWTON_PAIR || i < nlocal)
+                ev_pre += (flt_t)0.5;
+              if (NEWTON_PAIR || j < nlocal)
+                ev_pre += (flt_t)0.5;
+
+              if (EFLAG) {
+                sevdwl += ev_pre * evdwl;
+                secoul += ev_pre * ecoul;
+                if (eatom) {
+                  if (NEWTON_PAIR || i < nlocal)
+                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                  if (NEWTON_PAIR || j < nlocal) 
+                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                }
+              }
+ 	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
+            }
+          #ifdef __MIC__
+	  }
+	  #endif
+        } // for jj
+
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			     nlocal, minlocal, nthreads, f_start, f_stride, 
+			     x);
+    } // end of omp parallel region
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = oecoul;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+  // pass the force buffer (and the tallies when EVFLAG) back to the fix
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* init_style(): base-class setup, flag the last neighbor request as intel,
+   look up the package_intel fix and pack constants for its precision mode */
+void PairLJCutCoulLongIntel::init_style()
+{
+  PairLJCutCoulLong::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  fix->set_offload_affinity();
+  _cop = fix->coprocessor_number();
+  #endif
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+template <class flt_t, class acc_t>
+void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // size fc for ntypes+1 types and 2^ncoultablebits table rows, then fill it
+  int tp1 = atom->ntypes + 1;
+  int ntable = 1;
+  for (int bit = 0; bit < ncoultablebits; ++bit) ntable <<= 1;
+
+  fc.set_ntypes(tp1, ntable, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  for (int it = 1; it <= atom->ntypes; ++it) {
+    for (int jt = it; jt <= atom->ntypes; ++jt) {
+      const bool have_pair =
+        setflag[it][jt] != 0 || (setflag[it][it] != 0 && setflag[jt][jt] != 0);
+      if (!have_pair) continue;
+      const double cut = init_one(it, jt);
+      const double cutneigh = cut + neighbor->skin;
+      cutsq[it][jt] = cutsq[jt][it] = cut*cut;
+      cutneighsq[it][jt] = cutneighsq[jt][it] = cutneigh * cutneigh;
+    }
+  }
+
+  fc.g_ewald = force->kspace->g_ewald;
+  fc.tabinnersq = tabinnersq;
+
+  // slot 0 of both special arrays is fixed at 1.0; slots 1-3 come from force
+  fc.special_coul[0] = fc.special_lj[0] = 1.0;
+  for (int k = 1; k < 4; ++k) {
+    fc.special_lj[k] = force->special_lj[k];
+    fc.special_coul[k] = force->special_coul[k];
+  }
+
+  for (int it = 0; it < tp1; ++it) {
+    for (int jt = 0; jt < tp1; ++jt) {
+      fc.c_force[it][jt].cutsq = cutsq[it][jt];
+      fc.c_force[it][jt].cut_ljsq = cut_ljsq[it][jt];
+      fc.c_force[it][jt].lj1 = lj1[it][jt];
+      fc.c_force[it][jt].lj2 = lj2[it][jt];
+      fc.c_energy[it][jt].lj3 = lj3[it][jt];
+      fc.c_energy[it][jt].lj4 = lj4[it][jt];
+      fc.c_energy[it][jt].offset = offset[it][jt];
+    }
+  }
+
+  if (ncoultablebits) {
+    for (int row = 0; row < ntable; ++row) {
+      fc.table[row].r = rtable[row];
+      fc.table[row].dr = drtable[row];
+      fc.table[row].f = ftable[row];
+      fc.table[row].df = dftable[row];
+      fc.etable[row] = etable[row];
+      fc.detable[row] = detable[row];
+      fc.ctable[row] = ctable[row];
+      fc.dctable[row] = dctable[row];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;  // negative coprocessor number: skip the transfer
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  C_FORCE_T * c_force = fc.c_force[0];
+  C_ENERGY_T * c_energy = fc.c_energy[0];
+  TABLE_T * table = fc.table;
+  flt_t * etable = fc.etable;
+  flt_t * detable = fc.detable;
+  flt_t * ctable = fc.ctable;
+  flt_t * dctable = fc.dctable;
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(table: length(ntable) alloc_if(0) free_if(0)) \
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* set_ntypes(): when ntypes or ntable differ from the stored sizes, release
+   the old host/coprocessor arrays and allocate new ones (none if ntypes==0) */
+template <class flt_t>
+void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+							   const int ntable,
+							   Memory *memory,
+							   const int cop) {
+  if ( (ntypes != _ntypes || ntable != _ntable) ) {
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      c_force_t * oc_force = c_force[0];
+      c_energy_t * oc_energy = c_energy[0];
+      table_t * otable = table;
+      flt_t * oetable = etable;
+      flt_t * odetable = detable;
+      flt_t * octable = ctable;
+      flt_t * odctable = dctable;
+      if (ospecial_lj != NULL && oc_force != NULL &&
+          oc_energy != NULL && otable != NULL && oetable != NULL &&
+          odetable != NULL && octable != NULL && odctable != NULL &&
+          ospecial_coul != NULL && _cop >= 0) {  // free where it was allocated
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+	  nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
+	  nocopy(otable: alloc_if(0) free_if(1)) \
+	  nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+      }
+      #endif
+      // host arrays go back through the Memory object stored at allocation
+      _memory->destroy(c_force);
+      _memory->destroy(c_energy);
+      _memory->destroy(table);
+      _memory->destroy(etable);
+      _memory->destroy(detable);
+      _memory->destroy(ctable);
+      _memory->destroy(dctable);
+    }
+    if (ntypes > 0) {
+      _cop = cop;  // remembered so the later release targets the same device
+      memory->create(c_force,ntypes,ntypes,"fc.c_force");
+      memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
+      memory->create(table,ntable,"pair:fc.table");
+      memory->create(etable,ntable,"pair:fc.etable");
+      memory->create(detable,ntable,"pair:fc.detable");
+      memory->create(ctable,ntable,"pair:fc.ctable");
+      memory->create(dctable,ntable,"pair:fc.dctable");
+      // allocate matching coprocessor buffers only (nocopy, alloc_if(1))
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      c_force_t * oc_force = c_force[0];
+      c_energy_t * oc_energy = c_energy[0];
+      table_t * otable = table;
+      flt_t * oetable = etable;
+      flt_t * odetable = detable;
+      flt_t * octable = ctable;
+      flt_t * odctable = dctable;
+      int tp1sq = ntypes*ntypes;
+      if (ospecial_lj != NULL && oc_force != NULL &&
+          oc_energy != NULL && otable !=NULL && oetable != NULL &&
+          odetable != NULL && octable != NULL && odctable != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
+          nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
+          nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;
+  _ntable=ntable;
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
new file mode 100644
index 000000000..d7b4282a9
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/long/intel,PairLJCutCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_INTEL_H
+
+#include "pair_lj_cut_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
+  // Intel-package version of PairLJCutCoulLong (lj/cut/coul/long/intel)
+ public:
+  PairLJCutCoulLongIntel(class LAMMPS *);
+  virtual ~PairLJCutCoulLongIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;  // the package_intel fix, looked up in init_style()
+  int _cop;       // coprocessor number taken from the fix in offload builds
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+	    IntelBuffers<flt_t,acc_t> * buffers,
+	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ForceConst: LJ coefficients and Coulomb tables held as flt_t
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t;
+    typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t;
+    typedef struct { flt_t r, dr, f, df; } table_t;
+    __declspec(align(64)) flt_t special_coul[4];
+    __declspec(align(64)) flt_t special_lj[4];
+    flt_t g_ewald, tabinnersq;
+    c_force_t **c_force;
+    c_energy_t **c_energy;
+    table_t *table;
+    flt_t *etable, *detable, *ctable, *dctable;
+    // the destructor reads _cop even if set_ntypes() never allocated, so all
+    // private members start defined; set_ntypes() only offloads for cop >= 0
+    ForceConst() : _ntypes(0), _ntable(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, const int ntable, Memory *memory,
+		    const int cop);
+
+   private:
+    int _ntypes, _ntable, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
new file mode 100644
index 000000000..bca3a7349
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -0,0 +1,412 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
+#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the /intel variant of lj/cut: tag the style with the INTEL
+// suffix and switch off rRESPA support.
+PairLJCutIntel::PairLJCutIntel(LAMMPS *lmp) :
+  PairLJCut(lmp), fix(NULL), _cop(-1)
+{
+  // fix is assigned in init_style().  _cop is passed by value to
+  // ForceConst::set_ntypes() from pack_force_const() and is not assigned
+  // anywhere else in this file, so it needs a defined value up front.
+  // NOTE(review): -1 is a placeholder meaning "no coprocessor" - confirm.
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Entry point called by the integrator: forwards to the templated
+// compute() that matches the precision mode reported by the Intel fix.
+void PairLJCutIntel::compute(int eflag, int vflag)
+{
+  const int prec_mode = fix->precision();
+
+  if (prec_mode == FixIntel::PREC_MODE_MIXED) {
+    // float arithmetic, double accumulation
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  } else if (prec_mode == FixIntel::PREC_MODE_DOUBLE) {
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else {
+    // any other mode is treated as single precision
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+// Precision-specific driver.
+//   eflag, vflag : energy / virial request flags for this step
+//   buffers      : Intel buffers for this <flt_t,acc_t> combination
+//   fc           : packed force constants for flt_t
+// Packs atom data into the buffers when the neighbor list was not rebuilt
+// this step and separate buffers are not in use, then calls eval() twice
+// (offload range first, host range second) with the template flags that
+// match the energy/virial/newton settings.
+template <class flt_t, class acc_t>
+void PairLJCutIntel::compute(int eflag, int vflag,
+                             IntelBuffers<flt_t,acc_t> *buffers,
+                             const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag, vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // The enclosing test already guarantees ago != 0, so no second check
+  // is needed around the parallel pack.
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  if (evflag || vflag_fdotr) {
+    // ovflag passed to eval(): 2 = virial via f dot r, 1 = explicit
+    // virial tally, 0 = no virial
+    int ovflag = 0;
+    if (vflag_fdotr) ovflag = 2;
+    else if (vflag) ovflag = 1;
+    if (eflag) {
+      if (force->newton_pair) {
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+// Force kernel for the neighbor-list entries [astart, aend).
+//   EVFLAG      : non-zero when energy and/or virial must be accumulated
+//   EFLAG       : non-zero when energy must be accumulated
+//   NEWTON_PAIR : non-zero applies forces/tallies to atom j regardless of
+//                 whether j < nlocal
+//   offload     : selects the buffer set fetched from the fix and which
+//                 timer is stopped at the end
+//   vflag       : virial mode forwarded to the IP_PRE_* tally macros
+// Each thread accumulates into its own slice of the force buffer
+// (f_start + tid * f_stride); the slices are combined by
+// IP_PRE_fdotr_acc_force and the result is handed to
+// fix->add_result_array().
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCutIntel::eval(const int offload, const int vflag,
+                          IntelBuffers<flt_t,acc_t> *buffers,
+                          const ForceConst<flt_t> &fc,
+                          const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;   // empty range: nothing to compute
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * restrict const x = buffers->get_x(offload);
+
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const flt_t * restrict const special_lj = fc.special_lj;
+  // first element of each 2-D table, indexed below as itype*ntypes + jtype
+  const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0];
+  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+  int *overflow = fix->get_off_overflow_flag();
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
+			      f_stride, x, 0);
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      // per-thread force slice, shifted so it can be indexed by atom index
+      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      for (int i = iifrom; i < iito; ++i) {
+        const int itype = x[i].w;
+
+        const int ptr_off = itype * ntypes;
+        const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off;
+        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+
+        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EVFLAG) {
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        }
+
+        #pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcelj, evdwl;
+          forcelj = evdwl = (flt_t)0.0;
+
+          // high bits of the entry index special_lj; the masked value is
+          // the neighbor's atom index
+          const int sbindex = jlist[jj] >> SBBITS & 3;
+          const int j = jlist[jj] & NEIGHMASK;
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          // __MIC__ builds branch on the cutoff; other builds run the body
+          // for every neighbor and cancel out-of-range pairs by zeroing
+          // r6inv, which makes forcelj and fpair zero.
+          #ifdef __MIC__
+          if (rsq < ljc12oi[jtype].cutsq) {
+	  #endif
+            flt_t factor_lj = special_lj[sbindex];
+            flt_t r2inv = 1.0 / rsq;
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            #ifndef __MIC__
+	    if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
+	    #endif
+            forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
+            flt_t fpair = factor_lj * forcelj * r2inv;
+
+            fxtmp += delx * fpair;
+            fytmp += dely * fpair;
+            fztmp += delz * fpair;
+            if (NEWTON_PAIR || j < nlocal) {
+              f[j].x -= delx * fpair;
+              f[j].y -= dely * fpair;
+              f[j].z -= delz * fpair;
+            }
+
+            if (EVFLAG) {
+              // each atom of the pair that is tallied contributes one half
+              flt_t ev_pre = (flt_t)0;
+              if (NEWTON_PAIR || i<nlocal)
+                ev_pre += (flt_t)0.5;
+              if (NEWTON_PAIR || j<nlocal)
+                ev_pre += (flt_t)0.5;
+
+              if (EFLAG) {
+                evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4);
+                // Apply the energy shift only to pairs that actually
+                // interact.  On non-MIC builds out-of-range pairs reach
+                // this point with r6inv == 0; subtracting the offset
+                // unconditionally would add -offset*factor_lj to the
+                // energy for every such neighbor.  The test mirrors the
+                // r6inv zeroing condition above.
+                #ifdef __MIC__
+                evdwl -= ljc12oi[jtype].offset;
+                #else
+                if (rsq <= ljc12oi[jtype].cutsq)
+                  evdwl -= ljc12oi[jtype].offset;
+                #endif
+                evdwl *= factor_lj;
+                sevdwl += ev_pre*evdwl;
+                if (eatom) {
+                  if (NEWTON_PAIR || i < nlocal)
+                    fwtmp += 0.5 * evdwl;
+                  if (NEWTON_PAIR || j < nlocal)
+                    f[j].w += 0.5 * evdwl;
+                }
+              }
+
+	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
+				   delx, dely, delz);
+            }
+          #ifdef __MIC__
+          } // if rsq
+          #endif
+        } // for jj
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+        
+	IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+
+      // all threads must finish their slices before they are combined
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			     nlocal, minlocal, nthreads, f_start, f_stride, 
+			     x);
+    } // end omp
+    if (EVFLAG) {
+      // ev_global layout used here: [0] vdW energy, [1] zeroed,
+      // [2..7] six virial components
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+	ev_global[1] = (acc_t)0.0;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  // NOTE(review): the matching start_watch() is not visible in this
+  // function; presumably issued inside one of the IP_PRE_* macros - confirm.
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Style initialisation: runs the base-class setup, marks the most recent
+// neighbor request as an Intel request, locates the Intel fix, and packs
+// the force constants for the active precision mode.
+void PairLJCutIntel::init_style()
+{
+  PairLJCut::init_style();
+
+  // presumably the request just added by PairLJCut::init_style() - the
+  // last entry in the request list is the one flagged
+  const int last_request = neighbor->nrequest - 1;
+  neighbor->requests[last_request]->intel = 1;
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) {
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  }
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_balance() != 0.0) {
+    error->all(FLERR,
+          "Offload for lj/cut/intel is not yet available. Set balance to 0.");
+  }
+  #endif
+
+  // Release neighbor buffers and repack constants for whichever precision
+  // the fix reports; anything other than mixed/double is single.
+  const int prec_mode = fix->precision();
+  if (prec_mode == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else if (prec_mode == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Copy the pair coefficients into the packed, precision-specific tables.
+//   fc      : destination constants, resized to (ntypes+1) x (ntypes+1)
+//   buffers : Intel buffers whose cutneighsq table is filled in as well
+template <class flt_t, class acc_t>
+void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                      IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int tp1 = atom->ntypes + 1;
+  fc.set_ntypes(tp1,memory,_cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  // Entry 0 (no special-bond flag) is always unity; it is set once after
+  // the copy rather than on every loop iteration.
+  for (int i = 0; i < 4; i++)
+    fc.special_lj[i] = force->special_lj[i];
+  fc.special_lj[0] = 1.0;
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.ljc12o[i][j].lj1 = lj1[i][j];
+      fc.ljc12o[i][j].lj2 = lj2[i][j];
+      fc.lj34[i][j].lj3 = lj3[i][j];
+      fc.lj34[i][j].lj4 = lj4[i][j];
+      fc.ljc12o[i][j].cutsq = cutsq[i][j];
+      fc.ljc12o[i][j].offset = offset[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Resize the packed coefficient tables to ntypes x ntypes.
+//   ntypes : new table dimension; 0 releases the tables
+//   memory : allocator used for the new tables and remembered for the
+//            next release
+//   cop    : coprocessor id, stored when tables are allocated
+// Nothing is reallocated when ntypes equals the current size.
+template <class flt_t>
+void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                   Memory *memory,
+                                                   const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      // Release through the 2-D handles so that both the data block and
+      // the row-pointer array from create() are freed and the members are
+      // reset; destroying only element [0] leaked the row pointers and
+      // left ljc12o/lj34 dangling.
+      _memory->destroy(ljc12o);
+      _memory->destroy(lj34);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(ljc12o,ntypes,ntypes,"fc.c12o");
+      memory->create(lj34,ntypes,ntypes,"fc.lj34");
+    }
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h
new file mode 100644
index 000000000..a40e39af5
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@@ -0,0 +1,93 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/intel,PairLJCutIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_INTEL_H
+#define LMP_PAIR_LJ_CUT_INTEL_H
+
+#include "pair_lj_cut.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+// lj/cut pair style variant for the Intel package.  compute() dispatches
+// on the precision mode of the Intel fix; coefficients are kept in packed
+// per-precision ForceConst tables.
+class PairLJCutIntel : public PairLJCut {
+
+ public:
+  PairLJCutIntel(class LAMMPS *);
+
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  FixIntel *fix;   // Intel fix located in init_style()
+  int _cop;        // coprocessor id forwarded to ForceConst::set_ntypes()
+
+  template <class flt_t> class ForceConst;
+  // precision-specific driver called from compute(int,int)
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  // force kernel over neighbor-list entries [astart, aend)
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  // copies lj1..lj4, cutsq and offset into the packed tables
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1;
+    typedef struct { flt_t lj3, lj4; } fc_packed2;
+
+    __declspec(align(64)) flt_t special_lj[4];
+    fc_packed1 **ljc12o;
+    fc_packed2 **lj34;
+
+    // _cop and _memory get defined values because the destructor reads
+    // _cop even if set_ntypes() was never called with ntypes > 0.
+    // NOTE(review): -1 is a "no coprocessor" placeholder - confirm.
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;   // allocator remembered from the last set_ntypes()
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-INTEL/verlet_intel.cpp b/src/USER-INTEL/verlet_intel.cpp
new file mode 100644
index 000000000..64177e0f0
--- /dev/null
+++ b/src/USER-INTEL/verlet_intel.cpp
@@ -0,0 +1,486 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "verlet_intel.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "atom.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "output.h"
+#include "update.h"
+#include "modify.h"
+#include "compute.h"
+#include "fix.h"
+#include "timer.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Forwards all arguments to the Integrate base class; no extra state is
+// set up here.
+VerletIntel::VerletIntel(LAMMPS *lmp, int narg, char **arg) :
+  Integrate(lmp, narg, arg) {}
+
+/* ----------------------------------------------------------------------
+   initialization before run
+   runs Integrate::init(), selects the virial style, rejects per-atom
+   virial requests, and caches the flags that force_clear() consults
+------------------------------------------------------------------------- */
+
+void VerletIntel::init()
+{
+  Integrate::init();
+
+  // warn if no fixes
+
+  if (modify->nfix == 0 && comm->me == 0)
+    error->warning(FLERR,"No fixes defined, atoms won't move");
+
+  // virial_style:
+  // 1 if computed explicitly by pair->compute via sum over pair interactions
+  // 2 if computed implicitly by pair->virial_fdotr_compute via sum over ghosts
+
+  if (force->newton_pair) virial_style = 2;
+  else virial_style = 1;
+
+  // setup lists of computes for global and per-atom PE and pressure
+
+  ev_setup();
+
+  // detect if fix omp is present for clearing force arrays
+  // (force_clear() returns immediately when external_force_clear is set)
+
+  int ifix = modify->find_fix("package_omp");
+  if (ifix >= 0) external_force_clear = 1;
+
+  // per-atom virial computes are not supported with this integrator
+
+  if (nvlist_atom)
+    error->all(FLERR,
+	       "Cannot currently get per-atom virials with Intel package.");
+  // offload builds only: remember the Intel fix (0 if absent) so that
+  // setup() and run() can synchronize with the coprocessor
+  #ifdef _LMP_INTEL_OFFLOAD
+  ifix = modify->find_fix("package_intel");
+  if (ifix >= 0) fix_intel = static_cast<FixIntel *>(modify->fix[ifix]);
+  else fix_intel = 0;
+  #endif
+
+  // set flags for what arrays to clear in force_clear()
+  // need to clear additionals arrays if they exist
+
+  torqueflag = 0;
+  if (atom->torque_flag) torqueflag = 1;
+  erforceflag = 0;
+  if (atom->erforce_flag) erforceflag = 1;
+  e_flag = 0;
+  if (atom->e_flag) e_flag = 1;
+  rho_flag = 0;
+  if (atom->rho_flag) rho_flag = 1;
+
+  // orthogonal vs triclinic simulation box
+
+  triclinic = domain->triclinic;
+}
+
+/* ----------------------------------------------------------------------
+   setup before run
+   builds ghosts and neighbor lists, computes all forces once, and runs
+   the modify and output setup; update->setupflag is 1 for the duration
+------------------------------------------------------------------------- */
+
+void VerletIntel::setup()
+{
+  if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");
+
+  update->setupflag = 1;
+
+  // setup domain, communication and neighboring
+  // acquire ghosts
+  // build neighbor lists
+
+  atom->setup();
+  modify->setup_pre_exchange();
+  if (triclinic) domain->x2lamda(atom->nlocal);
+  domain->pbc();
+  domain->reset_box();
+  comm->setup();
+  if (neighbor->style) neighbor->setup_bins();
+  comm->exchange();
+  if (atom->sortfreq > 0) atom->sort();
+  comm->borders();
+  if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+  domain->image_check();
+  domain->box_too_small_check();
+  modify->setup_pre_neighbor();
+  neighbor->build();
+  neighbor->ncalls = 0;
+
+  // compute all forces
+
+  ev_set(update->ntimestep);
+  force_clear();
+  modify->setup_pre_force(vflag);
+
+  if (pair_compute_flag) force->pair->compute(eflag,vflag);
+  else if (force->pair) force->pair->compute_dummy(eflag,vflag);
+
+  if (atom->molecular) {
+    if (force->bond) force->bond->compute(eflag,vflag);
+    if (force->angle) force->angle->compute(eflag,vflag);
+    if (force->dihedral) force->dihedral->compute(eflag,vflag);
+    if (force->improper) force->improper->compute(eflag,vflag);
+  }
+
+  if (force->kspace) {
+    force->kspace->setup();
+    if (kspace_compute_flag) force->kspace->compute(eflag,vflag);
+    else force->kspace->compute_dummy(eflag,vflag);
+  }
+
+  // sync_mode chooses where the coprocessor is synchronized:
+  //   0 = never (no Intel fix, or offload balance is 0)
+  //   1 = before reverse communication
+  //   2 = after reverse communication (fix reports offload_noghost)
+  // the value is kept in a member and reused by run()
+  #ifdef _LMP_INTEL_OFFLOAD
+  sync_mode = 0;
+  if (fix_intel) {
+    if (fix_intel->offload_balance() != 0.0) {
+      if (fix_intel->offload_noghost())
+	sync_mode = 2;
+      else
+	sync_mode = 1;
+    }
+  }
+  
+  if (sync_mode == 1) fix_intel->sync_coprocessor();
+  #endif
+
+  if (force->newton) comm->reverse_comm();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (sync_mode == 2) fix_intel->sync_coprocessor();
+  #endif
+
+  modify->setup(vflag);
+  output->setup();
+  update->setupflag = 0;
+}
+
+/* ----------------------------------------------------------------------
+   setup without output
+   flag = 0 = just force calculation
+   flag = 1 = reneighbor and force calculation
+   unlike setup(), atom->setup(), atom->sort() and output->setup() are
+   not called here
+------------------------------------------------------------------------- */
+
+void VerletIntel::setup_minimal(int flag)
+{
+  update->setupflag = 1;
+
+  // setup domain, communication and neighboring
+  // acquire ghosts
+  // build neighbor lists
+
+  if (flag) {
+    modify->setup_pre_exchange();
+    if (triclinic) domain->x2lamda(atom->nlocal);
+    domain->pbc();
+    domain->reset_box();
+    comm->setup();
+    if (neighbor->style) neighbor->setup_bins();
+    comm->exchange();
+    comm->borders();
+    if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+    domain->image_check();
+    domain->box_too_small_check();
+    modify->setup_pre_neighbor();
+    neighbor->build();
+    neighbor->ncalls = 0;
+  }
+
+  // compute all forces
+
+  ev_set(update->ntimestep);
+  force_clear();
+  modify->setup_pre_force(vflag);
+
+  if (pair_compute_flag) force->pair->compute(eflag,vflag);
+  else if (force->pair) force->pair->compute_dummy(eflag,vflag);
+
+  if (atom->molecular) {
+    if (force->bond) force->bond->compute(eflag,vflag);
+    if (force->angle) force->angle->compute(eflag,vflag);
+    if (force->dihedral) force->dihedral->compute(eflag,vflag);
+    if (force->improper) force->improper->compute(eflag,vflag);
+  }
+
+  if (force->kspace) {
+    force->kspace->setup();
+    if (kspace_compute_flag) force->kspace->compute(eflag,vflag);
+    else force->kspace->compute_dummy(eflag,vflag);
+  }
+
+  // sync_mode: 0 = no coprocessor sync, 1 = sync before reverse
+  // communication, 2 = sync after it (fix reports offload_noghost)
+  #ifdef _LMP_INTEL_OFFLOAD
+  sync_mode = 0;
+  if (fix_intel) {
+    if (fix_intel->offload_balance() != 0.0) {
+      if (fix_intel->offload_noghost())
+	sync_mode = 2;
+      else
+	sync_mode = 1;
+    }
+  }
+  
+  if (sync_mode == 1) fix_intel->sync_coprocessor();
+  #endif
+
+  if (force->newton) comm->reverse_comm();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (sync_mode == 2) fix_intel->sync_coprocessor();
+  #endif
+
+  modify->setup(vflag);
+  update->setupflag = 0;
+}
+
+/* ----------------------------------------------------------------------
+   run for N steps
+   each step: initial integrate, forward communication or reneighboring,
+   force computation, optional coprocessor sync around the reverse
+   communication, final integrate, and output when due
+------------------------------------------------------------------------- */
+
+void VerletIntel::run(int n)
+{
+  bigint ntimestep;
+  int nflag,sortflag;
+
+  // fix counts are read once up front and used to skip empty modify calls
+
+  int n_post_integrate = modify->n_post_integrate;
+  int n_pre_exchange = modify->n_pre_exchange;
+  int n_pre_neighbor = modify->n_pre_neighbor;
+  int n_pre_force = modify->n_pre_force;
+  int n_post_force = modify->n_post_force;
+  int n_end_of_step = modify->n_end_of_step;
+
+  if (atom->sortfreq > 0) sortflag = 1;
+  else sortflag = 0;
+
+  for (int i = 0; i < n; i++) {
+
+    ntimestep = ++update->ntimestep;
+    ev_set(ntimestep);
+
+    // initial time integration
+
+    modify->initial_integrate(vflag);
+    if (n_post_integrate) modify->post_integrate();
+
+    // regular communication vs neighbor list rebuild
+
+    nflag = neighbor->decide();
+
+    if (nflag == 0) {
+      timer->stamp();
+      comm->forward_comm();
+      timer->stamp(TIME_COMM);
+    } else {
+      if (n_pre_exchange) modify->pre_exchange();
+      if (triclinic) domain->x2lamda(atom->nlocal);
+      domain->pbc();
+      if (domain->box_change) {
+        domain->reset_box();
+        comm->setup();
+        if (neighbor->style) neighbor->setup_bins();
+      }
+      timer->stamp();
+      comm->exchange();
+      if (sortflag && ntimestep >= atom->nextsort) atom->sort();
+      comm->borders();
+      if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+      timer->stamp(TIME_COMM);
+      if (n_pre_neighbor) modify->pre_neighbor();
+      neighbor->build();
+      timer->stamp(TIME_NEIGHBOR);
+    }
+
+    // force computations
+    // important for pair to come before bonded contributions
+    // since some bonded potentials tally pairwise energy/virial
+    // and Pair:ev_tally() needs to be called before any tallying
+
+    force_clear();
+    if (n_pre_force) modify->pre_force(vflag);
+
+    timer->stamp();
+
+    if (pair_compute_flag) {
+      force->pair->compute(eflag,vflag);
+      timer->stamp(TIME_PAIR);
+    }
+
+    if (atom->molecular) {
+      if (force->bond) force->bond->compute(eflag,vflag);
+      if (force->angle) force->angle->compute(eflag,vflag);
+      if (force->dihedral) force->dihedral->compute(eflag,vflag);
+      if (force->improper) force->improper->compute(eflag,vflag);
+      timer->stamp(TIME_BOND);
+    }
+
+    if (kspace_compute_flag) {
+      force->kspace->compute(eflag,vflag);
+      timer->stamp(TIME_KSPACE);
+    }
+
+    // sync_mode was chosen in setup()/setup_minimal(); mode 1 waits for
+    // the coprocessor before the reverse communication and the wait is
+    // stamped as pair time
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (sync_mode == 1) {
+      fix_intel->sync_coprocessor();
+      timer->stamp(TIME_PAIR);
+    }
+    #endif
+
+    // reverse communication of forces
+
+    if (force->newton) {
+      comm->reverse_comm();
+      timer->stamp(TIME_COMM);
+    }
+
+    // mode 2 waits for the coprocessor after the reverse communication
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (sync_mode == 2) {
+      fix_intel->sync_coprocessor();
+      timer->stamp(TIME_PAIR);
+    }
+    #endif
+
+    // force modifications, final time integration, diagnostics
+
+    if (n_post_force) modify->post_force(vflag);
+    modify->final_integrate();
+    if (n_end_of_step) modify->end_of_step();
+
+    // all output
+
+    if (ntimestep == output->next) {
+      timer->stamp();
+      output->write(ntimestep);
+      timer->stamp(TIME_OUTPUT);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// End-of-run housekeeping: runs the fixes' post_run hooks, re-checks the
+// box size, and updates the accumulated simulation time.
+void VerletIntel::cleanup()
+{
+  modify->post_run();
+  domain->box_too_small_check();
+  update->update_time();
+}
+
+/* ----------------------------------------------------------------------
+   clear force on own & ghost atoms
+   clear other arrays as needed
+   does nothing when external_force_clear is set
+   the torque, erforce, de and drho arrays are cleared only when the
+   matching flag was set in init()
+------------------------------------------------------------------------- */
+
+void VerletIntel::force_clear()
+{
+  int i;
+
+  if (external_force_clear) return;
+
+  // clear force on all particles
+  // if either newton flag is set, also include ghosts
+
+  if (neighbor->includegroup == 0) {
+    int nall;
+    if (force->newton) nall = atom->nlocal + atom->nghost;
+    else nall = atom->nlocal;
+
+    // bytes for one double per atom; 3-vectors use 3*nbytes
+    size_t nbytes = sizeof(double) * nall;
+
+    if (nbytes) {
+      memset(&(atom->f[0][0]),0,3*nbytes);
+      if (torqueflag)  memset(&(atom->torque[0][0]),0,3*nbytes);
+      if (erforceflag) memset(&(atom->erforce[0]),  0,  nbytes);
+      if (e_flag)      memset(&(atom->de[0]),       0,  nbytes);
+      if (rho_flag)    memset(&(atom->drho[0]),     0,  nbytes);
+    }
+
+  // neighbor includegroup flag is set
+  // clear force only on initial nfirst particles
+  // if either newton flag is set, also include ghosts
+
+  } else {
+    int nall = atom->nfirst;
+
+    double **f = atom->f;
+    for (i = 0; i < nall; i++) {
+      f[i][0] = 0.0;
+      f[i][1] = 0.0;
+      f[i][2] = 0.0;
+    }
+
+    if (torqueflag) {
+      double **torque = atom->torque;
+      for (i = 0; i < nall; i++) {
+        torque[i][0] = 0.0;
+        torque[i][1] = 0.0;
+        torque[i][2] = 0.0;
+      }
+    }
+
+    if (erforceflag) {
+      double *erforce = atom->erforce;
+      for (i = 0; i < nall; i++) erforce[i] = 0.0;
+    }
+
+    if (e_flag) {
+      double *de = atom->de;
+      for (i = 0; i < nall; i++) de[i] = 0.0;
+    }
+
+    if (rho_flag) {
+      double *drho = atom->drho;
+      for (i = 0; i < nall; i++) drho[i] = 0.0;
+    }
+
+    // ghost atoms occupy indices [nlocal, nlocal+nghost); every array is
+    // cleared over that range only, so owned atoms outside the first
+    // group keep their values
+
+    if (force->newton) {
+      nall = atom->nlocal + atom->nghost;
+
+      for (i = atom->nlocal; i < nall; i++) {
+        f[i][0] = 0.0;
+        f[i][1] = 0.0;
+        f[i][2] = 0.0;
+      }
+
+      if (torqueflag) {
+        double **torque = atom->torque;
+        for (i = atom->nlocal; i < nall; i++) {
+          torque[i][0] = 0.0;
+          torque[i][1] = 0.0;
+          torque[i][2] = 0.0;
+        }
+      }
+
+      if (erforceflag) {
+        double *erforce = atom->erforce;
+        for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0;
+      }
+
+      // de and drho previously started at index 0 here, which also wiped
+      // owned atoms beyond nfirst
+
+      if (e_flag) {
+        double *de = atom->de;
+        for (i = atom->nlocal; i < nall; i++) de[i] = 0.0;
+      }
+
+      if (rho_flag) {
+        double *drho = atom->drho;
+        for (i = atom->nlocal; i < nall; i++) drho[i] = 0.0;
+      }
+    }
+  }
+}
diff --git a/src/USER-INTEL/verlet_intel.h b/src/USER-INTEL/verlet_intel.h
new file mode 100644
index 000000000..de4231431
--- /dev/null
+++ b/src/USER-INTEL/verlet_intel.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef INTEGRATE_CLASS
+
+IntegrateStyle(verlet/intel,VerletIntel)
+
+#else
+
+#ifndef LMP_VERLET_INTEL_H
+#define LMP_VERLET_INTEL_H
+
+#include "integrate.h"
+#ifdef LMP_INTEL_OFFLOAD
+#include "fix_intel.h"
+#endif
+
+namespace LAMMPS_NS {
+
+// Velocity-Verlet style integrator for the Intel package.  In offload
+// builds it additionally synchronizes with the coprocessor around the
+// reverse communication step.
+class VerletIntel : public Integrate {
+ public:
+  VerletIntel(class LAMMPS *, int, char **);
+  virtual ~VerletIntel() {}
+  virtual void init();              // one-time initialization before a run
+  virtual void setup();             // full setup: neighbor build, forces, output
+  virtual void setup_minimal(int);  // setup without output; arg = reneighbor flag
+  virtual void run(int);            // advance the given number of timesteps
+  void cleanup();                   // post-run housekeeping
+
+ protected:
+  int triclinic;                    // 0 if domain is orthog, 1 if triclinic
+  int torqueflag,erforceflag;       // 1 if that per-atom array must be cleared
+  int e_flag,rho_flag;              // 1 if atom->de / atom->drho must be cleared
+
+  virtual void force_clear();       // zero force and related per-atom arrays
+  #ifdef _LMP_INTEL_OFFLOAD
+  FixIntel *fix_intel;              // Intel fix, or 0 if not defined
+  int sync_mode;                    // 0 none, 1 sync before reverse comm, 2 after
+  #endif
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+W: No fixes defined, atoms won't move
+
+If you are not using a fix like nve, nvt, npt then atom velocities and
+coordinates will not be updated during timestepping.
+
+E: Cannot currently get per-atom virials with Intel package.
+
+The Intel package does not yet support per-atom virial calculation.
+
+*/
diff --git a/src/USER-INTEL/verlet_split_intel.cpp b/src/USER-INTEL/verlet_split_intel.cpp
new file mode 100644
index 000000000..3976607b1
--- /dev/null
+++ b/src/USER-INTEL/verlet_split_intel.cpp
@@ -0,0 +1,589 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Yuxing Peng and Chris Knight (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "verlet_split_intel.h"
+#include "universe.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "output.h"
+#include "update.h"
+#include "fix.h"
+#include "modify.h"
+#include "timer.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) :
+  VerletIntel(lmp, narg, arg)
+{
+  // build Rspace/Kspace blocks; first do error checks on partitions
+
+  if (universe->nworlds != 2)
+    error->universe_all(FLERR,"Verlet/split requires 2 partitions");
+  if (universe->procs_per_world[0] % universe->procs_per_world[1])
+    error->universe_all(FLERR,"Verlet/split requires Rspace partition "
+                        "size be multiple of Kspace partition size");
+
+  // master = 1 for Rspace procs (partition 0), 0 for Kspace procs
+
+  if (universe->iworld == 0) master = 1;
+  else master = 0;
+  // ratio = # of Rspace procs per Kspace proc (exact, per the check above)
+  ratio = universe->procs_per_world[0] / universe->procs_per_world[1];
+
+  // Kspace root proc broadcasts info about Kspace proc layout to Rspace procs
+
+  int kspace_procgrid[3];
+
+  if (universe->me == universe->root_proc[1]) {
+    kspace_procgrid[0] = comm->procgrid[0];
+    kspace_procgrid[1] = comm->procgrid[1];
+    kspace_procgrid[2] = comm->procgrid[2];
+  }
+  MPI_Bcast(kspace_procgrid,3,MPI_INT,universe->root_proc[1],universe->uworld);
+
+  int ***kspace_grid2proc;          // universe-wide copy of the Kspace grid2proc
+  memory->create(kspace_grid2proc,kspace_procgrid[0],
+                 kspace_procgrid[1],kspace_procgrid[2],
+                 "verlet/split:kspace_grid2proc");
+
+  if (universe->me == universe->root_proc[1]) {
+    for (int i = 0; i < comm->procgrid[0]; i++)
+      for (int j = 0; j < comm->procgrid[1]; j++)
+        for (int k = 0; k < comm->procgrid[2]; k++)
+          kspace_grid2proc[i][j][k] = comm->grid2proc[i][j][k];
+  }
+  MPI_Bcast(&kspace_grid2proc[0][0][0],   // assumes contiguous 3d storage - confirm
+            kspace_procgrid[0]*kspace_procgrid[1]*kspace_procgrid[2],MPI_INT,
+            universe->root_proc[1],universe->uworld);
+
+  // Rspace partition must be multiple of Kspace partition in each dim
+  // so atoms of one Kspace proc coincide with atoms of several Rspace procs
+
+  if (master) {
+    int flag = 0;
+    if (comm->procgrid[0] % kspace_procgrid[0]) flag = 1;
+    if (comm->procgrid[1] % kspace_procgrid[1]) flag = 1;
+    if (comm->procgrid[2] % kspace_procgrid[2]) flag = 1;
+    if (flag)
+      error->one(FLERR,
+                 "Verlet/split requires Rspace partition layout be "
+                 "multiple of Kspace partition layout in each dim");
+  }
+
+  // block = 1 Kspace proc with set of Rspace procs it overlays
+  // me_block = 0 for Kspace proc
+  // me_block = 1 to ratio for Rspace procs
+  // block = MPI communicator for that set of procs
+
+  int iblock,key;                   // MPI_Comm_split color (block ID) and rank key
+
+  if (!master) {
+    iblock = comm->me;              // Kspace proc: its own comm->me is the block ID
+    key = 0;                        // smallest key, so it becomes block rank 0
+  } else {
+    int kpx = comm->myloc[0] / (comm->procgrid[0]/kspace_procgrid[0]);
+    int kpy = comm->myloc[1] / (comm->procgrid[1]/kspace_procgrid[1]);
+    int kpz = comm->myloc[2] / (comm->procgrid[2]/kspace_procgrid[2]);
+    iblock = kspace_grid2proc[kpx][kpy][kpz];   // Kspace proc at that grid cell
+    key = 1;                        // equal keys: MPI orders by rank in uworld
+  }
+
+  MPI_Comm_split(universe->uworld,iblock,key,&block);
+  MPI_Comm_rank(block,&me_block);
+
+  // output block groupings to universe screen/logfile
+  // bmap is ordered by block and then by proc within block
+  // each proc sets only its own slot; MPI_MAX over the -1 fill merges all slots
+  int *bmap = new int[universe->nprocs];
+  for (int i = 0; i < universe->nprocs; i++) bmap[i] = -1;
+  bmap[iblock*(ratio+1)+me_block] = universe->me;
+
+  int *bmapall = new int[universe->nprocs];
+  MPI_Allreduce(bmap,bmapall,universe->nprocs,MPI_INT,MPI_MAX,universe->uworld);
+
+  if (universe->me == 0) {
+    if (universe->uscreen) {
+      fprintf(universe->uscreen,
+              "Per-block Rspace/Kspace proc IDs (original proc IDs):\n");
+      int m = 0;                    // index of the current block's first slot
+      for (int i = 0; i < universe->nprocs/(ratio+1); i++) {
+        fprintf(universe->uscreen,"  block %d:",i);
+        int kspace_proc = bmapall[m];
+        for (int j = 1; j <= ratio; j++)
+          fprintf(universe->uscreen," %d",bmapall[m+j]);
+        fprintf(universe->uscreen," %d",kspace_proc);
+        kspace_proc = bmapall[m];   // re-reads the same value as above
+        for (int j = 1; j <= ratio; j++) {
+          if (j == 1) fprintf(universe->uscreen," (");
+          else fprintf(universe->uscreen," ");
+          fprintf(universe->uscreen,"%d",
+                  universe->uni2orig[bmapall[m+j]]);
+        }
+        fprintf(universe->uscreen," %d)\n",universe->uni2orig[kspace_proc]);
+        m += ratio + 1;
+      }
+    }
+    if (universe->ulogfile) {       // same report as above, written to the logfile
+      fprintf(universe->ulogfile,
+              "Per-block Rspace/Kspace proc IDs (original proc IDs):\n");
+      int m = 0;
+      for (int i = 0; i < universe->nprocs/(ratio+1); i++) {
+        fprintf(universe->ulogfile,"  block %d:",i);
+        int kspace_proc = bmapall[m];
+        for (int j = 1; j <= ratio; j++)
+          fprintf(universe->ulogfile," %d",bmapall[m+j]);
+
+        fprintf(universe->ulogfile," %d",kspace_proc);
+        kspace_proc = bmapall[m];   // re-reads the same value as above
+        for (int j = 1; j <= ratio; j++) {
+          if (j == 1) fprintf(universe->ulogfile," (");
+          else fprintf(universe->ulogfile," ");
+          fprintf(universe->ulogfile,"%d",
+                  universe->uni2orig[bmapall[m+j]]);
+        }
+        fprintf(universe->ulogfile," %d)\n",universe->uni2orig[kspace_proc]);
+        m += ratio + 1;
+      }
+    }
+  }
+
+  memory->destroy(kspace_grid2proc);
+  delete [] bmap;
+  delete [] bmapall;
+
+  // size/disp = vectors for MPI gather/scatter within block
+  // ratio+1 entries each: one per proc in the block (Kspace proc + Rspace procs)
+  qsize = new int[ratio+1];
+  qdisp = new int[ratio+1];
+  xsize = new int[ratio+1];
+  xdisp = new int[ratio+1];
+
+  // f_kspace = Rspace copy of Kspace forces
+  // allocate dummy version for Kspace partition
+  // (k2r_comm() evaluates f_kspace[0] on every proc, so it must not be NULL)
+  maxatom = 0;
+  f_kspace = NULL;
+  if (!master) memory->create(f_kspace,1,1,"verlet/split:f_kspace");
+}
+
+/* ---------------------------------------------------------------------- */
+
+VerletSplitIntel::~VerletSplitIntel()
+{
+  MPI_Comm_free(&block);          // communicator made by MPI_Comm_split
+  memory->destroy(f_kspace);      // Rspace copy (or Kspace dummy) of forces
+  delete [] xdisp;                // gather/scatter size and displacement vectors
+  delete [] xsize;
+  delete [] qdisp;
+  delete [] qsize;
+}
+
+/* ----------------------------------------------------------------------
+   initialization before run
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::init()
+{
+  // a missing KSpace style is only reported as a warning, and only by proc 0
+  if (comm->me == 0 && !force->kspace)
+    error->warning(FLERR,"No Kspace calculation with verlet/split");
+
+  // record whether the KSpace style matches "tip4p"
+  tip4p_flag = force->kspace_match("tip4p",0) ? 1 : 0;
+
+  // currently TIP4P does not work with verlet/split, so generate an error;
+  // the tip4p_flag branches elsewhere in this file document the missing pieces
+  if (tip4p_flag) error->all(FLERR,"Verlet/split does not yet support TIP4P");
+
+  VerletIntel::init();
+}
+
+/* ----------------------------------------------------------------------
+   setup before run
+   servant partition only sets up KSpace calculation
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::setup()
+{
+  // the proc with comm->me == 0 announces the setup phase if it has a screen
+  if (screen && comm->me == 0) fprintf(screen,"Setting up run ...\n");
+  // Rspace procs run the full integrator setup, Kspace procs only KSpace setup
+  // NOTE(review): force->kspace is not NULL-checked, yet init() only warns
+  if (master) VerletIntel::setup();
+  else force->kspace->setup();
+}
+
+/* ----------------------------------------------------------------------
+   setup without output
+   flag = 0 = just force calculation
+   flag = 1 = reneighbor and force calculation
+   servant partition only sets up KSpace calculation
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::setup_minimal(int flag)
+{
+  // flag is forwarded on Rspace procs only; Kspace procs just set up KSpace
+  if (master) {
+    VerletIntel::setup_minimal(flag);
+  } else force->kspace->setup();
+}
+
+/* ----------------------------------------------------------------------
+   run for N steps
+   master partition does everything but Kspace
+   servant partition does just Kspace
+   communicate back and forth every step:
+     atom coords from master -> servant
+     kspace forces from servant -> master
+     also box bounds from master -> servant if necessary
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::run(int n)
+{
+  bigint ntimestep;
+  int nflag,sortflag;               // reneighbor decision, atom sorting enabled
+
+  // sync both partitions before start timer
+
+  MPI_Barrier(universe->uworld);
+  timer->init();
+  timer->barrier_start(TIME_LOOP);
+
+  // setup initial Rspace <-> Kspace comm params
+
+  rk_setup();
+
+  // check if OpenMP support fix defined (looked up by fix ID "package_omp")
+
+  Fix *fix_omp;
+  int ifix = modify->find_fix("package_omp");
+  if (ifix < 0) fix_omp = NULL;
+  else fix_omp = modify->fix[ifix];
+
+  // flags for timestepping iterations
+
+  int n_post_integrate = modify->n_post_integrate;
+  int n_pre_exchange = modify->n_pre_exchange;
+  int n_pre_neighbor = modify->n_pre_neighbor;
+  int n_pre_force = modify->n_pre_force;
+  int n_post_force = modify->n_post_force;
+  int n_end_of_step = modify->n_end_of_step;
+
+  if (atom->sortfreq > 0) sortflag = 1;
+  else sortflag = 0;
+
+  for (int i = 0; i < n; i++) {
+
+    ntimestep = ++update->ntimestep;
+    ev_set(ntimestep);
+
+    // initial time integration
+
+    if (master) {
+      modify->initial_integrate(vflag);
+      if (n_post_integrate) modify->post_integrate();
+    }
+
+    // regular communication vs neighbor list rebuild
+    // block rank 1 (an Rspace proc) broadcasts its decision to the whole block
+    if (master) nflag = neighbor->decide();
+    MPI_Bcast(&nflag,1,MPI_INT,1,block);
+
+    if (master) {
+      if (nflag == 0) {
+        timer->stamp();
+        comm->forward_comm();
+        timer->stamp(TIME_COMM);
+      } else {
+        if (n_pre_exchange) modify->pre_exchange();
+        if (triclinic) domain->x2lamda(atom->nlocal);
+        domain->pbc();
+        if (domain->box_change) {
+          domain->reset_box();
+          comm->setup();
+          if (neighbor->style) neighbor->setup_bins();
+        }
+        timer->stamp();
+        comm->exchange();
+        if (sortflag && ntimestep >= atom->nextsort) atom->sort();
+        comm->borders();
+        if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+        timer->stamp(TIME_COMM);
+        if (n_pre_neighbor) modify->pre_neighbor();
+        neighbor->build();
+        timer->stamp(TIME_NEIGHBOR);
+      }
+    }
+
+    // if reneighboring occurred, re-setup Rspace <-> Kspace comm params
+    // comm Rspace atom coords to Kspace procs
+
+    if (nflag) rk_setup();
+    r2k_comm();
+
+    // force computations (force_clear() is called on both partitions)
+
+    force_clear();
+
+    if (master) {
+      if (n_pre_force) modify->pre_force(vflag);
+
+      timer->stamp();
+      if (force->pair) {
+        force->pair->compute(eflag,vflag);
+        timer->stamp(TIME_PAIR);
+      }
+
+      if (atom->molecular) {
+        if (force->bond) force->bond->compute(eflag,vflag);
+        if (force->angle) force->angle->compute(eflag,vflag);
+        if (force->dihedral) force->dihedral->compute(eflag,vflag);
+        if (force->improper) force->improper->compute(eflag,vflag);
+        timer->stamp(TIME_BOND);
+      }
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (sync_mode == 1) {         // sync before the reverse-comm step below
+	fix_intel->sync_coprocessor();
+	timer->stamp(TIME_PAIR);
+      }
+      #endif
+
+      if (force->newton) {
+        comm->reverse_comm();
+        timer->stamp(TIME_COMM);
+      }
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (sync_mode == 2) {         // sync after the reverse-comm step above
+	fix_intel->sync_coprocessor();
+	timer->stamp(TIME_PAIR);
+      }
+      #endif
+
+    } else {
+
+      // run FixOMP as sole pre_force fix, if defined
+
+      if (fix_omp) fix_omp->pre_force(vflag);
+
+      if (force->kspace) {
+        timer->stamp();
+        force->kspace->compute(eflag,vflag);
+        timer->stamp(TIME_KSPACE);
+      }
+
+      // TIP4P PPPM puts forces on ghost atoms, so must reverse_comm()
+      // NOTE(review): init() raises an error for TIP4P, so likely never taken
+      if (tip4p_flag && force->newton) {
+        comm->reverse_comm();
+        timer->stamp(TIME_COMM);
+      }
+    }
+
+    // comm and sum Kspace forces back to Rspace procs
+
+    k2r_comm();
+
+    // force modifications, final time integration, diagnostics
+    // all output
+
+    if (master) {
+      if (n_post_force) modify->post_force(vflag);
+      modify->final_integrate();
+      if (n_end_of_step) modify->end_of_step();
+
+      if (ntimestep == output->next) {
+        timer->stamp();
+        output->write(ntimestep);
+        timer->stamp(TIME_OUTPUT);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   setup params for Rspace <-> Kspace communication
+   called initially and after every reneighbor
+   also communicate atom charges from Rspace to KSpace, since they are static
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::rk_setup()
+{
+  // grow f_kspace array on master procs if necessary
+  // (triggered by nlocal exceeding maxatom; new size is taken from atom->nmax)
+  if (master) {
+    if (atom->nlocal > maxatom) {
+      memory->destroy(f_kspace);
+      maxatom = atom->nmax;
+      memory->create(f_kspace,maxatom,3,"verlet/split:f_kspace");
+    }
+  }
+
+  // qsize = # of atoms owned by each master proc in block
+  // gathered onto block rank 0 (the Kspace proc), which itself contributes n = 0
+  int n = 0;
+  if (master) n = atom->nlocal;
+  MPI_Gather(&n,1,MPI_INT,qsize,1,MPI_INT,0,block);
+
+  // setup qdisp, xsize, xdisp based on qsize
+  // only needed by Kspace proc
+  // set Kspace nlocal to sum of Rspace nlocals
+  // ensure Kspace atom arrays are large enough
+
+  if (!master) {
+    qsize[0] = qdisp[0] = xsize[0] = xdisp[0] = 0;
+    for (int i = 1; i <= ratio; i++) {
+      qdisp[i] = qdisp[i-1]+qsize[i-1];     // prefix sum of atom counts
+      xsize[i] = 3*qsize[i];                // 3 doubles per atom
+      xdisp[i] = xdisp[i-1]+xsize[i-1];
+    }
+
+    atom->nlocal = qdisp[ratio] + qsize[ratio];
+    while (atom->nmax <= atom->nlocal) atom->avec->grow(0);
+    atom->nghost = 0;
+  }
+
+  // gather Rspace atom charges to Kspace proc (repeated on each rk_setup() call)
+
+  MPI_Gatherv(atom->q,n,MPI_DOUBLE,atom->q,qsize,qdisp,MPI_DOUBLE,0,block);
+
+  // for TIP4P also need to send atom type and tag
+  // KSpace procs need to acquire ghost atoms and map all their atoms
+  // map_clear() call is in lieu of comm->exchange() which performs map_clear
+  // borders() call acquires ghost atoms and maps them
+  // NOTE: do atom coords need to be communicated here before borders() call?
+  //   could do this by calling r2k_comm() here and not again from run()
+  //   except that forward_comm() in r2k_comm() is wrong
+
+  if (tip4p_flag) {
+    //r2k_comm();
+    MPI_Gatherv(atom->type,n,MPI_INT,atom->type,qsize,qdisp,MPI_INT,0,block);
+    MPI_Gatherv(atom->tag,n,MPI_LMP_TAGINT,
+                atom->tag,qsize,qdisp,MPI_LMP_TAGINT,0,block);
+    if (!master) {
+      if (triclinic) domain->x2lamda(atom->nlocal);
+      if (domain->box_change) comm->setup();
+      timer->stamp();
+      atom->map_clear();
+      comm->borders();
+      if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+      timer->stamp(TIME_COMM);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   communicate Rspace atom coords to Kspace
+   also eflag,vflag and box bounds if needed
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::r2k_comm()
+{
+  MPI_Status status;
+  // gather Rspace coords onto block rank 0 (the Kspace proc), which sends n = 0
+  int n = 0;
+  if (master) n = atom->nlocal;
+  MPI_Gatherv(atom->x[0],n*3,MPI_DOUBLE,atom->x[0],xsize,xdisp,
+              MPI_DOUBLE,0,block);
+
+  // send eflag,vflag from Rspace to Kspace
+  // only block rank 1 sends; receive type must match the MPI_INT that is sent
+  if (me_block == 1) {
+    int flags[2];
+    flags[0] = eflag; flags[1] = vflag;
+    MPI_Send(flags,2,MPI_INT,0,0,block);
+  } else if (!master) {
+    int flags[2];
+    MPI_Recv(flags,2,MPI_INT,1,0,block,&status);
+    eflag = flags[0]; vflag = flags[1];
+  }
+
+  // send box bounds from Rspace to Kspace if simulation box is dynamic
+  // Kspace proc then resets its global/local box and redoes the KSpace setup
+  if (domain->box_change) {
+    if (me_block == 1) {
+      MPI_Send(domain->boxlo,3,MPI_DOUBLE,0,0,block);
+      MPI_Send(domain->boxhi,3,MPI_DOUBLE,0,0,block);
+    } else if (!master) {
+      MPI_Recv(domain->boxlo,3,MPI_DOUBLE,1,0,block,&status);
+      MPI_Recv(domain->boxhi,3,MPI_DOUBLE,1,0,block,&status);
+      domain->set_global_box();
+      domain->set_local_box();
+      force->kspace->setup();
+    }
+  }
+
+  // for TIP4P, Kspace partition needs to update its ghost atoms
+
+  if (tip4p_flag && !master) {
+    timer->stamp();
+    comm->forward_comm();
+    timer->stamp(TIME_COMM);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   communicate and sum Kspace atom forces back to Rspace
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::k2r_comm()
+{
+  // energy and virial are broadcast from block rank 0 (the Kspace proc)
+  if (eflag) MPI_Bcast(&force->kspace->energy,1,MPI_DOUBLE,0,block);
+  if (vflag) MPI_Bcast(force->kspace->virial,6,MPI_DOUBLE,0,block);
+
+  // Kspace proc receives nothing; each Rspace proc receives 3 doubles per atom
+  const int nrecv = master ? atom->nlocal*3 : 0;
+  MPI_Scatterv(atom->f[0],xsize,xdisp,MPI_DOUBLE,
+               f_kspace[0],nrecv,MPI_DOUBLE,0,block);
+
+  if (!master) return;
+
+  // add the received Kspace contribution to the local Rspace forces
+  double **frc = atom->f;
+  const int nown = atom->nlocal;
+  for (int iatom = 0; iatom < nown; iatom++)
+    for (int dim = 0; dim < 3; dim++)
+      frc[iatom][dim] += f_kspace[iatom][dim];
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of Kspace force array on master procs
+------------------------------------------------------------------------- */
+
+bigint VerletSplitIntel::memory_usage()
+{
+  // f_kspace is created as maxatom x 3 doubles in rk_setup()
+  return maxatom*3 * sizeof(double);
+}
diff --git a/src/USER-INTEL/verlet_split_intel.h b/src/USER-INTEL/verlet_split_intel.h
new file mode 100644
index 000000000..3f81d41a9
--- /dev/null
+++ b/src/USER-INTEL/verlet_split_intel.h
@@ -0,0 +1,89 @@
+/* -------------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef INTEGRATE_CLASS
+
+IntegrateStyle(verlet/split/intel,VerletSplitIntel)
+
+#else
+
+#ifndef LMP_VERLET_SPLIT_INTEL_H
+#define LMP_VERLET_SPLIT_INTEL_H
+
+#include "verlet_intel.h"
+#ifdef LMP_INTEL_OFFLOAD
+#include "fix_intel.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class VerletSplitIntel : public VerletIntel {   // style verlet/split/intel
+ public:
+  VerletSplitIntel(class LAMMPS *, int, char **);
+  ~VerletSplitIntel();
+  void init();
+  void setup();
+  void setup_minimal(int);
+  void run(int);
+  bigint memory_usage();             // bytes of f_kspace, from maxatom
+
+ private:
+  int master;                        // 1 if an Rspace proc, 0 if Kspace
+  int me_block;                      // proc ID within Rspace/Kspace block
+  int ratio;                         // ratio of Rspace procs to Kspace procs
+  int *qsize,*qdisp,*xsize,*xdisp;   // MPI gather/scatter params for block comm
+  MPI_Comm block;                    // communicator within one block
+  int tip4p_flag;                    // 1 if PPPM/tip4p so do extra comm
+
+  double **f_kspace;                 // copy of Kspace forces on Rspace procs
+  int maxatom;                       // # of rows allocated in f_kspace
+
+  void rk_setup();                   // (re)compute gather/scatter sizes, send q
+  void r2k_comm();                   // Rspace -> Kspace: coords, flags, box
+  void k2r_comm();                   // Kspace -> Rspace: energy, virial, forces
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Verlet/split requires 2 partitions
+
+See the -partition command-line switch.
+
+E: Verlet/split requires Rspace partition size be multiple of Kspace partition size
+
+This is so there is an equal number of Rspace processors for every
+Kspace processor.
+
+E: Verlet/split requires Rspace partition layout be multiple of Kspace partition layout in each dim
+
+This is controlled by the processors command.
+
+W: No Kspace calculation with verlet/split
+
+The 2nd partition performs a kspace calculation so the kspace_style
+command must be used.
+
+E: Verlet/split does not yet support TIP4P
+
+This is a current limitation.
+
+E: Cannot currently get per-atom virials with Intel package.
+
+The Intel package does not yet support per-atom virial calculation.
+
+*/
diff --git a/src/USER-OMP/pair_gran_hooke_history_omp.cpp b/src/USER-OMP/pair_gran_hooke_history_omp.cpp
index 57827adf2..afe287c60 100644
--- a/src/USER-OMP/pair_gran_hooke_history_omp.cpp
+++ b/src/USER-OMP/pair_gran_hooke_history_omp.cpp
@@ -1,329 +1,326 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "pair_gran_hooke_history_omp.h"
 #include "atom.h"
 #include "comm.h"
 #include "fix.h"
 #include "force.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "update.h"
 
 #include "string.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairGranHookeHistoryOMP::PairGranHookeHistoryOMP(LAMMPS *lmp) :
   PairGranHookeHistory(lmp), ThrOMP(lmp, THR_PAIR)
 {
   suffix_flag |= Suffix::OMP;
   respa_enable = 0;
-  // trigger use of OpenMP version of FixShearHistory
-  suffix = new char[4];
-  memcpy(suffix,"omp",4);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGranHookeHistoryOMP::compute(int eflag, int vflag)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = vflag_fdotr = 0;
 
   computeflag = 1;
   const int shearupdate = (update->setupflag) ? 0 : 1;
 
   // update rigid body info for owned & ghost atoms if using FixRigid masses
   // body[i] = which body atom I is in, -1 if none
   // mass_body = mass of each rigid body
 
   if (fix_rigid && neighbor->ago == 0) {
     int tmp;
     int *body = (int *) fix_rigid->extract("body",tmp);
     double *mass_body = (double *) fix_rigid->extract("masstotal",tmp);
     if (atom->nmax > nmax) {
       memory->destroy(mass_rigid);
       nmax = atom->nmax;
       memory->create(mass_rigid,nmax,"pair:mass_rigid");
     }
     int nlocal = atom->nlocal;
     for (int i = 0; i < nlocal; i++)
       if (body[i] >= 0) mass_rigid[i] = mass_body[body[i]];
       else mass_rigid[i] = 0.0;
     comm->forward_comm_pair(this);
   }
 
   const int nall = atom->nlocal + atom->nghost;
   const int nthreads = comm->nthreads;
   const int inum = list->inum;
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none) shared(eflag,vflag)
 #endif
   {
     int ifrom, ito, tid;
 
     loop_setup_thr(ifrom, ito, tid, inum, nthreads);
     ThrData *thr = fix->get_thr(tid);
     ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
 
     if (evflag)
       if (shearupdate) eval<1,1>(ifrom, ito, thr);
       else eval<1,0>(ifrom, ito, thr);
     else
       if (shearupdate) eval<0,1>(ifrom, ito, thr);
       else eval<0,0>(ifrom, ito, thr);
 
     reduce_thr(this, eflag, vflag, thr);
   } // end of omp parallel region
 }
 
 template <int EVFLAG, int SHEARUPDATE>
 void PairGranHookeHistoryOMP::eval(int iifrom, int iito, ThrData * const thr)
 {
   int i,j,ii,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz;
   double myshear[3];
   double radi,radj,radsum,rsq,r,rinv,rsqinv;
   double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
   double wr1,wr2,wr3;
   double vtr1,vtr2,vtr3,vrel;
   double mi,mj,meff,damp,ccel,tor1,tor2,tor3;
   double fn,fs,fs1,fs2,fs3;
   double shrmag,rsht;
   int *ilist,*jlist,*numneigh,**firstneigh;
   int *touch,**firsttouch;
   double *allshear,**firstshear;
 
   const double * const * const x = atom->x;
   const double * const * const v = atom->v;
   const double * const * const omega = atom->omega;
   const double * const radius = atom->radius;
   const double * const rmass = atom->rmass;
   const double * const mass = atom->mass;
   double * const * const f = thr->get_f();
   double * const * const torque = thr->get_torque();
   const int * const type = atom->type;
   const int * const mask = atom->mask;
   const int nlocal = atom->nlocal;
   double fxtmp,fytmp,fztmp;
   double t1tmp,t2tmp,t3tmp;
 
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
   firsttouch = listgranhistory->firstneigh;
   firstshear = listgranhistory->firstdouble;
 
   // loop over neighbors of my atoms
 
   for (ii = iifrom; ii < iito; ++ii) {
 
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     radi = radius[i];
     touch = firsttouch[i];
     allshear = firstshear[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
     fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       radj = radius[j];
       radsum = radi + radj;
 
       if (rsq >= radsum*radsum) {
 
         // unset non-touching neighbors
 
         touch[jj] = 0;
         myshear[0] = 0.0;
         myshear[1] = 0.0;
         myshear[2] = 0.0;
 
       } else {
         r = sqrt(rsq);
         rinv = 1.0/r;
         rsqinv = 1.0/rsq;
 
         // relative translational velocity
 
         vr1 = v[i][0] - v[j][0];
         vr2 = v[i][1] - v[j][1];
         vr3 = v[i][2] - v[j][2];
 
         // normal component
 
         vnnr = vr1*delx + vr2*dely + vr3*delz;
         vn1 = delx*vnnr * rsqinv;
         vn2 = dely*vnnr * rsqinv;
         vn3 = delz*vnnr * rsqinv;
 
         // tangential component
 
         vt1 = vr1 - vn1;
         vt2 = vr2 - vn2;
         vt3 = vr3 - vn3;
 
         // relative rotational velocity
 
         wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
         wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
         wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
 
         // meff = effective mass of pair of particles
         // if I or J part of rigid body, use body mass
         // if I or J is frozen, meff is other particle
 
         if (rmass) {
           mi = rmass[i];
           mj = rmass[j];
         } else {
           mi = mass[type[i]];
           mj = mass[type[j]];
         }
         if (fix_rigid) {
           if (mass_rigid[i] > 0.0) mi = mass_rigid[i];
           if (mass_rigid[j] > 0.0) mj = mass_rigid[j];
         }
 
         meff = mi*mj / (mi+mj);
         if (mask[i] & freeze_group_bit) meff = mj;
         if (mask[j] & freeze_group_bit) meff = mi;
 
         // normal forces = Hookian contact + normal velocity damping
 
         damp = meff*gamman*vnnr*rsqinv;
         ccel = kn*(radsum-r)*rinv - damp;
 
         // relative velocities
 
         vtr1 = vt1 - (delz*wr2-dely*wr3);
         vtr2 = vt2 - (delx*wr3-delz*wr1);
         vtr3 = vt3 - (dely*wr1-delx*wr2);
         vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
         vrel = sqrt(vrel);
 
         // shear history effects
 
         touch[jj] = 1;
         memcpy(myshear,allshear + 3*jj, 3*sizeof(double));
 
         if (SHEARUPDATE) {
           myshear[0] += vtr1*dt;
           myshear[1] += vtr2*dt;
           myshear[2] += vtr3*dt;
         }
         shrmag = sqrt(myshear[0]*myshear[0] + myshear[1]*myshear[1] +
                       myshear[2]*myshear[2]);
 
         // rotate shear displacements
 
         rsht = myshear[0]*delx + myshear[1]*dely + myshear[2]*delz;
         rsht *= rsqinv;
         if (SHEARUPDATE) {
           myshear[0] -= rsht*delx;
           myshear[1] -= rsht*dely;
           myshear[2] -= rsht*delz;
         }
 
         // tangential forces = shear + tangential velocity damping
 
         fs1 = - (kt*myshear[0] + meff*gammat*vtr1);
         fs2 = - (kt*myshear[1] + meff*gammat*vtr2);
         fs3 = - (kt*myshear[2] + meff*gammat*vtr3);
 
         // rescale frictional displacements and forces if needed
 
         fs = sqrt(fs1*fs1 + fs2*fs2 + fs3*fs3);
         fn = xmu * fabs(ccel*r);
 
         if (fs > fn) {
           if (shrmag != 0.0) {
             const double fnfs = fn/fs;
             const double mgkt = meff*gammat/kt;
             myshear[0] = fnfs * (myshear[0] + mgkt*vtr1) - mgkt*vtr1;
             myshear[1] = fnfs * (myshear[1] + mgkt*vtr2) - mgkt*vtr2;
             myshear[2] = fnfs * (myshear[2] + mgkt*vtr3) - mgkt*vtr3;
             fs1 *= fnfs;
             fs2 *= fnfs;
             fs3 *= fnfs;
           } else fs1 = fs2 = fs3 = 0.0;
         }
 
         // forces & torques
 
         fx = delx*ccel + fs1;
         fy = dely*ccel + fs2;
         fz = delz*ccel + fs3;
         fxtmp  += fx;
         fytmp  += fy;
         fztmp  += fz;
 
         tor1 = rinv * (dely*fs3 - delz*fs2);
         tor2 = rinv * (delz*fs1 - delx*fs3);
         tor3 = rinv * (delx*fs2 - dely*fs1);
         t1tmp -= radi*tor1;
         t2tmp -= radi*tor2;
         t3tmp -= radi*tor3;
 
         if (j < nlocal) {
           f[j][0] -= fx;
           f[j][1] -= fy;
           f[j][2] -= fz;
           torque[j][0] -= radj*tor1;
           torque[j][1] -= radj*tor2;
           torque[j][2] -= radj*tor3;
         }
 
         if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,/* newton_pair */ 0,
                                      0.0,0.0,fx,fy,fz,delx,dely,delz,thr);
 
       }
       memcpy(allshear + 3*jj, myshear, 3*sizeof(double));
     }
     f[i][0] += fxtmp;
     f[i][1] += fytmp;
     f[i][2] += fztmp;
     torque[i][0] += t1tmp;
     torque[i][1] += t2tmp;
     torque[i][2] += t3tmp;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairGranHookeHistoryOMP::memory_usage()
 {
   double bytes = memory_usage_thr();
   bytes += PairGranHookeHistory::memory_usage();
 
   return bytes;
 }
diff --git a/src/angle_hybrid.cpp b/src/angle_hybrid.cpp
index 6f1cceba0..1780c4344 100644
--- a/src/angle_hybrid.cpp
+++ b/src/angle_hybrid.cpp
@@ -1,373 +1,374 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "string.h"
 #include "ctype.h"
 #include "angle_hybrid.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define EXTRA 1000
 
 /* ---------------------------------------------------------------------- */
 
 AngleHybrid::AngleHybrid(LAMMPS *lmp) : Angle(lmp)
 {
   // no sub-styles exist until settings() or read_restart() creates them
   nstyles = 0;
   writedata = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 AngleHybrid::~AngleHybrid()
 {
   // sub-style instances and their keyword strings
 
   if (nstyles != 0) {
     for (int m = 0; m < nstyles; ++m) {
       delete styles[m];
     }
     delete [] styles;
     for (int m = 0; m < nstyles; ++m) {
       delete [] keywords[m];
     }
     delete [] keywords;
   }
 
   // per-type and per-sub-style arrays created by allocate()
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] nanglelist;
     delete [] maxangle;
     for (int m = 0; m < nstyles; ++m) {
       memory->destroy(anglelist[m]);
     }
     delete [] anglelist;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void AngleHybrid::compute(int eflag, int vflag)
 {
   // dispatch the angle computation to every sub-style
   // eflag,vflag are handed unchanged to ev_setup() and to each sub-style
   // neighbor->anglelist/nanglelist are temporarily redirected to the
   //   per-sub-style lists and restored before returning
 
   int i,j,m,n;
 
   // save ptrs to original anglelist
 
   int nanglelist_orig = neighbor->nanglelist;
   int **anglelist_orig = neighbor->anglelist;
 
   // if this is re-neighbor step, create sub-style anglelists
   // nanglelist[] = length of each sub-style list
   // realloc sub-style anglelist if necessary
   // load sub-style anglelist with 4 values from original anglelist
 
   if (neighbor->ago == 0) {
 
     // pass 1: count angles per sub-style
     // column 3 of an anglelist row is the angle type used to index map[]
     // angles whose type maps to a negative index are not counted
 
     for (m = 0; m < nstyles; m++) nanglelist[m] = 0;
     for (i = 0; i < nanglelist_orig; i++) {
       m = map[anglelist_orig[i][3]];
       if (m >= 0) nanglelist[m]++;
     }
 
     // grow any list that is too small, with EXTRA rows of headroom
     // counts are reset to 0 so they can serve as fill cursors in pass 2
 
     for (m = 0; m < nstyles; m++) {
       if (nanglelist[m] > maxangle[m]) {
         memory->destroy(anglelist[m]);
         maxangle[m] = nanglelist[m] + EXTRA;
         memory->create(anglelist[m],maxangle[m],4,"angle_hybrid:anglelist");
       }
       nanglelist[m] = 0;
     }
 
     // pass 2: copy each angle into the list of its sub-style
 
     for (i = 0; i < nanglelist_orig; i++) {
       m = map[anglelist_orig[i][3]];
       if (m < 0) continue;
       n = nanglelist[m];
       anglelist[m][n][0] = anglelist_orig[i][0];
       anglelist[m][n][1] = anglelist_orig[i][1];
       anglelist[m][n][2] = anglelist_orig[i][2];
       anglelist[m][n][3] = anglelist_orig[i][3];
       nanglelist[m]++;
     }
   }
 
   // call each sub-style's compute function
   // set neighbor->anglelist to sub-style anglelist before call
   // accumulate sub-style global/peratom energy/virial in hybrid
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   for (m = 0; m < nstyles; m++) {
     neighbor->nanglelist = nanglelist[m];
     neighbor->anglelist = anglelist[m];
 
     styles[m]->compute(eflag,vflag);
 
     if (eflag_global) energy += styles[m]->energy;
     if (vflag_global)
       for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
 
     // per-atom tallies cover local atoms, plus ghosts when newton_bond is on
 
     if (eflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double *eatom_substyle = styles[m]->eatom;
       for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
     }
     if (vflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double **vatom_substyle = styles[m]->vatom;
       for (i = 0; i < n; i++)
         for (j = 0; j < 6; j++)
           vatom[i][j] += vatom_substyle[i][j];
     }
   }
 
   // restore ptrs to original anglelist
 
   neighbor->nanglelist = nanglelist_orig;
   neighbor->anglelist = anglelist_orig;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void AngleHybrid::allocate()
 {
   allocated = 1;
   int n = atom->nangletypes;
 
   memory->create(map,n+1,"angle:map");
   memory->create(setflag,n+1,"angle:setflag");
   for (int i = 1; i <= n; i++) setflag[i] = 0;
 
   nanglelist = new int[nstyles];
   maxangle = new int[nstyles];
   anglelist = new int**[nstyles];
   for (int m = 0; m < nstyles; m++) maxangle[m] = 0;
   for (int m = 0; m < nstyles; m++) anglelist[m] = NULL;
 }
 
 /* ----------------------------------------------------------------------
    create one angle style for each arg in list
 ------------------------------------------------------------------------- */
 
 void AngleHybrid::settings(int narg, char **arg)
 {
   // narg,arg = sub-style names, each optionally followed by its own args
   // any previously defined sub-styles and per-type arrays are discarded first
   // errors out if narg < 1, if a sub-style name repeats,
   //   or if "hybrid" or "none" is given as a sub-style
 
   int i,m,istyle;
 
   if (narg < 1) error->all(FLERR,"Illegal angle_style command");
 
   // delete old lists, since cannot just change settings
 
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] nanglelist;
     delete [] maxangle;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(anglelist[i]);
     delete [] anglelist;
   }
   allocated = 0;
 
   // count sub-styles by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric word
   // need a better way to skip these exceptions
   // an arg counts as "numeric" here if its first char is not alphabetic
 
   nstyles = 0;
   i = 0;
   while (i < narg) {
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     nstyles++;
   }
 
   // allocate list of sub-styles
 
   styles = new Angle*[nstyles];
   keywords = new char*[nstyles];
 
   // allocate each sub-style and call its settings() with subset of args
   // define subset of args for a sub-style by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
   // nstyles is reset and re-counted so it always equals the number of
   //   sub-styles created so far (used by the duplicate-name check)
 
-  int dummy;
+  int sflag;
   nstyles = 0;
   i = 0;
 
   while (i < narg) {
     for (m = 0; m < nstyles; m++)
       if (strcmp(arg[i],keywords[m]) == 0)
         error->all(FLERR,"Angle style hybrid cannot use "
                    "same angle style twice");
     if (strcmp(arg[i],"hybrid") == 0)
       error->all(FLERR,"Angle style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Angle style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_angle(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+
     // NOTE(review): store_style() presumably allocates keywords[nstyles] and
     //   fills it from arg[i] according to sflag - confirm in Force
+    styles[nstyles] = force->new_angle(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],sflag);
+
     // istyle = index of the sub-style name; its args are arg[istyle+1..i-1]
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     styles[nstyles]->settings(i-istyle-1,&arg[istyle+1]);
     nstyles++;
   }
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one type
 ---------------------------------------------------------------------- */
 
 void AngleHybrid::coeff(int narg, char **arg)
 {
   if (!allocated) allocate();
 
   // arg[0] = range of angle types this command applies to
 
   int ilo,ihi;
   force->bounds(arg[0],atom->nangletypes,ilo,ihi);
 
   // arg[1] = sub-style name; look it up among the defined sub-styles
   // "none" and "skip" are also accepted even though they are not sub-styles
 
   const char *substyle = arg[1];
   int isub = 0;
   while (isub < nstyles && strcmp(substyle,keywords[isub]) != 0) isub++;
 
   bool is_none = false;
   bool is_skip = false;
   if (isub == nstyles) {
     if (strcmp(substyle,"none") == 0) is_none = true;
     else if (strcmp(substyle,"skip") == 0) is_none = is_skip = true;
     else if (strcmp(substyle,"ba") == 0)
       error->all(FLERR,"BondAngle coeff for hybrid angle has invalid format");
     else if (strcmp(substyle,"bb") == 0)
       error->all(FLERR,"BondBond coeff for hybrid angle has invalid format");
     else error->all(FLERR,"Angle coeff for hybrid has invalid style");
   }
 
   // overwrite the sub-style name with the type range so that &arg[1]
   // looks like an ordinary coeff arg list; only ptrs are copied,
   // since arg[] points into original input line
 
   arg[1] = arg[0];
 
   if (!is_none) styles[isub]->coeff(narg-1,&arg[1]);
 
   // record per-type results
   // skip: auxiliary class2 setting in data file, leave setflag/map untouched
   // none: mark type as set in hybrid, but mapped to no sub-style
   // otherwise: inherit setflag from sub-style and remember which one owns it
 
   if (is_skip) return;
 
   for (int itype = ilo; itype <= ihi; itype++) {
     if (is_none) {
       setflag[itype] = 1;
       map[itype] = -1;
     } else {
       setflag[itype] = styles[isub]->setflag[itype];
       map[itype] = isub;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    run angle style specific initialization
 ------------------------------------------------------------------------- */
 
 void AngleHybrid::init_style()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) styles[m]->init_style();
 }
 
 /* ----------------------------------------------------------------------
    return an equilibrium angle length
 ------------------------------------------------------------------------- */
 
 double AngleHybrid::equilibrium_angle(int i)
 {
   // i = angle type; a negative map entry means the type is assigned to "none"
   const int m = map[i];
   if (m < 0)
     error->one(FLERR,"Invoked angle equil angle on angle style none");
   return styles[m]->equilibrium_angle(i);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void AngleHybrid::write_restart(FILE *fp)
 {
   // layout: nstyles, then for each sub-style its keyword length
   //   (including the terminating null) followed by the keyword itself
 
   fwrite(&nstyles,sizeof(int),1,fp);
 
   for (int m = 0; m < nstyles; m++) {
     const char *key = keywords[m];
     int nchar = strlen(key) + 1;
     fwrite(&nchar,sizeof(int),1,fp);
     fwrite(key,sizeof(char),nchar,fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void AngleHybrid::read_restart(FILE *fp)
 {
   // only proc 0 reads from fp; every value read is then broadcast to all procs
   // reads the number of sub-styles, then one length-prefixed keyword each
 
   int me = comm->me;
   if (me == 0) fread(&nstyles,sizeof(int),1,fp);
   MPI_Bcast(&nstyles,1,MPI_INT,0,world);
   styles = new Angle*[nstyles];
   keywords = new char*[nstyles];
 
   // allocate() sizes its per-sub-style arrays from nstyles, so call it
   //   only after nstyles is known
 
   allocate();
 
   int n,dummy;
   for (int m = 0; m < nstyles; m++) {
     // n = keyword length as stored in the file; used directly as buffer size
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
     // NOTE(review): 2nd arg 0 presumably disables suffix lookup for the
     //   stored keyword - confirm against Force::new_angle
-    styles[m] = force->new_angle(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_angle(keywords[m],0,dummy);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double AngleHybrid::single(int type, int i1, int i2, int i3)
 {
   if (map[type] < 0) error->one(FLERR,"Invoked angle single on angle style none");
   return styles[map[type]]->single(type,i1,i2,i3);
 }
 
 /* ----------------------------------------------------------------------
    memory usage
 ------------------------------------------------------------------------- */
 
 double AngleHybrid::memory_usage()
 {
   double bytes = maxeatom * sizeof(double);
   bytes += maxvatom*6 * sizeof(double);
   for (int m = 0; m < nstyles; m++) bytes += maxangle[m]*4 * sizeof(int);
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) bytes += styles[m]->memory_usage();
   return bytes;
 }
diff --git a/src/atom.cpp b/src/atom.cpp
index 7efbf4740..550b959f2 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -1,2003 +1,2018 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "limits.h"
 #include "atom.h"
 #include "style_atom.h"
 #include "atom_vec.h"
 #include "atom_vec_ellipsoid.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "force.h"
 #include "modify.h"
 #include "fix.h"
 #include "output.h"
 #include "thermo.h"
 #include "update.h"
 #include "domain.h"
 #include "group.h"
 #include "molecule.h"
 #include "accelerator_cuda.h"
 #include "atom_masks.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 #define DELTA 1
 #define DELTA_MEMSTR 1024
 #define EPSILON 1.0e-6
 #define CUDA_CHUNK 3000
 #define MAXBODY 20       // max # of lines in one body, also in ReadData class
 
 enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
 Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
 {
   // put every member into a well-defined empty state:
   // counts to 0, pointers to NULL, flags to 0
   // no memory is allocated here
 
   natoms = 0;
   nlocal = nghost = nmax = 0;
   ntypes = 0;
   nbondtypes = nangletypes = ndihedraltypes = nimpropertypes = 0;
   nbonds = nangles = ndihedrals = nimpropers = 0;
 
   // sorting defaults: sort every 1000 steps, no user-specified bin size
 
   firstgroupname = NULL;
   sortfreq = 1000;
   nextsort = 0;
   userbinsize = 0.0;
   maxbin = maxnext = 0;
   binhead = NULL;
   next = permute = NULL;
 
   // initialize atom arrays
   // customize by adding new array
 
   tag = NULL;
   type = mask = NULL;
   image = NULL;
   x = v = f = NULL;
 
   molecule = NULL;
   molindex = molatom = NULL;
   q = NULL;
   mu = NULL;
   omega = angmom = torque = NULL;
   radius = rmass = NULL;
   ellipsoid = line = tri = body = NULL;
 
   vfrac = s0 = NULL;
   x0 = NULL;
 
   spin = NULL;
   eradius = ervel = erforce = NULL;
   cs = csforce = vforce = ervelforce = NULL;
   etag = NULL;
 
   rho = drho = e = de = cv = NULL;
   vest = NULL;
 
   // molecular topology: per-atom limits and arrays
 
   bond_per_atom =  extra_bond_per_atom = 0;
   num_bond = NULL;
   bond_type = NULL;
   bond_atom = NULL;
 
   angle_per_atom = extra_angle_per_atom = 0;
   num_angle = NULL;
   angle_type = NULL;
   angle_atom1 = angle_atom2 = angle_atom3 = NULL;
 
   dihedral_per_atom = extra_dihedral_per_atom = 0;
   num_dihedral = NULL;
   dihedral_type = NULL;
   dihedral_atom1 = dihedral_atom2 = dihedral_atom3 = dihedral_atom4 = NULL;
 
   improper_per_atom = extra_improper_per_atom = 0;
   num_improper = NULL;
   improper_type = NULL;
   improper_atom1 = improper_atom2 = improper_atom3 = improper_atom4 = NULL;
 
   maxspecial = 1;
   nspecial = NULL;
   special = NULL;
 
   // user-defined molecules
 
   nmolecule = 0;
   molecules = NULL;
 
   // custom atom arrays
 
   nivector = ndvector = 0;
   ivector = NULL;
   dvector = NULL;
   iname = dname = NULL;
 
   // initialize atom style and array existence flags
   // customize by adding new flag
 
   sphere_flag = peri_flag = electron_flag = 0;
   wavepacket_flag = sph_flag = 0;
 
   molecule_flag = 0;
   q_flag = mu_flag = 0;
   omega_flag = torque_flag = angmom_flag = 0;
   radius_flag = rmass_flag = 0;
   ellipsoid_flag = line_flag = tri_flag = body_flag = 0;
 
   vfrac_flag = 0;
   spin_flag = eradius_flag = ervel_flag = erforce_flag = ervelforce_flag = 0;
   cs_flag = csforce_flag = vforce_flag = etag_flag = 0;
 
   rho_flag = e_flag = cv_flag = vest_flag = 0;
 
   // Peridynamic scale factor
 
   pdscale = 1.0;
 
   // ntype-length arrays
 
   mass = NULL;
   mass_setflag = NULL;
 
   // callback lists & extra restart info
 
   nextra_grow = nextra_restart = nextra_border = 0;
   extra_grow = extra_restart = extra_border = NULL;
   nextra_grow_max = nextra_restart_max = nextra_border_max = 0;
   nextra_store = 0;
   extra = NULL;
 
   // default atom ID and mapping values
   // atom IDs are enabled by default; no map style is selected yet
 
   tag_enable = 1;
   map_style = map_user = 0;
   map_tag_max = -1;
   map_maxarray = map_nhash = -1;
 
   max_same = 0;
   sametag = NULL;
   map_array = NULL;
   map_bucket = NULL;
   map_hash = NULL;
 
   // atom style name and AtomVec instance are set later by create_avec()
 
   atom_style = NULL;
   avec = NULL;
 
   datamask = ALL_MASK;
   datamask_ext = ALL_MASK;
 }
 
 /* ---------------------------------------------------------------------- */
 
 Atom::~Atom()
 {
   // release everything this class owns: style name, AtomVec instance,
   //   sorting arrays, per-atom arrays, custom arrays, molecule templates,
   //   per-type arrays, callback lists and the atom map
 
   delete [] atom_style;
   delete avec;
 
   delete [] firstgroupname;
   memory->destroy(binhead);
   memory->destroy(next);
   memory->destroy(permute);
 
   // delete atom arrays
   // customize by adding new array
 
   memory->destroy(tag);
   memory->destroy(type);
   memory->destroy(mask);
   memory->destroy(image);
   memory->destroy(x);
   memory->destroy(v);
   memory->destroy(f);
 
   memory->destroy(molecule);
   memory->destroy(molindex);
   memory->destroy(molatom);
 
   memory->destroy(q);
   memory->destroy(mu);
   memory->destroy(omega);
   memory->destroy(angmom);
   memory->destroy(torque);
   memory->destroy(radius);
   memory->destroy(rmass);
   memory->destroy(ellipsoid);
   memory->destroy(line);
   memory->destroy(tri);
   memory->destroy(body);
 
   memory->destroy(vfrac);
   memory->destroy(s0);
   memory->destroy(x0);
 
   memory->destroy(spin);
   memory->destroy(eradius);
   memory->destroy(ervel);
   memory->destroy(erforce);
   memory->destroy(ervelforce);
   memory->destroy(cs);
   memory->destroy(csforce);
   memory->destroy(vforce);
   memory->destroy(etag);
 
   memory->destroy(rho);
   memory->destroy(drho);
   memory->destroy(e);
   memory->destroy(de);
   memory->destroy(cv);
   memory->destroy(vest);
 
   memory->destroy(nspecial);
   memory->destroy(special);
 
   memory->destroy(num_bond);
   memory->destroy(bond_type);
   memory->destroy(bond_atom);
 
   memory->destroy(num_angle);
   memory->destroy(angle_type);
   memory->destroy(angle_atom1);
   memory->destroy(angle_atom2);
   memory->destroy(angle_atom3);
 
   memory->destroy(num_dihedral);
   memory->destroy(dihedral_type);
   memory->destroy(dihedral_atom1);
   memory->destroy(dihedral_atom2);
   memory->destroy(dihedral_atom3);
   memory->destroy(dihedral_atom4);
 
   memory->destroy(num_improper);
   memory->destroy(improper_type);
   memory->destroy(improper_atom1);
   memory->destroy(improper_atom2);
   memory->destroy(improper_atom3);
   memory->destroy(improper_atom4);
 
   // delete custom atom arrays
   // each entry owns both its name string and its data vector
 
   for (int i = 0; i < nivector; i++) {
     delete [] iname[i];
     memory->destroy(ivector[i]);
   }
   for (int i = 0; i < ndvector; i++) {
     delete [] dname[i];
     memory->destroy(dvector[i]);
   }
 
   memory->sfree(iname);
   memory->sfree(dname);
   memory->sfree(ivector);
   memory->sfree(dvector);
 
   // delete user-defined molecules
 
   for (int i = 0; i < nmolecule; i++) delete molecules[i];
   memory->sfree(molecules);
 
   // delete per-type arrays
 
   delete [] mass;
   delete [] mass_setflag;
 
   // delete extra arrays
 
   memory->destroy(extra_grow);
   memory->destroy(extra_restart);
   memory->destroy(extra_border);
   memory->destroy(extra);
 
   // delete mapping data structures
 
   map_delete();
 }
 
 /* ----------------------------------------------------------------------
    copy modify settings from old Atom class to current Atom class
 ------------------------------------------------------------------------- */
 
 void Atom::settings(Atom *old)
 {
   // plain values are copied as-is
 
   tag_enable = old->tag_enable;
   map_user = old->map_user;
   map_style = old->map_style;
   sortfreq = old->sortfreq;
   userbinsize = old->userbinsize;
 
   // the first-group name, if any, gets its own copy of the string
 
   const char *oldname = old->firstgroupname;
   if (!oldname) return;
 
   int nchar = strlen(oldname) + 1;
   firstgroupname = new char[nchar];
   strcpy(firstgroupname,oldname);
 }
 
 /* ----------------------------------------------------------------------
    create an AtomVec style
    called from lammps.cpp, input script, restart file, replicate
 ------------------------------------------------------------------------- */
 
-void Atom::create_avec(const char *style, int narg, char **arg, char *suffix)
+void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix)
 {
   // replace the current AtomVec (if any) with a new one of the given style
   // style = base atom style name, narg,arg = style-specific args
   // trysuffix is forwarded to new_avec(), which decides whether a
   //   suffixed variant of the style is instantiated
   // atom_style is set to the name actually used, including any suffix
 
   delete [] atom_style;
   if (avec) delete avec;
 
   // unset atom style and array existence flags
   // may have been set by old avec
   // customize by adding new flag
 
   sphere_flag = peri_flag = electron_flag = 0;
   wavepacket_flag = sph_flag = 0;
 
   molecule_flag = 0;
   q_flag = mu_flag = 0;
   omega_flag = torque_flag = angmom_flag = 0;
   radius_flag = rmass_flag = 0;
   ellipsoid_flag = line_flag = tri_flag = body_flag = 0;
 
   vfrac_flag = 0;
   spin_flag = eradius_flag = ervel_flag = erforce_flag = ervelforce_flag = 0;
   cs_flag = csforce_flag = vforce_flag = etag_flag = 0;
 
   rho_flag = e_flag = cv_flag = vest_flag = 0;
 
   // create instance of AtomVec
   // use grow() to initialize atom-based arrays to length 1
   //   so that x[0][0] can always be referenced even if proc has no atoms
 
   int sflag;
-  avec = new_avec(style,suffix,sflag);
+  avec = new_avec(style,trysuffix,sflag);
   avec->store_args(narg,arg);
   avec->process_args(narg,arg);
   avec->grow(1);
 
   // sflag = 0 if the plain style was used, 1 if the lmp->suffix variant,
   //   any other non-zero value if the lmp->suffix2 variant
   // must compare with ==, not assign, or suffix2 is never recorded
 
   if (sflag) {
     char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+    if (sflag == 1) sprintf(estyle,"%s/%s",style,lmp->suffix);
+    else sprintf(estyle,"%s/%s",style,lmp->suffix2);
     int n = strlen(estyle) + 1;
     atom_style = new char[n];
     strcpy(atom_style,estyle);
   } else {
     int n = strlen(style) + 1;
     atom_style = new char[n];
     strcpy(atom_style,style);
   }
 
   // if molecular system:
   // atom IDs must be defined
   // force atom map to be created
   // map style may be reset by map_init() and its call to map_style_set()
 
   molecular = avec->molecular;
   if (molecular && tag_enable == 0)
     error->all(FLERR,"Atom IDs must be used for molecular systems");
   if (molecular) map_style = 1;
 }
 
 /* ----------------------------------------------------------------------
    generate an AtomVec class, first with suffix appended
 ------------------------------------------------------------------------- */
 
-AtomVec *Atom::new_avec(const char *style, char *suffix, int &sflag)
+AtomVec *Atom::new_avec(const char *style, int trysuffix, int &sflag)
 {
   // instantiate the AtomVec class registered under a style name
   // if trysuffix is set and suffixes are enabled, "style/suffix" is tried
   //   first, then "style/suffix2", and only then the plain style
   // sflag on return: 1 = lmp->suffix variant was created,
   //   2 = lmp->suffix2 variant was created, 0 = plain style was created
   // errors out if no registered style matches
   // the style_atom.h includes expand AtomStyle(key,Class) into one
   //   "else if" per registered style, hence the leading "if (0)"
 
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define ATOM_CLASS
 #define AtomStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_atom.h"
 #undef AtomStyle
 #undef ATOM_CLASS
+    }
+
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
 
+      if (0) return NULL;
+
+#define ATOM_CLASS
+#define AtomStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+    }
   }
 
   // no suffixed variant matched: fall back to the plain style name
 
   sflag = 0;
-
   if (0) return NULL;
 
 #define ATOM_CLASS
 #define AtomStyle(key,Class) \
   else if (strcmp(style,#key) == 0) return new Class(lmp);
 #include "style_atom.h"
 #undef ATOM_CLASS
 
   else error->all(FLERR,"Invalid atom style");
-
   return NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Atom::init()
 {
   // the extra array doesn't persist past first run, so drop it
 
   if (nextra_store != 0) {
     memory->destroy(extra);
     extra = NULL;
     nextra_store = 0;
   }
 
   // validate the per-type arrays
 
   check_mass();
 
   // resolve the atom_modify first group name to a group index
   // -1 means no first group is in use
 
   firstgroup = -1;
   if (firstgroupname) {
     firstgroup = group->find(firstgroupname);
     if (firstgroup < 0)
       error->all(FLERR,"Could not find atom_modify first group ID");
   }
 
   // let the AtomVec initialize itself
 
   avec->init();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Atom::setup()
 {
   // sorting bins are only needed when sorting is enabled
   // done here rather than in init() because it uses the neighbor cutoff
 
   if (sortfreq <= 0) return;
   setup_sort_bins();
 }
 
 /* ----------------------------------------------------------------------
    return ptr to AtomVec class if matches style or to matching hybrid sub-class
    return NULL if no match
 ------------------------------------------------------------------------- */
 
 AtomVec *Atom::style_match(const char *style)
 {
   // exact match with the current atom style
 
   if (strcmp(atom_style,style) == 0) return avec;
 
   // otherwise only a hybrid style can contain a match
 
   if (strcmp(atom_style,"hybrid") != 0) return NULL;
 
   AtomVecHybrid *hybrid = (AtomVecHybrid *) avec;
   for (int m = 0; m < hybrid->nstyles; m++) {
     if (strcmp(hybrid->keywords[m],style) == 0) return hybrid->styles[m];
   }
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    modify parameters of the atom style
    some options can only be invoked before simulation box is defined
    first and sort options cannot be used together
 ------------------------------------------------------------------------- */
 
 void Atom::modify_params(int narg, char **arg)
 {
   // parse keyword/value pairs of the atom_modify command
   //   id yes|no        enable (1) or disable (0) atom IDs
   //   map array|hash   choose atom map style
   //   first group|all  set or clear the first-group name, disables sorting
   //   sort Nfreq bin   set sorting frequency and bin size
   // id and map are rejected once the simulation box exists
   // any malformed or unknown keyword is an error
 
   if (narg == 0) error->all(FLERR,"Illegal atom_modify command");
 
   int iarg = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"id") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal atom_modify command");
       if (domain->box_exist)
         error->all(FLERR,
                    "Atom_modify id command after simulation box is defined");
       // "no" must store 0: other code tests tag_enable == 0 for disabled IDs
       if (strcmp(arg[iarg+1],"yes") == 0) tag_enable = 1;
       else if (strcmp(arg[iarg+1],"no") == 0) tag_enable = 0;
       else error->all(FLERR,"Illegal atom_modify command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"map") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal atom_modify command");
       if (domain->box_exist)
         error->all(FLERR,
                    "Atom_modify map command after simulation box is defined");
       if (strcmp(arg[iarg+1],"array") == 0) map_user = 1;
       else if (strcmp(arg[iarg+1],"hash") == 0) map_user = 2;
       else error->all(FLERR,"Illegal atom_modify command");
       map_style = map_user;
       iarg += 2;
     } else if (strcmp(arg[iarg],"first") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal atom_modify command");
       // always release a previously stored name so it cannot leak
       delete [] firstgroupname;
       firstgroupname = NULL;
       if (strcmp(arg[iarg+1],"all") != 0) {
         int n = strlen(arg[iarg+1]) + 1;
         firstgroupname = new char[n];
         strcpy(firstgroupname,arg[iarg+1]);
         sortfreq = 0;
       }
       iarg += 2;
     } else if (strcmp(arg[iarg],"sort") == 0) {
       if (iarg+3 > narg) error->all(FLERR,"Illegal atom_modify command");
       sortfreq = force->inumeric(FLERR,arg[iarg+1]);
       userbinsize = force->numeric(FLERR,arg[iarg+2]);
       if (sortfreq < 0 || userbinsize < 0.0)
         error->all(FLERR,"Illegal atom_modify command");
       if (sortfreq >= 0 && firstgroupname)
         error->all(FLERR,"Atom_modify sort and first options "
                    "cannot be used together");
       iarg += 3;
     } else error->all(FLERR,"Illegal atom_modify command");
   }
 }
 
 /* ----------------------------------------------------------------------
    check that atom IDs are valid
    error if any atom ID < 0 or atom ID = MAXTAGINT
    if any atom ID > 0, error if any atom ID == 0
    if all atom IDs = 0, tag_enable must be 0
    OK if atom IDs > natoms
    NOTE: not checking that atom IDs are unique
 ------------------------------------------------------------------------- */
 
 void Atom::tag_check()
 {
   int nlocal = atom->nlocal;
   tagint *tag = atom->tag;
 
   tagint min = MAXTAGINT;
   tagint max = 0;
 
   for (int i = 0; i < nlocal; i++) {
     min = MIN(min,tag[i]);
     max = MAX(max,tag[i]);
   }
 
   tagint minall,maxall;
   MPI_Allreduce(&min,&minall,1,MPI_LMP_TAGINT,MPI_MIN,world);
   MPI_Allreduce(&max,&maxall,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
   if (minall < 0) error->all(FLERR,"Atom ID is negative");
   if (maxall >= MAXTAGINT) error->all(FLERR,"Atom ID is too big");
   if (maxall > 0 && minall == 0) error->all(FLERR,"Atom ID is zero");
   if (maxall == 0 && tag_enable && natoms) 
     error->all(FLERR,"Not all atom IDs are 0");
 }
 
 /* ----------------------------------------------------------------------
    add unique tags to any atoms with tag = 0
    new tags are grouped by proc and start after max current tag
    called after creating new atoms
    error if new tags will exceed MAXTAGINT
 ------------------------------------------------------------------------- */
 
 void Atom::tag_extend()
 {
   // maxtag_all = max tag for all atoms
 
   tagint maxtag = 0;
   for (int i = 0; i < nlocal; i++) maxtag = MAX(maxtag,tag[i]);
   tagint maxtag_all;
   MPI_Allreduce(&maxtag,&maxtag_all,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
   // DEBUG: useful for generating 64-bit IDs even for small systems
   // use only when LAMMPS is compiled with BIGBIG
 
   //maxtag_all += 1000000000000;
 
   // notag = # of atoms I own with no tag (tag = 0)
   // notag_sum = # of total atoms on procs <= me with no tag
 
   bigint notag = 0;
   for (int i = 0; i < nlocal; i++) if (tag[i] == 0) notag++;
 
   bigint notag_total;
   MPI_Allreduce(&notag,&notag_total,1,MPI_LMP_BIGINT,MPI_SUM,world);
   if (notag_total >= MAXTAGINT)
     error->all(FLERR,"New atom IDs exceed maximum allowed ID");
 
   bigint notag_sum;
   MPI_Scan(&notag,&notag_sum,1,MPI_LMP_BIGINT,MPI_SUM,world);
 
   // itag = 1st new tag that my untagged atoms should use
 
   tagint itag = maxtag_all + notag_sum - notag + 1;
   for (int i = 0; i < nlocal; i++) if (tag[i] == 0) tag[i] = itag++;
 }
 
 /* ----------------------------------------------------------------------
    check that atom IDs span range from 1 to Natoms inclusive
    return 0 if mintag != 1 or maxtag != Natoms
    return 1 if OK
    doesn't actually check if all tag values are used
 ------------------------------------------------------------------------- */
 
 int Atom::tag_consecutive()
 {
   tagint idmin = MAXTAGINT;
   tagint idmax = 0;
 
   for (int i = 0; i < nlocal; i++) {
     idmin = MIN(idmin,tag[i]);
     idmax = MAX(idmax,tag[i]);
   }
   tagint idminall,idmaxall;
   MPI_Allreduce(&idmin,&idminall,1,MPI_LMP_TAGINT,MPI_MIN,world);
   MPI_Allreduce(&idmax,&idmaxall,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
   if (idminall != 1 || idmaxall != natoms) return 0;
   return 1;
 }
 
 /* ----------------------------------------------------------------------
    count and return words in a single line
    make copy of line before using strtok so as not to change line
    trim anything from '#' onward
 ------------------------------------------------------------------------- */
 
 int Atom::count_words(const char *line)
 {
   // work on a scratch copy, since strtok() writes into its input
 
   int nchar = strlen(line) + 1;
   char *scratch;
   memory->create(scratch,nchar,"atom:copy");
   strcpy(scratch,line);
 
   // cut the copy off at the first '#', if there is one
 
   char *hash = strchr(scratch,'#');
   if (hash) *hash = '\0';
 
   // count whitespace-separated tokens
 
   int nwords = 0;
   char *word = strtok(scratch," \t\n\r\f");
   while (word) {
     nwords++;
     word = strtok(NULL," \t\n\r\f");
   }
 
   memory->destroy(scratch);
   return nwords;
 }
 
 /* ----------------------------------------------------------------------
    deallocate molecular topology arrays
    done before realloc with (possibly) new 2nd dimension set to
      correctly initialized per-atom values, e.g. bond_per_atom
    needs to be called whenever 2nd dimensions are changed
      and these arrays are already pre-allocated,
      e.g. due to grow(1) in create_avec()
 ------------------------------------------------------------------------- */
 
 void Atom::deallocate_topology()
 {
   // every array is freed and its pointer reset to NULL right away
 
   // bonds
 
   memory->destroy(atom->bond_type);
   atom->bond_type = NULL;
   memory->destroy(atom->bond_atom);
   atom->bond_atom = NULL;
 
   // angles
 
   memory->destroy(atom->angle_type);
   atom->angle_type = NULL;
   memory->destroy(atom->angle_atom1);
   atom->angle_atom1 = NULL;
   memory->destroy(atom->angle_atom2);
   atom->angle_atom2 = NULL;
   memory->destroy(atom->angle_atom3);
   atom->angle_atom3 = NULL;
 
   // dihedrals
 
   memory->destroy(atom->dihedral_type);
   atom->dihedral_type = NULL;
   memory->destroy(atom->dihedral_atom1);
   atom->dihedral_atom1 = NULL;
   memory->destroy(atom->dihedral_atom2);
   atom->dihedral_atom2 = NULL;
   memory->destroy(atom->dihedral_atom3);
   atom->dihedral_atom3 = NULL;
   memory->destroy(atom->dihedral_atom4);
   atom->dihedral_atom4 = NULL;
 
   // impropers
 
   memory->destroy(atom->improper_type);
   atom->improper_type = NULL;
   memory->destroy(atom->improper_atom1);
   atom->improper_atom1 = NULL;
   memory->destroy(atom->improper_atom2);
   atom->improper_atom2 = NULL;
   memory->destroy(atom->improper_atom3);
   atom->improper_atom3 = NULL;
   memory->destroy(atom->improper_atom4);
   atom->improper_atom4 = NULL;
 }
 
 /* ----------------------------------------------------------------------
    unpack n lines from Atom section of data file
    call style-specific routine to parse line
 ------------------------------------------------------------------------- */
 
 void Atom::data_atoms(int n, char *buf)
 {
   int m,xptr,iptr;
   imageint imagedata;
   double xdata[3],lamda[3];
   double *coord;
   char *next;
 
   // count the words on the first line only
   // the line is NUL-terminated for the count, then the newline is restored
   // every one of the n lines is then expected to hold nwords words
 
   next = strchr(buf,'\n');
   *next = '\0';
   int nwords = count_words(buf);
   *next = '\n';
 
   // a line holds either the style's fields or those fields plus 3 image flags
 
   if (nwords != avec->size_data_atom && nwords != avec->size_data_atom + 3)
     error->all(FLERR,"Incorrect atom format in data file");
 
   // values = pointers to the words of the current line, reused for each line
 
   char **values = new char*[nwords];
 
   // set bounds for my proc
   // if periodic and I am lo/hi proc, adjust bounds by EPSILON
   // ensures all data atoms will be owned even with round-off
 
   int triclinic = domain->triclinic;
 
   // epsilon is scaled by the box length for orthogonal boxes,
   // and used unscaled for triclinic boxes where bounds are in lamda coords
 
   double epsilon[3];
   if (triclinic) epsilon[0] = epsilon[1] = epsilon[2] = EPSILON;
   else {
     epsilon[0] = domain->prd[0] * EPSILON;
     epsilon[1] = domain->prd[1] * EPSILON;
     epsilon[2] = domain->prd[2] * EPSILON;
   }
 
   // local copy of my sub-domain bounds, so they can be widened below
 
   double sublo[3],subhi[3];
   if (triclinic == 0) {
     sublo[0] = domain->sublo[0]; subhi[0] = domain->subhi[0];
     sublo[1] = domain->sublo[1]; subhi[1] = domain->subhi[1];
     sublo[2] = domain->sublo[2]; subhi[2] = domain->subhi[2];
   } else {
     sublo[0] = domain->sublo_lamda[0]; subhi[0] = domain->subhi_lamda[0];
     sublo[1] = domain->sublo_lamda[1]; subhi[1] = domain->subhi_lamda[1];
     sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2];
   }
 
   // non-tiled layout: lo/hi proc is identified by its grid location
   // tiled layout: lo/hi proc is identified by a split fraction of 0.0 or 1.0
 
   if (comm->layout != LAYOUT_TILED) {
     if (domain->xperiodic) {
       if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
       if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
     }
     if (domain->yperiodic) {
       if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
       if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
     }
     if (domain->zperiodic) {
       if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
       if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
     }
 
   } else {
     if (domain->xperiodic) {
       if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
       if (comm->mysplit[0][1] == 1.0) subhi[0] += epsilon[0];
     }
     if (domain->yperiodic) {
       if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
       if (comm->mysplit[1][1] == 1.0) subhi[1] += epsilon[1];
     }
     if (domain->zperiodic) {
       if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
       if (comm->mysplit[2][1] == 1.0) subhi[2] += epsilon[2];
     }
   }
 
   // xptr = which word in line starts xyz coords
   // iptr = which word in line starts ix,iy,iz image flags
   // iptr is only set, and only used, when image flags are present
 
   xptr = avec->xcol_data - 1;
   int imageflag = 0;
   if (nwords > avec->size_data_atom) imageflag = 1;
   if (imageflag) iptr = nwords - 3;
 
   // loop over lines of atom data
   // tokenize the line into values
   // extract xyz coords and image flags
   // remap atom into simulation box
   // if atom is in my sub-domain, unpack its values
 
   for (int i = 0; i < n; i++) {
     // remember end of this line before strtok() overwrites delimiters
 
     next = strchr(buf,'\n');
 
     // note: '\n' is one of the delimiters and the line is not terminated here,
     // so tokenizing is not confined to the current line
 
     values[0] = strtok(buf," \t\n\r\f");
     if (values[0] == NULL)
       error->all(FLERR,"Incorrect atom format in data file");
     for (m = 1; m < nwords; m++) {
       values[m] = strtok(NULL," \t\n\r\f");
       if (values[m] == NULL)
         error->all(FLERR,"Incorrect atom format in data file");
     }
 
     // pack the 3 image flags, each offset by IMGMAX, into one imageint
     // without image flags each of the 3 fields is set to IMGMAX
 
     if (imageflag)
       imagedata = ((imageint) (atoi(values[iptr]) + IMGMAX) & IMGMASK) |
         (((imageint) (atoi(values[iptr+1]) + IMGMAX) & IMGMASK) << IMGBITS) |
         (((imageint) (atoi(values[iptr+2]) + IMGMAX) & IMGMASK) << IMG2BITS);
     else imagedata = ((imageint) IMGMAX << IMG2BITS) |
            ((imageint) IMGMAX << IMGBITS) | IMGMAX;
 
     xdata[0] = atof(values[xptr]);
     xdata[1] = atof(values[xptr+1]);
     xdata[2] = atof(values[xptr+2]);
     domain->remap(xdata,imagedata);
 
     // ownership test uses lamda coords for triclinic, box coords otherwise
 
     if (triclinic) {
       domain->x2lamda(xdata,lamda);
       coord = lamda;
     } else coord = xdata;
 
     // lower bound is inclusive, upper bound is exclusive
     // the atom is always stored with its box coords xdata
 
     if (coord[0] >= sublo[0] && coord[0] < subhi[0] &&
         coord[1] >= sublo[1] && coord[1] < subhi[1] &&
         coord[2] >= sublo[2] && coord[2] < subhi[2])
       avec->data_atom(xdata,imagedata,values);
 
     // advance to the start of the next line
 
     buf = next + 1;
   }
 
   delete [] values;
 }
 
 /* ----------------------------------------------------------------------
    unpack n lines from Velocity section of data file
    check that atom IDs are > 0 and <= map_tag_max
    call style-specific routine to parse line
 ------------------------------------------------------------------------- */
 
 void Atom::data_vels(int n, char *buf)
 {
   int j,m;
   tagint tagdata;
   char *next;
 
   // count the words on the first line only
   // the line is NUL-terminated for the count, then the newline is restored
 
   next = strchr(buf,'\n');
   *next = '\0';
   int nwords = count_words(buf);
   *next = '\n';
 
   if (nwords != avec->size_data_vel)
     error->all(FLERR,"Incorrect velocity format in data file");
 
   // values = pointers to the words of the current line, reused for each line
 
   char **values = new char*[nwords];
 
   // loop over lines of atom velocities
   // tokenize the line into values
   // if I own atom tag, unpack its values
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
 
     // a missing word must be caught here, same as in data_atoms(),
     // since ATOTAGINT() and data_vel() would dereference a NULL pointer
 
     values[0] = strtok(buf," \t\n\r\f");
     if (values[0] == NULL)
       error->all(FLERR,"Incorrect velocity format in data file");
     for (j = 1; j < nwords; j++) {
       values[j] = strtok(NULL," \t\n\r\f");
       if (values[j] == NULL)
         error->all(FLERR,"Incorrect velocity format in data file");
     }
 
     // 1st word is the atom ID, remaining words go to the atom style
 
     tagdata = ATOTAGINT(values[0]);
     if (tagdata <= 0 || tagdata > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Velocities section of data file");
     if ((m = map(tagdata)) >= 0) avec->data_vel(m,&values[1]);
 
     // advance to the start of the next line
 
     buf = next + 1;
   }
 
   delete [] values;
 }
 
 /* ----------------------------------------------------------------------
    process N bonds read into buf from data files
    if count is non-NULL, just count bonds per atom
    else store them with atoms
    check that atom IDs are > 0 and <= map_tag_max
 ------------------------------------------------------------------------- */
 
 void Atom::data_bonds(int n, char *buf, int *count)
 {
   int m,tmp,itype;
   tagint atom1,atom2;
   char *next;
   int newton_bond = force->newton_bond;
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
     sscanf(buf,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT,
            &tmp,&itype,&atom1,&atom2);
     if (atom1 <= 0 || atom1 > map_tag_max ||
         atom2 <= 0 || atom2 > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Bonds section of data file");
     if (itype <= 0 || itype > nbondtypes)
       error->one(FLERR,"Invalid bond type in Bonds section of data file");
     if ((m = map(atom1)) >= 0) {
       if (count) count[m]++;
       else {
         bond_type[m][num_bond[m]] = itype;
         bond_atom[m][num_bond[m]] = atom2;
         num_bond[m]++;
       }
     }
     if (newton_bond == 0) {
       if ((m = map(atom2)) >= 0) {
         if (count) count[m]++;
         else {
           bond_type[m][num_bond[m]] = itype;
           bond_atom[m][num_bond[m]] = atom1;
           num_bond[m]++;
         }
       }
     }
     buf = next + 1;
   }
 }
 
 /* ----------------------------------------------------------------------
    process N angles read into buf from data files
    if count is non-NULL, just count angles per atom
    else store them with atoms
    check that atom IDs are > 0 and <= map_tag_max
 ------------------------------------------------------------------------- */
 
 void Atom::data_angles(int n, char *buf, int *count)
 {
   int m,tmp,itype;
   tagint atom1,atom2,atom3;
   char *next;
   int newton_bond = force->newton_bond;
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
     sscanf(buf,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT,
            &tmp,&itype,&atom1,&atom2,&atom3);
     if (atom1 <= 0 || atom1 > map_tag_max ||
         atom2 <= 0 || atom2 > map_tag_max ||
         atom3 <= 0 || atom3 > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Angles section of data file");
     if (itype <= 0 || itype > nangletypes)
       error->one(FLERR,"Invalid angle type in Angles section of data file");
     if ((m = map(atom2)) >= 0) {
       if (count) count[m]++;
       else {
         angle_type[m][num_angle[m]] = itype;
         angle_atom1[m][num_angle[m]] = atom1;
         angle_atom2[m][num_angle[m]] = atom2;
         angle_atom3[m][num_angle[m]] = atom3;
         num_angle[m]++;
       }
     }
     if (newton_bond == 0) {
       if ((m = map(atom1)) >= 0) {
         if (count) count[m]++;
         else {
           angle_type[m][num_angle[m]] = itype;
           angle_atom1[m][num_angle[m]] = atom1;
           angle_atom2[m][num_angle[m]] = atom2;
           angle_atom3[m][num_angle[m]] = atom3;
           num_angle[m]++;
         }
       }
       if ((m = map(atom3)) >= 0) {
         if (count) count[m]++;
         else {
           angle_type[m][num_angle[m]] = itype;
           angle_atom1[m][num_angle[m]] = atom1;
           angle_atom2[m][num_angle[m]] = atom2;
           angle_atom3[m][num_angle[m]] = atom3;
           num_angle[m]++;
         }
       }
     }
     buf = next + 1;
   }
 }
 
 /* ----------------------------------------------------------------------
    process N dihedrals read into buf from data files
    if count is non-NULL, just count dihedrals per atom
    else store them with atoms
    check that atom IDs are > 0 and <= map_tag_max
 ------------------------------------------------------------------------- */
 
 void Atom::data_dihedrals(int n, char *buf, int *count)
 {
   int m,tmp,itype;
   tagint atom1,atom2,atom3,atom4;
   char *next;
   int newton_bond = force->newton_bond;
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
     sscanf(buf,"%d %d " 
            TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT,
            &tmp,&itype,&atom1,&atom2,&atom3,&atom4);
     if (atom1 <= 0 || atom1 > map_tag_max ||
         atom2 <= 0 || atom2 > map_tag_max ||
         atom3 <= 0 || atom3 > map_tag_max ||
         atom4 <= 0 || atom4 > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Dihedrals section of data file");
     if (itype <= 0 || itype > ndihedraltypes)
       error->one(FLERR,
                  "Invalid dihedral type in Dihedrals section of data file");
     if ((m = map(atom2)) >= 0) {
       if (count) count[m]++;
       else {
         dihedral_type[m][num_dihedral[m]] = itype;
         dihedral_atom1[m][num_dihedral[m]] = atom1;
         dihedral_atom2[m][num_dihedral[m]] = atom2;
         dihedral_atom3[m][num_dihedral[m]] = atom3;
         dihedral_atom4[m][num_dihedral[m]] = atom4;
         num_dihedral[m]++;
       }
     }
     if (newton_bond == 0) {
       if ((m = map(atom1)) >= 0) {
         if (count) count[m]++;
         else {
           dihedral_type[m][num_dihedral[m]] = itype;
           dihedral_atom1[m][num_dihedral[m]] = atom1;
           dihedral_atom2[m][num_dihedral[m]] = atom2;
           dihedral_atom3[m][num_dihedral[m]] = atom3;
           dihedral_atom4[m][num_dihedral[m]] = atom4;
           num_dihedral[m]++;
         }
       }
       if ((m = map(atom3)) >= 0) {
         if (count) count[m]++;
         else {
           dihedral_type[m][num_dihedral[m]] = itype;
           dihedral_atom1[m][num_dihedral[m]] = atom1;
           dihedral_atom2[m][num_dihedral[m]] = atom2;
           dihedral_atom3[m][num_dihedral[m]] = atom3;
           dihedral_atom4[m][num_dihedral[m]] = atom4;
           num_dihedral[m]++;
         }
       }
       if ((m = map(atom4)) >= 0) {
         if (count) count[m]++;
         else {
           dihedral_type[m][num_dihedral[m]] = itype;
           dihedral_atom1[m][num_dihedral[m]] = atom1;
           dihedral_atom2[m][num_dihedral[m]] = atom2;
           dihedral_atom3[m][num_dihedral[m]] = atom3;
           dihedral_atom4[m][num_dihedral[m]] = atom4;
           num_dihedral[m]++;
         }
       }
     }
     buf = next + 1;
   }
 }
 
 /* ----------------------------------------------------------------------
    process N impropers read into buf from data files
    if count is non-NULL, just count impropers per atom
    else store them with atoms
    check that atom IDs are > 0 and <= map_tag_max
 ------------------------------------------------------------------------- */
 
 void Atom::data_impropers(int n, char *buf, int *count)
 {
   int m,tmp,itype;
   tagint atom1,atom2,atom3,atom4;
   char *next;
   int newton_bond = force->newton_bond;
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
     *next = '\0';
     sscanf(buf,"%d %d " 
            TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT,
            &tmp,&itype,&atom1,&atom2,&atom3,&atom4);
     if (atom1 <= 0 || atom1 > map_tag_max ||
         atom2 <= 0 || atom2 > map_tag_max ||
         atom3 <= 0 || atom3 > map_tag_max ||
         atom4 <= 0 || atom4 > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Impropers section of data file");
     if (itype <= 0 || itype > nimpropertypes)
       error->one(FLERR,
                  "Invalid improper type in Impropers section of data file");
     if ((m = map(atom2)) >= 0) {
       if (count) count[m]++;
       else {
         improper_type[m][num_improper[m]] = itype;
         improper_atom1[m][num_improper[m]] = atom1;
         improper_atom2[m][num_improper[m]] = atom2;
         improper_atom3[m][num_improper[m]] = atom3;
         improper_atom4[m][num_improper[m]] = atom4;
         num_improper[m]++;
       }
     }
     if (newton_bond == 0) {
       if ((m = map(atom1)) >= 0) {
         if (count) count[m]++;
         else {
           improper_type[m][num_improper[m]] = itype;
           improper_atom1[m][num_improper[m]] = atom1;
           improper_atom2[m][num_improper[m]] = atom2;
           improper_atom3[m][num_improper[m]] = atom3;
           improper_atom4[m][num_improper[m]] = atom4;
           num_improper[m]++;
         }
       }
       if ((m = map(atom3)) >= 0) {
         if (count) count[m]++;
         else {
           improper_type[m][num_improper[m]] = itype;
           improper_atom1[m][num_improper[m]] = atom1;
           improper_atom2[m][num_improper[m]] = atom2;
           improper_atom3[m][num_improper[m]] = atom3;
           improper_atom4[m][num_improper[m]] = atom4;
           num_improper[m]++;
         }
       }
       if ((m = map(atom4)) >= 0) {
         if (count) count[m]++;
         else {
           improper_type[m][num_improper[m]] = itype;
           improper_atom1[m][num_improper[m]] = atom1;
           improper_atom2[m][num_improper[m]] = atom2;
           improper_atom3[m][num_improper[m]] = atom3;
           improper_atom4[m][num_improper[m]] = atom4;
           num_improper[m]++;
         }
       }
     }
     buf = next + 1;
   }
 }
 
 /* ----------------------------------------------------------------------
    unpack n lines from atom-style specific section of data file
    check that atom IDs are > 0 and <= map_tag_max
    call style-specific routine to parse line
 ------------------------------------------------------------------------- */
 
 void Atom::data_bonus(int n, char *buf, AtomVec *avec_bonus)
 {
   int j,m,tagdata;
   char *next;
 
   next = strchr(buf,'\n');
   *next = '\0';
   int nwords = count_words(buf);
   *next = '\n';
 
   if (nwords != avec_bonus->size_data_bonus)
     error->all(FLERR,"Incorrect bonus data format in data file");
 
   char **values = new char*[nwords];
 
   // loop over lines of bonus atom data
   // tokenize the line into values
   // if I own atom tag, unpack its values
 
   for (int i = 0; i < n; i++) {
     next = strchr(buf,'\n');
 
     values[0] = strtok(buf," \t\n\r\f");
     for (j = 1; j < nwords; j++)
       values[j] = strtok(NULL," \t\n\r\f");
 
     tagdata = ATOTAGINT(values[0]);
     if (tagdata <= 0 || tagdata > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Bonus section of data file");
 
     // ok to call child's data_atom_bonus() method thru parent avec_bonus,
     // since data_bonus() was called with child ptr, and method is virtual
 
     if ((m = map(tagdata)) >= 0) avec_bonus->data_atom_bonus(m,&values[1]);
 
     buf = next + 1;
   }
 
   delete [] values;
 }
 
 /* ----------------------------------------------------------------------
    unpack n lines from atom-style specific section of data file
    check that atom IDs are > 0 and <= map_tag_max
    call style-specific routine to parse line
 ------------------------------------------------------------------------- */
 
 void Atom::data_bodies(int n, char *buf, AtomVecBody *avec_body)
 {
   int j,m,tagdata,ninteger,ndouble;
 
   char **ivalues = new char*[10*MAXBODY];
   char **dvalues = new char*[10*MAXBODY];
 
   // loop over lines of body data
   // tokenize the lines into ivalues and dvalues
   // if I own atom tag, unpack its values
 
   for (int i = 0; i < n; i++) {
     if (i == 0) tagdata = ATOTAGINT(strtok(buf," \t\n\r\f"));
     else tagdata = ATOTAGINT(strtok(NULL," \t\n\r\f"));
     ninteger = atoi(strtok(NULL," \t\n\r\f"));
     ndouble = atoi(strtok(NULL," \t\n\r\f"));
 
     for (j = 0; j < ninteger; j++)
       ivalues[j] = strtok(NULL," \t\n\r\f");
     for (j = 0; j < ndouble; j++)
       dvalues[j] = strtok(NULL," \t\n\r\f");
 
     if (tagdata <= 0 || tagdata > map_tag_max)
       error->one(FLERR,"Invalid atom ID in Bodies section of data file");
 
     if ((m = map(tagdata)) >= 0)
       avec_body->data_body(m,ninteger,ndouble,ivalues,dvalues);
   }
 
   delete [] ivalues;
   delete [] dvalues;
 }
 
 /* ----------------------------------------------------------------------
    allocate arrays of length ntypes
    only done after ntypes is set
 ------------------------------------------------------------------------- */
 
 void Atom::allocate_type_arrays()
 {
   // per-type masses are only allocated when the atom style sets mass_type
 
   if (!avec->mass_type) return;
 
   // types are numbered from 1, so index 0 is allocated but not initialized
 
   const int nslots = ntypes + 1;
   mass = new double[nslots];
   mass_setflag = new int[nslots];
   for (int t = 1; t < nslots; t++) mass_setflag[t] = 0;
 }
 
 /* ----------------------------------------------------------------------
    set a mass and flag it as set
    called from reading of data file
 ------------------------------------------------------------------------- */
 
 void Atom::set_mass(const char *str)
 {
   if (mass == NULL) error->all(FLERR,"Cannot set mass for this atom style");
 
   int itype;
   double mass_one;
   int n = sscanf(str,"%d %lg",&itype,&mass_one);
   if (n != 2) error->all(FLERR,"Invalid mass line in data file");
 
   if (itype < 1 || itype > ntypes)
     error->all(FLERR,"Invalid type for mass set");
 
   mass[itype] = mass_one;
   mass_setflag[itype] = 1;
 
   if (mass[itype] <= 0.0) error->all(FLERR,"Invalid mass value");
 }
 
 /* ----------------------------------------------------------------------
    set a mass and flag it as set
    called from EAM pair routine
 ------------------------------------------------------------------------- */
 
 void Atom::set_mass(int itype, double value)
 {
   if (mass == NULL) error->all(FLERR,"Cannot set mass for this atom style");
   if (itype < 1 || itype > ntypes)
     error->all(FLERR,"Invalid type for mass set");
 
   mass[itype] = value;
   mass_setflag[itype] = 1;
 
   if (mass[itype] <= 0.0) error->all(FLERR,"Invalid mass value");
 }
 
 /* ----------------------------------------------------------------------
    set one or more masses and flag them as set
    called from reading of input script
 ------------------------------------------------------------------------- */
 
 void Atom::set_mass(int narg, char **arg)
 {
   if (mass == NULL) error->all(FLERR,"Cannot set mass for this atom style");
 
   int lo,hi;
   force->bounds(arg[0],ntypes,lo,hi);
   if (lo < 1 || hi > ntypes) error->all(FLERR,"Invalid type for mass set");
 
   for (int itype = lo; itype <= hi; itype++) {
     mass[itype] = atof(arg[1]);
     mass_setflag[itype] = 1;
 
     if (mass[itype] <= 0.0) error->all(FLERR,"Invalid mass value");
   }
 }
 
 /* ----------------------------------------------------------------------
    set all masses as read in from restart file
 ------------------------------------------------------------------------- */
 
 void Atom::set_mass(double *values)
 {
   // values is indexed by atom type, entries 1 to ntypes are copied
   // no check of the values is made here
 
   int itype = 1;
   while (itype <= ntypes) {
     mass_setflag[itype] = 1;
     mass[itype] = values[itype];
     itype++;
   }
 }
 
 /* ----------------------------------------------------------------------
    check that all masses have been set
 ------------------------------------------------------------------------- */
 
 void Atom::check_mass()
 {
   if (mass == NULL) return;
   for (int itype = 1; itype <= ntypes; itype++)
     if (mass_setflag[itype] == 0) error->all(FLERR,"All masses are not set");
 }
 
 /* ----------------------------------------------------------------------
    check that radii of all particles of itype are the same
    return 1 if true, else return 0
    also return the radius value for that type
 ------------------------------------------------------------------------- */
 
 int Atom::radius_consistency(int itype, double &rad)
 {
   // first = radius of the 1st owned atom of itype, negative until one is found
   // mismatch = 1 if another owned atom of itype has a different radius
 
   double first = -1.0;
   int mismatch = 0;
 
   for (int i = 0; i < nlocal; i++) {
     if (type[i] == itype) {
       if (first < 0.0) first = radius[i];
       else if (first != radius[i]) mismatch = 1;
     }
   }
 
   // any mismatch on any proc means the radii are not consistent
   // note: radii are only compared within each proc, not between procs
 
   int mismatch_all;
   MPI_Allreduce(&mismatch,&mismatch_all,1,MPI_INT,MPI_SUM,world);
   if (mismatch_all) return 0;
 
   // rad = largest per-proc value, procs without atoms of itype hold -1.0
 
   MPI_Allreduce(&first,&rad,1,MPI_DOUBLE,MPI_MAX,world);
   return 1;
 }
 
 /* ----------------------------------------------------------------------
    check that shape of all particles of itype are the same
    return 1 if true, else return 0
    also return the 3 shape params for itype
 ------------------------------------------------------------------------- */
 
 int Atom::shape_consistency(int itype,
                             double &shapex, double &shapey, double &shapez)
 {
   // noshape = shape used for atoms of itype with no ellipsoid bonus entry
   // ref = shape of the 1st owned atom of itype, ref[0] < 0 until one is found
 
   double noshape[3] = {0.0, 0.0, 0.0};
   double ref[3] = {-1.0, -1.0, -1.0};
 
   AtomVecEllipsoid *avec_ellipsoid =
     (AtomVecEllipsoid *) style_match("ellipsoid");
   AtomVecEllipsoid::Bonus *bonus = avec_ellipsoid->bonus;
 
   // mismatch = 1 if another owned atom of itype has a different shape
 
   int mismatch = 0;
   for (int i = 0; i < nlocal; i++) {
     if (type[i] != itype) continue;
 
     double *shape;
     if (ellipsoid[i] >= 0) shape = bonus[ellipsoid[i]].shape;
     else shape = noshape;
 
     if (ref[0] < 0.0) {
       for (int k = 0; k < 3; k++) ref[k] = shape[k];
     } else if (ref[0] != shape[0] || ref[1] != shape[1] || ref[2] != shape[2])
       mismatch = 1;
   }
 
   // any mismatch on any proc means the shapes are not consistent
   // note: shapes are only compared within each proc, not between procs
 
   int mismatch_all;
   MPI_Allreduce(&mismatch,&mismatch_all,1,MPI_INT,MPI_SUM,world);
   if (mismatch_all) return 0;
 
   // returned shape = per-component max of the per-proc reference shapes
 
   double ref_all[3];
   MPI_Allreduce(ref,ref_all,3,MPI_DOUBLE,MPI_MAX,world);
   shapex = ref_all[0];
   shapey = ref_all[1];
   shapez = ref_all[2];
   return 1;
 }
 
 /* ----------------------------------------------------------------------
    add a new molecule template = set of molecules
 ------------------------------------------------------------------------- */
 
 void Atom::add_molecule(int narg, char **arg)
 {
   // arg[0] = template ID, arg[1] to arg[narg-1] = one molecule each
 
   if (narg < 2) error->all(FLERR,"Illegal molecule command");
   if (find_molecule(arg[0]) >= 0) 
     error->all(FLERR,"Reuse of molecule template ID");
 
   // grow the list by one entry per new molecule
 
   const int nadd = narg-1;
   int islot = nmolecule;
   nmolecule += nadd;
   molecules = (Molecule **)
     memory->srealloc(molecules,nmolecule*sizeof(Molecule *),"atom::molecules");
 
   // only the 1st molecule of the set records the size of the set
 
   for (int iarg = 1; iarg < narg; iarg++, islot++) {
     molecules[islot] = new Molecule(lmp,arg[0],arg[iarg]);
     molecules[islot]->nset = (iarg == 1) ? nadd : 0;
   }
 }
 
 /* ----------------------------------------------------------------------
    find first molecule in set with template ID
    return -1 if does not exist
 ------------------------------------------------------------------------- */
 
 int Atom::find_molecule(char *id)
 {
   // linear search, the lowest matching index is returned
 
   for (int imol = 0; imol < nmolecule; imol++) {
     if (strcmp(id,molecules[imol]->id) != 0) continue;
     return imol;
   }
   return -1;
 }
 
 /* ----------------------------------------------------------------------
    add info to current atom ilocal from molecule template onemol and its iatom
    offset = atom ID preceding IDs of atoms in this molecule
    called by fixes and commands that add molecules
 ------------------------------------------------------------------------- */
 
 void Atom::add_molecule_atom(Molecule *onemol, int iatom,
                              int ilocal, tagint offset)
 {
   // per-atom scalars, copied only if both template and atom style have them
 
   if (onemol->qflag && q_flag) q[ilocal] = onemol->q[iatom];
   if (onemol->radiusflag && radius_flag) radius[ilocal] = onemol->radius[iatom];
 
   // rmass comes from the template if it has one,
   // else it is set to 4/3 pi r^3 from the atom's radius
 
   if (rmass_flag) {
     if (onemol->rmassflag) rmass[ilocal] = onemol->rmass[iatom];
     else {
       const double r = radius[ilocal];
       rmass[ilocal] = 4.0*MY_PI/3.0 * r*r*r;
     }
   }
 
   if (molecular != 1) return;
 
   // add bond topology info
   // for molecular atom styles, but not atom style template
   // atom IDs from the template are shifted by offset
 
   if (avec->bonds_allow) {
     const int nbond = num_bond[ilocal] = onemol->num_bond[iatom];
     for (int k = 0; k < nbond; k++) {
       bond_type[ilocal][k] = onemol->bond_type[iatom][k];
       bond_atom[ilocal][k] = onemol->bond_atom[iatom][k] + offset;
     }
   }
 
   if (avec->angles_allow) {
     const int nangle = num_angle[ilocal] = onemol->num_angle[iatom];
     for (int k = 0; k < nangle; k++) {
       angle_type[ilocal][k] = onemol->angle_type[iatom][k];
       angle_atom1[ilocal][k] = onemol->angle_atom1[iatom][k] + offset;
       angle_atom2[ilocal][k] = onemol->angle_atom2[iatom][k] + offset;
       angle_atom3[ilocal][k] = onemol->angle_atom3[iatom][k] + offset;
     }
   }
 
   if (avec->dihedrals_allow) {
     const int ndihedral = num_dihedral[ilocal] = onemol->num_dihedral[iatom];
     for (int k = 0; k < ndihedral; k++) {
       dihedral_type[ilocal][k] = onemol->dihedral_type[iatom][k];
       dihedral_atom1[ilocal][k] = onemol->dihedral_atom1[iatom][k] + offset;
       dihedral_atom2[ilocal][k] = onemol->dihedral_atom2[iatom][k] + offset;
       dihedral_atom3[ilocal][k] = onemol->dihedral_atom3[iatom][k] + offset;
       dihedral_atom4[ilocal][k] = onemol->dihedral_atom4[iatom][k] + offset;
     }
   }
 
   if (avec->impropers_allow) {
     const int nimproper = num_improper[ilocal] = onemol->num_improper[iatom];
     for (int k = 0; k < nimproper; k++) {
       improper_type[ilocal][k] = onemol->improper_type[iatom][k];
       improper_atom1[ilocal][k] = onemol->improper_atom1[iatom][k] + offset;
       improper_atom2[ilocal][k] = onemol->improper_atom2[iatom][k] + offset;
       improper_atom3[ilocal][k] = onemol->improper_atom3[iatom][k] + offset;
       improper_atom4[ilocal][k] = onemol->improper_atom4[iatom][k] + offset;
     }
   }
 
   // special neighbors: the 3 counts are copied as is,
   // the list has nspecial[][2] entries, each shifted by offset
 
   if (onemol->specialflag) {
     for (int k = 0; k < 3; k++)
       nspecial[ilocal][k] = onemol->nspecial[iatom][k];
     const int nlist = nspecial[ilocal][2];
     for (int k = 0; k < nlist; k++)
       special[ilocal][k] = onemol->special[iatom][k] + offset;
   }
 }
 
 /* ----------------------------------------------------------------------
    reorder owned atoms so those in firstgroup appear first
    called by comm->exchange() if atom_modify first group is set
    only owned atoms exist at this point, no ghost atoms
 ------------------------------------------------------------------------- */
 
 void Atom::first_reorder()
 {
   // ensure there is one extra atom location at end of arrays for swaps
   // location nlocal is the temporary used by the copy() calls below
 
   if (nlocal == nmax) avec->grow(0);
 
   // loop over owned atoms
   // nfirst = index of first atom not in firstgroup
   // when find firstgroup atom out of place, swap it with atom nfirst
 
   int bitmask = group->bitmask[firstgroup];
   nfirst = 0;
   while (nfirst < nlocal && mask[nfirst] & bitmask) nfirst++;
 
   for (int i = 0; i < nlocal; i++) {
     if (mask[i] & bitmask && i > nfirst) {
 
       // swap atoms i and nfirst: i -> temporary, nfirst -> i, temporary -> nfirst
 
       avec->copy(i,nlocal,0);
       avec->copy(nfirst,i,0);
       avec->copy(nlocal,nfirst,0);
 
       // advance nfirst past all atoms that are now in firstgroup
 
       while (nfirst < nlocal && mask[nfirst] & bitmask) nfirst++;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    perform spatial sort of atoms within my sub-domain
    always called between comm->exchange() and comm->borders()
    don't have to worry about clearing/setting atom->map since done in comm
 ------------------------------------------------------------------------- */
 
 void Atom::sort()
 {
   int i,m,n,ix,iy,iz,ibin,empty;
 
   // set next timestep for sorting to take place
   // = next multiple of sortfreq after the current timestep
 
   nextsort = (update->ntimestep/sortfreq)*sortfreq + sortfreq;
 
   // download data from GPU if necessary
 
   if (lmp->cuda && !lmp->cuda->oncpu) lmp->cuda->downloadAll();
 
   // re-setup sort bins if needed
   // with a single bin there is nothing to reorder
   // note: this return is taken after the download above
   //   and before the upload at the end of this method
 
   if (domain->box_change) setup_sort_bins();
   if (nbins == 1) return;
 
   // reallocate per-atom vectors if needed
   // next and permute are sized to nmax, not nlocal
 
   if (nlocal > maxnext) {
     memory->destroy(next);
     memory->destroy(permute);
     maxnext = atom->nmax;
     memory->create(next,maxnext,"atom:next");
     memory->create(permute,maxnext,"atom:permute");
   }
 
   // ensure there is one extra atom location at end of arrays for swaps
 
   if (nlocal == nmax) avec->grow(0);
 
   // bin atoms in reverse order so linked list will be in forward order
   // binhead[ibin] = 1st atom in bin, next[i] = atom after i in same bin
   // -1 marks an empty bin or the end of a list
 
   for (i = 0; i < nbins; i++) binhead[i] = -1;
 
   for (i = nlocal-1; i >= 0; i--) {
     ix = static_cast<int> ((x[i][0]-bboxlo[0])*bininvx);
     iy = static_cast<int> ((x[i][1]-bboxlo[1])*bininvy);
     iz = static_cast<int> ((x[i][2]-bboxlo[2])*bininvz);
 
     // clamp bin indices to 0 to nbin-1 in each dimension
 
     ix = MAX(ix,0);
     iy = MAX(iy,0);
     iz = MAX(iz,0);
     ix = MIN(ix,nbinx-1);
     iy = MIN(iy,nbiny-1);
     iz = MIN(iz,nbinz-1);
 
     // ibin = single bin index, x varies fastest, then y, then z
 
     ibin = iz*nbiny*nbinx + iy*nbinx + ix;
     next[i] = binhead[ibin];
     binhead[ibin] = i;
   }
 
   // permute = desired permutation of atoms
   // permute[I] = J means Ith new atom will be Jth old atom
   // built by walking the bins in order and each bin's list in order
 
   n = 0;
   for (m = 0; m < nbins; m++) {
     i = binhead[m];
     while (i >= 0) {
       permute[n++] = i;
       i = next[i];
     }
   }
 
   // current = current permutation, just reuse next vector
   // current[I] = J means Ith current atom is Jth old atom
   // the linked lists stored in next are no longer needed at this point
 
   int *current = next;
   for (i = 0; i < nlocal; i++) current[i] = i;
 
   // reorder local atom list, when done, current = permute
   // perform "in place" using copy() to extra atom location at end of list
   // inner while loop processes one cycle of the permutation
   // copy before inner-loop moves an atom to end of atom list
   // copy after inner-loop moves atom at end of list back into list
   // empty = location in atom list that is currently empty
 
   for (i = 0; i < nlocal; i++) {
     if (current[i] == permute[i]) continue;
     avec->copy(i,nlocal,0);
     empty = i;
     while (permute[empty] != i) {
       avec->copy(permute[empty],empty,0);
       empty = current[empty] = permute[empty];
     }
     avec->copy(nlocal,empty,0);
     current[empty] = permute[empty];
   }
 
   // upload data back to GPU if necessary
 
   if (lmp->cuda && !lmp->cuda->oncpu) lmp->cuda->uploadAll();
 
   // sanity check that current = permute
 
   //int flag = 0;
   //for (i = 0; i < nlocal; i++)
   //  if (current[i] != permute[i]) flag = 1;
   //int flagall;
   //MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
   //if (flagall) error->all(FLERR,"Atom sort did not operate correctly");
 }
 
 /* ----------------------------------------------------------------------
    set up the bins used for spatial sorting of atoms
    picks a bin size, bins the bounding box of my sub-domain,
    and (re)allocates the per-bin list heads if more bins are needed
 ------------------------------------------------------------------------- */
 
 void Atom::setup_sort_bins()
 {
   // binsize selection:
   //   user setting wins if explicitly set
   //   non-CUDA = 1/2 of neighbor cutoff
   //   CUDA = derived from CUDA_CHUNK, natoms, and box volume (3d) or area (2d)
   // a resulting binsize of 0.0 (e.g. neighbor cutoff = 0.0) is an error
 
   double binsize;
   if (userbinsize > 0.0) {
     binsize = userbinsize;
   } else if (lmp->cuda) {
     const int threed = (domain->dimension == 3);
     double extent = (domain->boxhi[0]-domain->boxlo[0]) *
       (domain->boxhi[1]-domain->boxlo[1]);
     if (threed) extent *= (domain->boxhi[2]-domain->boxlo[2]);
     binsize = pow(1.0*CUDA_CHUNK/natoms*extent,threed ? 1.0/3.0 : 1.0/2.0);
   } else {
     binsize = 0.5 * neighbor->cutneighmax;
   }
 
   if (binsize == 0.0) error->all(FLERR,"Atom sorting has bin size = 0.0");
 
   const double bininv = 1.0/binsize;
 
   // bbox lo/hi = bounding box of my sub-domain
   // triclinic: computed by domain->bbox() from the lamda sub-domain bounds
   // orthogonal: the sub-domain bounds themselves
 
   if (domain->triclinic) {
     domain->bbox(domain->sublo_lamda,domain->subhi_lamda,bboxlo,bboxhi);
   } else {
     for (int d = 0; d < 3; d++) {
       bboxlo[d] = domain->sublo[d];
       bboxhi[d] = domain->subhi[d];
     }
   }
 
   const double xlen = bboxhi[0]-bboxlo[0];
   const double ylen = bboxhi[1]-bboxlo[1];
   const double zlen = bboxhi[2]-bboxlo[2];
 
   // nbin xyz = local bins, truncated to whole bins per dimension
   // 2d uses a single bin in z; a count of zero is bumped to one bin
 
   nbinx = static_cast<int> (xlen * bininv);
   nbiny = static_cast<int> (ylen * bininv);
   nbinz = static_cast<int> (zlen * bininv);
   if (domain->dimension == 2) nbinz = 1;
 
   if (!nbinx) nbinx = 1;
   if (!nbiny) nbiny = 1;
   if (!nbinz) nbinz = 1;
 
   // inverse of the actual bin sizes, which follow from the whole bin counts
 
   bininvx = nbinx / xlen;
   bininvy = nbiny / ylen;
   bininvz = nbinz / zlen;
 
   // product is formed in double so the check itself cannot overflow an int
 
   if (1.0*nbinx*nbiny*nbinz > INT_MAX)
     error->one(FLERR,"Too many atom sorting bins");
 
   nbins = nbinx*nbiny*nbinz;
 
   // reallocate per-bin memory if needed
 
   if (nbins > maxbin) {
     memory->destroy(binhead);
     maxbin = nbins;
     memory->create(binhead,maxbin,"atom:binhead");
   }
 }
 
 /* ----------------------------------------------------------------------
    register a callback to a fix so it can manage atom-based arrays
    happens when fix is created
    flag = 0 for grow, 1 for restart, 2 for border comm
    any other flag value registers nothing
 ------------------------------------------------------------------------- */
 
 void Atom::add_callback(int flag)
 {
   // find the fix
   // if find NULL ptr:
   //   it's this one, since it is being replaced and has just been deleted
   //   at this point in re-creation
   // if don't find NULL ptr:
   //   ifix is set to nfix = new one currently being added at end of list
 
   int ifix = 0;
   while (ifix < modify->nfix && modify->fix[ifix] != NULL) ifix++;
 
   // append ifix to the list selected by flag
   // a full list is first extended by DELTA entries
 
   switch (flag) {
   case 0:
     if (nextra_grow == nextra_grow_max) {
       nextra_grow_max += DELTA;
       memory->grow(extra_grow,nextra_grow_max,"atom:extra_grow");
     }
     extra_grow[nextra_grow++] = ifix;
     break;
 
   case 1:
     if (nextra_restart == nextra_restart_max) {
       nextra_restart_max += DELTA;
       memory->grow(extra_restart,nextra_restart_max,"atom:extra_restart");
     }
     extra_restart[nextra_restart++] = ifix;
     break;
 
   case 2:
     if (nextra_border == nextra_border_max) {
       nextra_border_max += DELTA;
       memory->grow(extra_border,nextra_border_max,"atom:extra_border");
     }
     extra_border[nextra_border++] = ifix;
     break;
 
   default:
     break;
   }
 }
 
 /* ----------------------------------------------------------------------
    unregister a callback to a fix
    happens when fix is deleted, called by its destructor
    flag = 0 for grow, 1 for restart, 2 for border comm
    no-op if id is NULL, flag is not 0/1/2,
      or the fix has no entry in the selected list
 ------------------------------------------------------------------------- */
 
 void Atom::delete_callback(const char *id, int flag)
 {
   // strcmp() on a NULL id is undefined, so there is nothing to look up
 
   if (id == NULL) return;
 
   // find index of the fix with this ID, ifix = nfix if there is none
   // NULL slots in the fix list are skipped rather than dereferenced
 
   int ifix;
   for (ifix = 0; ifix < modify->nfix; ifix++)
     if (modify->fix[ifix] && strcmp(id,modify->fix[ifix]->id) == 0) break;
 
   // select the callback list and its length counter
 
   int *list;
   int *count;
   if (flag == 0) {
     list = extra_grow;
     count = &nextra_grow;
   } else if (flag == 1) {
     list = extra_restart;
     count = &nextra_restart;
   } else if (flag == 2) {
     list = extra_border;
     count = &nextra_border;
   } else return;
 
   // locate the entry for this fix
   // if there is none, leave the list and its count untouched
   // decrementing anyway would drop an unrelated last entry
   //   or make the count negative
 
   int match;
   for (match = 0; match < *count; match++)
     if (list[match] == ifix) break;
   if (match == *count) return;
 
   // compact the list of callbacks
 
   for (int i = match; i < *count-1; i++)
     list[i] = list[i+1];
   (*count)--;
 }
 
 /* ----------------------------------------------------------------------
    shift callback entries that refer to fixes beyond the deleted ifix
    happens after fix is deleted
    every stored fix index larger than ifix is reduced by one,
      in all three callback lists (grow, restart, border)
 ------------------------------------------------------------------------- */
 
 void Atom::update_callback(int ifix)
 {
   int k;
 
   for (k = 0; k < nextra_grow; k++) {
     if (ifix < extra_grow[k]) --extra_grow[k];
   }
 
   for (k = 0; k < nextra_restart; k++) {
     if (ifix < extra_restart[k]) --extra_restart[k];
   }
 
   for (k = 0; k < nextra_border; k++) {
     if (ifix < extra_border[k]) --extra_border[k];
   }
 }
 
 /* ----------------------------------------------------------------------
    find custom per-atom vector with name
    int vectors are searched before double vectors
    return index if found, and flag = 0/1 for int/double
    return -1 if not found, flag is then left unchanged
 ------------------------------------------------------------------------- */
 
 int Atom::find_custom(char *name, int &flag)
 {
   int idx;
 
   // entries with a NULL name are skipped
 
   for (idx = 0; idx < nivector; idx++) {
     if (iname[idx] == NULL) continue;
     if (strcmp(iname[idx],name) != 0) continue;
     flag = 0;
     return idx;
   }
 
   for (idx = 0; idx < ndvector; idx++) {
     if (dname[idx] == NULL) continue;
     if (strcmp(dname[idx],name) != 0) continue;
     flag = 1;
     return idx;
   }
 
   return -1;
 }
 
 /* ----------------------------------------------------------------------
    add a custom variable with name of type flag = 0/1 for int/double
    assumes name does not already exist
    the name is copied, and the new vector is created with length nmax
    return index in ivector or dvector of its location
 ------------------------------------------------------------------------- */
 
 int Atom::add_custom(char *name, int flag)
 {
   // bytes needed for a private copy of name, including the terminator
 
   const int len = strlen(name) + 1;
 
   // double vector: anything other than flag = 0
 
   if (flag != 0) {
     const int slot = ndvector++;
     dname = (char **) memory->srealloc(dname,ndvector*sizeof(char *),
                                        "atom:dname");
     dname[slot] = new char[len];
     strcpy(dname[slot],name);
     dvector = (double **) memory->srealloc(dvector,ndvector*sizeof(double *),
                                            "atom:dvector");
     memory->create(dvector[slot],nmax,"atom:dvector");
     return slot;
   }
 
   // int vector
 
   const int slot = nivector++;
   iname = (char **) memory->srealloc(iname,nivector*sizeof(char *),
                                      "atom:iname");
   iname[slot] = new char[len];
   strcpy(iname[slot],name);
   ivector = (int **) memory->srealloc(ivector,nivector*sizeof(int *),
                                       "atom:ivector");
   memory->create(ivector[slot],nmax,"atom:ivector");
   return slot;
 }
 
 /* ----------------------------------------------------------------------
    remove a custom variable of type flag = 0/1 for int/double at index
    free memory for vector and name and set ptrs to NULL
    ivector/dvector and iname/dname lists never shrink,
      so the slot at index stays in place with NULL entries
 ------------------------------------------------------------------------- */
 
 void Atom::remove_custom(int flag, int index)
 {
   // double vector: anything other than flag = 0
 
   if (flag != 0) {
     memory->destroy(dvector[index]);
     dvector[index] = NULL;
     delete [] dname[index];
     dname[index] = NULL;
     return;
   }
 
   // int vector
 
   memory->destroy(ivector[index]);
   ivector[index] = NULL;
   delete [] iname[index];
   iname[index] = NULL;
 }
 
 /* ----------------------------------------------------------------------
    return a pointer to a named internal variable
    if don't recognize name, return NULL
    customize by adding a {name,pointer} line to the table
 ------------------------------------------------------------------------- */
 
 void *Atom::extract(char *name)
 {
   // table is built on each call so it holds the current array pointers
   // entries are checked in order, first matching name wins
 
   struct Entry {
     const char *key;
     void *ptr;
   };
 
   const Entry table[] = {
     {"mass",(void *) mass},
 
     {"id",(void *) tag},
     {"type",(void *) type},
     {"mask",(void *) mask},
     {"image",(void *) image},
     {"x",(void *) x},
     {"v",(void *) v},
     {"f",(void *) f},
     {"molecule",(void *) molecule},
     {"q",(void *) q},
     {"mu",(void *) mu},
     {"omega",(void *) omega},
     {"angmom",(void *) angmom},
     {"torque",(void *) torque},
     {"radius",(void *) radius},
     {"rmass",(void *) rmass},
     {"ellipsoid",(void *) ellipsoid},
     {"line",(void *) line},
     {"tri",(void *) tri},
 
     {"vfrac",(void *) vfrac},
     {"s0",(void *) s0},
     {"x0",(void *) x0},
 
     {"spin",(void *) spin},
     {"eradius",(void *) eradius},
     {"ervel",(void *) ervel},
     {"erforce",(void *) erforce},
     {"ervelforce",(void *) ervelforce},
     {"cs",(void *) cs},
     {"csforce",(void *) csforce},
     {"vforce",(void *) vforce},
     {"etag",(void *) etag},
 
     {"rho",(void *) rho},
     {"drho",(void *) drho},
     {"e",(void *) e},
     {"de",(void *) de},
     {"cv",(void *) cv},
     {"vest",(void *) vest}
   };
 
   const int nentry = sizeof(table)/sizeof(table[0]);
   for (int k = 0; k < nentry; k++) {
     if (strcmp(name,table[k].key) == 0) return table[k].ptr;
   }
 
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory
    call to avec tallies per-atom vectors
    add in sametag, global to local mapping, and sorting storage
 ------------------------------------------------------------------------- */
 
 bigint Atom::memory_usage()
 {
   // memstr exists only for the duration of the avec tally
   // NOTE(review): presumably filled via memcheck() by avec - confirm
 
   memlength = DELTA_MEMSTR;
   memory->create(memstr,memlength,"atom:memstr");
   memstr[0] = '\0';
   bigint bytes = avec->memory_usage();
   memory->destroy(memstr);
 
   bytes += sizeof(int)*max_same;
 
   // atom map: array style or hash style, nothing for any other style
 
   switch (map_style) {
   case 1:
     bytes += memory->usage(map_array,map_maxarray);
     break;
   case 2:
     bytes += sizeof(int)*map_nbucket;
     bytes += sizeof(HashElem)*map_nhash;
     break;
   default:
     break;
   }
 
   // sorting vectors, only counted if they have been sized
 
   if (maxnext != 0) {
     bytes += memory->usage(next,maxnext);
     bytes += memory->usage(permute,maxnext);
   }
 
   return bytes;
 }
 
 /* ----------------------------------------------------------------------
    accumulate per-atom vec names in memstr, padded by spaces
    return 1 if padded str is not already in memstr, else 0
    memstr is grown in DELTA_MEMSTR steps until the padded name fits
 ------------------------------------------------------------------------- */
 
 int Atom::memcheck(const char *str)
 {
   // padded = " str ", n counts both spaces and the terminator
   // the surrounding spaces keep strstr() from matching part of a longer name
 
   int n = strlen(str) + 3;
   char *padded = new char[n];
   strcpy(padded," ");
   strcat(padded,str);
   strcat(padded," ");
 
   if (strstr(memstr,padded)) {
     delete [] padded;
     return 0;
   }
 
   // needed = bytes memstr must hold after appending, terminator included
   // keep adding DELTA_MEMSTR until it fits: one step is not enough
   //   for a name longer than DELTA_MEMSTR, and strcat() would then overflow
   // done in int so the comparison with memlength is not signed vs unsigned
 
   int needed = strlen(memstr) + n;
   if (needed >= memlength) {
     while (needed >= memlength) memlength += DELTA_MEMSTR;
     memory->grow(memstr,memlength,"atom:memstr");
   }
 
   strcat(memstr,padded);
   delete [] padded;
   return 1;
 }
diff --git a/src/atom.h b/src/atom.h
index c6bebe88a..2f21fee72 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -1,468 +1,468 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_ATOM_H
 #define LMP_ATOM_H
 
 #include "pointers.h"
 
 namespace LAMMPS_NS {
 
 // Atom holds this processor's atom counts, per-atom and per-type arrays,
 // the style/existence flags for those arrays, the lists of fix callbacks,
 // the global-to-local ID map, and the state used for spatial sorting
 
 class Atom : protected Pointers {
  public:
   char *atom_style;
   class AtomVec *avec;
 
   // atom counts
 
   bigint natoms;                // total # of atoms in system, could be 0
                                 // natoms may not be current if atoms lost
   int nlocal,nghost;            // # of owned and ghost atoms on this proc
   int nmax;                     // max # of owned+ghost in arrays on this proc
   int tag_enable;               // 0/1 if atom ID tags are defined
   int molecular;                // 0 = atomic, 1 = standard molecular system,
                                 // 2 = molecule template system
 
   bigint nbonds,nangles,ndihedrals,nimpropers;
   int ntypes,nbondtypes,nangletypes,ndihedraltypes,nimpropertypes;
   int bond_per_atom,angle_per_atom,dihedral_per_atom,improper_per_atom;
   int extra_bond_per_atom,extra_angle_per_atom;
   int extra_dihedral_per_atom,extra_improper_per_atom;
 
   int firstgroup;               // store atoms in this group first, -1 if unset
   int nfirst;                   // # of atoms in first group on this proc
   char *firstgroupname;         // group-ID to store first, NULL if unset
 
   // per-atom arrays
   // customize by adding new array
 
   tagint *tag;
   int *type,*mask;
   imageint *image;
   double **x,**v,**f;
 
   tagint *molecule;
   int *molindex,*molatom;
 
   double *q,**mu;
   double **omega,**angmom,**torque;
   double *radius,*rmass;
   int *ellipsoid,*line,*tri,*body;
 
   // PERI package
 
   double *vfrac,*s0;
   double **x0;
 
   // USER-EFF and USER-AWPMD packages
 
   int *spin;
   double *eradius,*ervel,*erforce,*ervelforce;
   double *cs,*csforce,*vforce;
   int *etag;
 
   // USER-SPH package
 
   double *rho,*drho,*e,*de,*cv;
   double **vest;
 
   int **nspecial;               // 0,1,2 = cumulative # of 1-2,1-3,1-4 neighs
   tagint **special;             // IDs of 1-2,1-3,1-4 neighs of each atom
   int maxspecial;               // special[nlocal][maxspecial]
 
   int *num_bond;
   int **bond_type;
   tagint **bond_atom;
 
   int *num_angle;
   int **angle_type;
   tagint **angle_atom1,**angle_atom2,**angle_atom3;
 
   int *num_dihedral;
   int **dihedral_type;
   tagint **dihedral_atom1,**dihedral_atom2,**dihedral_atom3,**dihedral_atom4;
 
   int *num_improper;
   int **improper_type;
   tagint **improper_atom1,**improper_atom2,**improper_atom3,**improper_atom4;
 
   // custom arrays used by fix property/atom
 
   int **ivector;
   double **dvector;
   char **iname,**dname;
   int nivector,ndvector;
 
   // used by USER-CUDA to flag used per-atom arrays
 
   unsigned int datamask;
   unsigned int datamask_ext;
 
   // atom style and per-atom array existence flags
   // customize by adding new flag
 
   int sphere_flag,ellipsoid_flag,line_flag,tri_flag,body_flag;
   int peri_flag,electron_flag;
   int ecp_flag;
   int wavepacket_flag,sph_flag;
 
   int molecule_flag,molindex_flag,molatom_flag;
   int q_flag,mu_flag;
   int rmass_flag,radius_flag,omega_flag,torque_flag,angmom_flag;
   int vfrac_flag,spin_flag,eradius_flag,ervel_flag,erforce_flag;
   int cs_flag,csforce_flag,vforce_flag,ervelforce_flag,etag_flag;
   int rho_flag,e_flag,cv_flag,vest_flag;
 
   // Peridynamics scale factor, used by dump cfg
 
   double pdscale;
 
   // molecule templates
   // each template can be a set of consecutive molecules
   // each with same ID (stored in molecules)
   // 1st molecule in template stores nset = # in set
 
   int nmolecule;
   class Molecule **molecules;
 
   // extra peratom info in restart file destined for fix & diag
 
   double **extra;
 
   // per-type arrays
 
   double *mass;
   int *mass_setflag;
 
   // callback ptrs for atom arrays managed by fix classes
 
   int nextra_grow,nextra_restart,nextra_border;  // # of callbacks of each type
   int *extra_grow,*extra_restart,*extra_border;  // index of fix to callback to
   int nextra_grow_max,nextra_restart_max;        // size of callback lists
   int nextra_border_max;
   int nextra_store;
 
   int map_style;                  // style of atom map: 0=none, 1=array, 2=hash
   int map_user;                   // user selected style = same 0,1,2
   tagint map_tag_max;             // max atom ID that map() is setup for
 
   // spatial sorting of atoms
 
   int sortfreq;             // sort atoms every this many steps, 0 = off
   bigint nextsort;          // next timestep to sort on
 
   // indices of atoms with same ID
 
   int *sametag;      // sametag[I] = next atom with same ID, -1 if no more
 
   // functions
 
   Atom(class LAMMPS *);
   ~Atom();
 
   void settings(class Atom *);
-  void create_avec(const char *, int, char **, char *suffix = NULL);
-  class AtomVec *new_avec(const char *, char *, int &);
+  void create_avec(const char *, int, char **, int);
+  class AtomVec *new_avec(const char *, int, int &);
   void init();
   void setup();
 
   class AtomVec *style_match(const char *);
   void modify_params(int, char **);
   void tag_check();
   void tag_extend();
   int tag_consecutive();
 
   int parse_data(const char *);
   int count_words(const char *);
 
   void deallocate_topology();
 
   void data_atoms(int, char *);
   void data_vels(int, char *);
 
   void data_bonds(int, char *, int *);
   void data_angles(int, char *, int *);
   void data_dihedrals(int, char *, int *);
   void data_impropers(int, char *, int *);
 
   void data_bonus(int, char *, class AtomVec *);
   void data_bodies(int, char *, class AtomVecBody *);
 
   virtual void allocate_type_arrays();
   void set_mass(const char *);
   void set_mass(int, double);
   void set_mass(int, char **);
   void set_mass(double *);
   void check_mass();
 
   int radius_consistency(int, double &);
   int shape_consistency(int, double &, double &, double &);
 
   void add_molecule(int, char **);
   int find_molecule(char *);
   void add_molecule_atom(class Molecule *, int, int, tagint);
 
   void first_reorder();
   virtual void sort();
 
   void add_callback(int);
   void delete_callback(const char *, int);
   void update_callback(int);
 
   int find_custom(char *, int &);
   int add_custom(char *, int);
   void remove_custom(int, int);
 
   void *extract(char *);
 
   inline int* get_map_array() {return map_array;};
   inline int get_map_size() {return map_tag_max+1;};
 
   bigint memory_usage();
   int memcheck(const char *);
 
   // functions for global to local ID mapping
   // map lookup function inlined for efficiency
   // return -1 if no map defined
   // array style indexes map_array directly with global, no range check here
 
   inline int map(tagint global) {
     if (map_style == 1) return map_array[global];
     else if (map_style == 2) return map_find_hash(global);
     else return -1;
   };
 
   void map_init(int check = 1);
   void map_clear();
   void map_set();
   void map_one(tagint, int);
   int map_style_set();
   void map_delete();
   int map_find_hash(tagint);
 
  protected:
 
   // global to local ID mapping
 
   int *map_array;       // direct map via array that holds map_tag_max
   int map_maxarray;     // allocated size of map_array (1 larger than this)
 
   struct HashElem {     // hashed map
     tagint global;      // key to search on = global ID
     int local;          // value associated with key = local index
     int next;           // next entry in this bucket, -1 if last
   };
   int map_nhash;        // # of entries hash table can hold
   int map_nused;        // # of actual entries in hash table
   int map_free;         // ptr to 1st unused entry in hash table
   int map_nbucket;      // # of hash buckets
   int *map_bucket;      // ptr to 1st entry in each bucket
   HashElem *map_hash;   // hash table
 
   int max_same;         // allocated size of sametag
 
   // spatial sorting of atoms
 
   int nbins;                      // # of sorting bins
   int nbinx,nbiny,nbinz;          // bins in each dimension
   int maxbin;                     // max # of bins
   int maxnext;                    // max size of next,permute
   int *binhead;                   // 1st atom in each bin
   int *next;                      // next atom in bin
   int *permute;                   // permutation vector
   double userbinsize;             // requested sort bin size
   double bininvx,bininvy,bininvz; // inverse actual bin sizes
   double bboxlo[3],bboxhi[3];     // bounding box of my sub-domain
 
   int memlength;                  // allocated size of memstr
   char *memstr;                   // string of array names already counted
 
   void setup_sort_bins();
   int next_prime(int);
 };
 
 }
 
 #endif
 
 /* ERROR/WARNING messages:
 
 E: Atom IDs must be used for molecular systems
 
 Atom IDs are used to identify and find partner atoms in bonds.
 
 E: Invalid atom style
 
 The choice of atom style is unknown.
 
 E: Could not find atom_modify first group ID
 
 Self-explanatory.
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Atom_modify id command after simulation box is defined
 
 The atom_modify id command cannot be used after a read_data,
 read_restart, or create_box command.
 
 E: Atom_modify map command after simulation box is defined
 
 The atom_modify map command cannot be used after a read_data,
 read_restart, or create_box command.
 
 E: Atom_modify sort and first options cannot be used together
 
 Self-explanatory.
 
 E: Atom ID is negative
 
 Self-explanatory.
 
 E: Atom ID is too big
 
 The limit on atom IDs is set by the SMALLBIG, BIGBIG, SMALLSMALL
 setting in your Makefile.  See Section_start 2.2 of the manual for
 more details.
 
 E: Atom ID is zero
 
 Either all atom IDs must be zero or none of them.
 
 E: Not all atom IDs are 0
 
 Either all atom IDs must be zero or none of them.
 
 E: New atom IDs exceed maximum allowed ID
 
 See the setting for tagint in the src/lmptype.h file.
 
 E: Incorrect atom format in data file
 
 Number of values per atom line in the data file is not consistent with
 the atom style.
 
 E: Incorrect velocity format in data file
 
 Each atom style defines a format for the Velocity section
 of the data file.  The read-in lines do not match.
 
 E: Invalid atom ID in Velocities section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Invalid atom ID in Bonds section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Invalid bond type in Bonds section of data file
 
 Bond type must be positive integer and within range of specified bond
 types.
 
 E: Invalid atom ID in Angles section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Invalid angle type in Angles section of data file
 
 Angle type must be positive integer and within range of specified angle
 types.
 
 E: Invalid atom ID in Dihedrals section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Invalid dihedral type in Dihedrals section of data file
 
 Dihedral type must be positive integer and within range of specified
 dihedral types.
 
 E: Invalid atom ID in Impropers section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Invalid improper type in Impropers section of data file
 
 Improper type must be positive integer and within range of specified
 improper types.
 
 E: Incorrect bonus data format in data file
 
 See the read_data doc page for a description of how various kinds of
 bonus data must be formatted for certain atom styles.
 
 E: Invalid atom ID in Bonus section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Invalid atom ID in Bodies section of data file
 
 Atom IDs must be positive integers and within range of defined
 atoms.
 
 E: Cannot set mass for this atom style
 
 This atom style does not support mass settings for each atom type.
 Instead they are defined on a per-atom basis in the data file.
 
 E: Invalid mass line in data file
 
 Self-explanatory.
 
 E: Invalid type for mass set
 
 Mass command must set a type from 1-N where N is the number of atom
 types.
 
 E: Invalid mass value
 
 Self-explanatory.
 
 E: All masses are not set
 
 For atom styles that define masses for each atom type, all masses must
 be set in the data file or by the mass command before running a
 simulation.  They must also be set before using the velocity
 command.
 
 E: Reuse of molecule template ID
 
 The template IDs must be unique.
 
 E: Atom sort did not operate correctly
 
 This is an internal LAMMPS error.  Please report it to the
 developers.
 
 E: Atom sorting has bin size = 0.0
 
 The neighbor cutoff is being used as the bin size, but it is zero.
 Thus you must explicitly list a bin size in the atom_modify sort
 command or turn off sorting.
 
 E: Too many atom sorting bins
 
 This is likely due to an immense simulation box that has blown up
 to a large size.
 
 */
diff --git a/src/bond_hybrid.cpp b/src/bond_hybrid.cpp
index 454500861..63357a12e 100644
--- a/src/bond_hybrid.cpp
+++ b/src/bond_hybrid.cpp
@@ -1,359 +1,360 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "string.h"
 #include "ctype.h"
 #include "bond_hybrid.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define EXTRA 1000
 
 /* ---------------------------------------------------------------------- */
 
 BondHybrid::BondHybrid(LAMMPS *lmp) : Bond(lmp)
 {
   writedata = 0;
   nstyles = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 BondHybrid::~BondHybrid()
 {
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] nbondlist;
     delete [] maxbond;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(bondlist[i]);
     delete [] bondlist;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void BondHybrid::compute(int eflag, int vflag)
 {
   int i,j,m,n;
 
   // save ptrs to original bondlist
 
   int nbondlist_orig = neighbor->nbondlist;
   int **bondlist_orig = neighbor->bondlist;
 
   // if this is re-neighbor step, create sub-style bondlists
   // nbondlist[] = length of each sub-style list
   // realloc sub-style bondlist if necessary
   // load sub-style bondlist with 3 values from original bondlist
 
   if (neighbor->ago == 0) {
     for (m = 0; m < nstyles; m++) nbondlist[m] = 0;
     for (i = 0; i < nbondlist_orig; i++) {
       m = map[bondlist_orig[i][2]];
       if (m >= 0) nbondlist[m]++;
     }
     for (m = 0; m < nstyles; m++) {
       if (nbondlist[m] > maxbond[m]) {
         memory->destroy(bondlist[m]);
         maxbond[m] = nbondlist[m] + EXTRA;
         memory->create(bondlist[m],maxbond[m],3,"bond_hybrid:bondlist");
       }
       nbondlist[m] = 0;
     }
     for (i = 0; i < nbondlist_orig; i++) {
       m = map[bondlist_orig[i][2]];
       if (m < 0) continue;
       n = nbondlist[m];
       bondlist[m][n][0] = bondlist_orig[i][0];
       bondlist[m][n][1] = bondlist_orig[i][1];
       bondlist[m][n][2] = bondlist_orig[i][2];
       nbondlist[m]++;
     }
   }
 
   // call each sub-style's compute function
   // set neighbor->bondlist to sub-style bondlist before call
   // accumulate sub-style global/peratom energy/virial in hybrid
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   for (m = 0; m < nstyles; m++) {
     neighbor->nbondlist = nbondlist[m];
     neighbor->bondlist = bondlist[m];
 
     styles[m]->compute(eflag,vflag);
 
     if (eflag_global) energy += styles[m]->energy;
     if (vflag_global)
       for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
     if (eflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double *eatom_substyle = styles[m]->eatom;
       for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
     }
     if (vflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double **vatom_substyle = styles[m]->vatom;
       for (i = 0; i < n; i++)
         for (j = 0; j < 6; j++)
           vatom[i][j] += vatom_substyle[i][j];
     }
   }
 
   // restore ptrs to original bondlist
 
   neighbor->nbondlist = nbondlist_orig;
   neighbor->bondlist = bondlist_orig;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void BondHybrid::allocate()
 {
   allocated = 1;
   int n = atom->nbondtypes;
 
   memory->create(map,n+1,"bond:map");
   memory->create(setflag,n+1,"bond:setflag");
   for (int i = 1; i <= n; i++) setflag[i] = 0;
 
   nbondlist = new int[nstyles];
   maxbond = new int[nstyles];
   bondlist = new int**[nstyles];
   for (int m = 0; m < nstyles; m++) maxbond[m] = 0;
   for (int m = 0; m < nstyles; m++) bondlist[m] = NULL;
 }
 
 /* ----------------------------------------------------------------------
    create one bond style for each arg in list
 ------------------------------------------------------------------------- */
 
 void BondHybrid::settings(int narg, char **arg)
 {
   int i,m,istyle;
 
   if (narg < 1) error->all(FLERR,"Illegal bond_style command");
 
   // delete old lists, since cannot just change settings
 
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] nbondlist;
     delete [] maxbond;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(bondlist[i]);
     delete [] bondlist;
   }
   allocated = 0;
 
   // count sub-styles by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric word
   // need a better way to skip these exceptions
 
   nstyles = 0;
   i = 0;
   while (i < narg) {
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     nstyles++;
   }
 
   // allocate list of sub-styles
 
   styles = new Bond*[nstyles];
   keywords = new char*[nstyles];
 
   // allocate each sub-style and call its settings() with subset of args
   // define subset of args for a sub-style by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;   // flag argument required by force->new_bond(); value not used here
   nstyles = 0;
   i = 0;
 
   while (i < narg) {
     for (m = 0; m < nstyles; m++)
       if (strcmp(arg[i],keywords[m]) == 0)
         error->all(FLERR,"Bond style hybrid cannot use same bond style twice");
     if (strcmp(arg[i],"hybrid") == 0)
       error->all(FLERR,"Bond style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Bond style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_bond(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+    // keep the user-typed name in keywords[] (last arg 0): the duplicate check
+    // above and coeff() strcmp() it against user input; TODO confirm store_style()
+    styles[nstyles] = force->new_bond(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],0);
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     styles[nstyles]->settings(i-istyle-1,&arg[istyle+1]);
     nstyles++;
   }
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one type
 ---------------------------------------------------------------------- */
 
 void BondHybrid::coeff(int narg, char **arg)
 {
   if (!allocated) allocate();
 
   int ilo,ihi;
   force->bounds(arg[0],atom->nbondtypes,ilo,ihi);
 
   // 2nd arg = bond sub-style name
   // allow for "none" as valid sub-style name
 
   int m;
   for (m = 0; m < nstyles; m++)
     if (strcmp(arg[1],keywords[m]) == 0) break;
 
   int none = 0;
   if (m == nstyles) {
     if (strcmp(arg[1],"none") == 0) none = 1;
     else error->all(FLERR,"Bond coeff for hybrid has invalid style");
   }
 
   // move 1st arg to 2nd arg
   // just copy ptrs, since arg[] points into original input line
 
   arg[1] = arg[0];
 
   // invoke sub-style coeff() starting with 1st arg
 
   if (!none) styles[m]->coeff(narg-1,&arg[1]);
 
   // set setflag and which type maps to which sub-style
   // if sub-style is none: set hybrid setflag, wipe out map
 
   for (int i = ilo; i <= ihi; i++) {
     setflag[i] = 1;
     if (none) map[i] = -1;
     else map[i] = m;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void BondHybrid::init_style()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) styles[m]->init_style();
 }
 
 /* ----------------------------------------------------------------------
-   return an equilbrium bond length
+   return an equilibrium bond length
 ------------------------------------------------------------------------- */
 
 double BondHybrid::equilibrium_distance(int i)
 {
   if (map[i] < 0)
     error->one(FLERR,"Invoked bond equil distance on bond style none");
   return styles[map[i]]->equilibrium_distance(i);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void BondHybrid::write_restart(FILE *fp)
 {
   fwrite(&nstyles,sizeof(int),1,fp);
 
   int n;
   for (int m = 0; m < nstyles; m++) {
     n = strlen(keywords[m]) + 1;
     fwrite(&n,sizeof(int),1,fp);
     fwrite(keywords[m],sizeof(char),n,fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void BondHybrid::read_restart(FILE *fp)
 {
   int me = comm->me;
   if (me == 0) fread(&nstyles,sizeof(int),1,fp);
   MPI_Bcast(&nstyles,1,MPI_INT,0,world);
   styles = new Bond*[nstyles];
   keywords = new char*[nstyles];
 
   allocate();
 
   int n,dummy;
   for (int m = 0; m < nstyles; m++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_bond(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_bond(keywords[m],0,dummy);  // NOTE(review): 0 here, 1 in settings(); presumably the restart keyword is used as written - confirm
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double BondHybrid::single(int type, double rsq, int i, int j,
                           double &fforce)
 
 {
   if (map[type] < 0) error->one(FLERR,"Invoked bond single on bond style none");
   return styles[map[type]]->single(type,rsq,i,j,fforce);
 }
 
 /* ----------------------------------------------------------------------
    memory usage
 ------------------------------------------------------------------------- */
 
 double BondHybrid::memory_usage()
 {
   double bytes = maxeatom * sizeof(double);
   bytes += maxvatom*6 * sizeof(double);
   for (int m = 0; m < nstyles; m++) bytes += maxbond[m]*3 * sizeof(int);
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) bytes += styles[m]->memory_usage();
   return bytes;
 }
diff --git a/src/delete_bonds.cpp b/src/delete_bonds.cpp
index 3b2e9a528..b380508ee 100644
--- a/src/delete_bonds.cpp
+++ b/src/delete_bonds.cpp
@@ -1,592 +1,592 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "stdlib.h"
 #include "string.h"
 #include "delete_bonds.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "domain.h"
 #include "neighbor.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "special.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 enum{MULTI,ATOM,BOND,ANGLE,DIHEDRAL,IMPROPER,STATS};
 
 /* ---------------------------------------------------------------------- */
 
 DeleteBonds::DeleteBonds(LAMMPS *lmp) : Pointers(lmp) {}
 
 /* ---------------------------------------------------------------------- */
 
 void DeleteBonds::command(int narg, char **arg)
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Delete_bonds command before simulation box is defined");
   if (atom->natoms == 0)
     error->all(FLERR,"Delete_bonds command with no atoms existing");
   if (atom->molecular != 1)
     error->all(FLERR,"Cannot use delete_bonds with non-molecular system");
 
   if (narg < 2) error->all(FLERR,"Illegal delete_bonds command");
 
   // init entire system since comm->borders is done
   // comm::init needs neighbor::init needs pair::init needs kspace::init, etc
 
   if (comm->me == 0 && screen)
     fprintf(screen,"System init for delete_bonds ...\n");
   lmp->init();
 
   if (comm->me == 0 && screen) fprintf(screen,"Deleting bonds ...\n");
 
   // identify group
 
   int igroup = group->find(arg[0]);
   if (igroup == -1) error->all(FLERR,"Cannot find delete_bonds group ID");
   int groupbit = group->bitmask[igroup];
 
   // set style and which = type value
 
   int style = -1;
   if (strcmp(arg[1],"multi") == 0) style = MULTI;
   else if (strcmp(arg[1],"atom") == 0) style = ATOM;
   else if (strcmp(arg[1],"bond") == 0) style = BOND;
   else if (strcmp(arg[1],"angle") == 0) style = ANGLE;
   else if (strcmp(arg[1],"dihedral") == 0) style = DIHEDRAL;
   else if (strcmp(arg[1],"improper") == 0) style = IMPROPER;
   else if (strcmp(arg[1],"stats") == 0) style = STATS;
   else error->all(FLERR,"Illegal delete_bonds command");
 
   // setup list of types (atom,bond,etc) to consider
   // use force->bounds() to allow setting of range of types
   // range can be 0 to ntypes inclusive
 
   int *tlist = NULL;
 
   int iarg = 2;
   int which;
   if (style != MULTI && style != STATS) {
     if (narg < 3) error->all(FLERR,"Illegal delete_bonds command");
 
     int n = -1;
     if (style == ATOM) n = atom->ntypes;
     if (style == BOND) n = atom->nbondtypes;
     if (style == ANGLE) n = atom->nangletypes;
     if (style == DIHEDRAL) n = atom->ndihedraltypes;
     if (style == IMPROPER) n = atom->nimpropertypes;
 
     tlist = new int[n+1];
     for (int i = 0; i <= n; i++) tlist[i] = 0;
     int nlo,nhi;
     force->bounds(arg[2],n,nlo,nhi,0);
     for (int i = nlo; i <= nhi; i++) tlist[i] = 1;
 
     iarg++;
   }
 
   // grab optional keywords
 
   int any_flag = 0;
   int undo_flag = 0;
   int remove_flag = 0;
   int special_flag = 0;
   int induce_flag = 0;
 
   while (iarg < narg) {
     if (strcmp(arg[iarg],"any") == 0) any_flag = 1;
     else if (strcmp(arg[iarg],"undo") == 0) undo_flag = 1;
     else if (strcmp(arg[iarg],"remove") == 0) remove_flag = 1;
     else if (strcmp(arg[iarg],"special") == 0) special_flag = 1;
     else if (strcmp(arg[iarg],"induce") == 0) induce_flag = 1;
     else error->all(FLERR,"Illegal delete_bonds command");
     iarg++;
   }
 
   // border swap to insure type and mask is current for off-proc atoms
   // enforce PBC before in case atoms are outside box
 
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
   domain->pbc();
   domain->reset_box();
   comm->setup();
   comm->exchange();
   comm->borders();
   if (domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
 
   // set topology interactions either off or on
   // criteria for an interaction to potentially be changed (set flag = 1)
   //   all atoms or any atom in interaction must be in group, based on any_flag
   //   for style = MULTI, all bond/angle/dihedral/improper, no other criteria
   //   for style = ATOM, same as MULTI, plus at least one atom is specified type
   //   for style = BOND/ANGLE/DIHEDRAL/IMPROPER, interaction is specified type
   //   for style = STATS only compute stats, flag is always 0
   // if flag = 1
   //   set interaction type negative if undo_flag = 0
   //   set interaction type positive if undo_flag = 1
 
   int *mask = atom->mask;
   int *type = atom->type;
   int nlocal = atom->nlocal;
 
   int i,m,n,consider,flag,itype;
   int atom1,atom2,atom3,atom4;
 
   if (atom->avec->bonds_allow && 
       (style == BOND || style == MULTI || style == ATOM)) {
     int *num_bond = atom->num_bond;
     int **bond_type = atom->bond_type;
 
     for (i = 0; i < nlocal; i++) {
       for (m = 0; m < num_bond[i]; m++) {
         atom1 = atom->map(atom->bond_atom[i][m]);
         if (atom1 == -1) error->one(FLERR,"Bond atom missing in delete_bonds");
         consider = 0;
         if (!any_flag && mask[i] & groupbit && mask[atom1] & groupbit)
           consider = 1;
         if (any_flag && (mask[i] & groupbit || mask[atom1] & groupbit))
           consider = 1;
         if (consider) {
           flag = 0;
           if (style == MULTI) flag = 1;
           else if (style == ATOM) {
             if (tlist[type[i]] || tlist[type[atom1]]) flag = 1;
           } else if (style == BOND) {
-            itype = static_cast<int> (fabs(bond_type[i][m]));
+            itype = abs(bond_type[i][m]);  // type < 0 if turned off
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
             if (undo_flag == 0 && bond_type[i][m] > 0)
               bond_type[i][m] = -bond_type[i][m];
             if (undo_flag == 1 && bond_type[i][m] < 0)
               bond_type[i][m] = -bond_type[i][m];
           }
         }
       }
     }
   }
 
   if (atom->avec->angles_allow &&
       (style == ANGLE || style == MULTI || style == ATOM)) {
     int *num_angle = atom->num_angle;
     int **angle_type = atom->angle_type;
 
     for (i = 0; i < nlocal; i++) {
       for (m = 0; m < num_angle[i]; m++) {
         atom1 = atom->map(atom->angle_atom1[i][m]);
         atom2 = atom->map(atom->angle_atom2[i][m]);
         atom3 = atom->map(atom->angle_atom3[i][m]);
         if (atom1 == -1 || atom2 == -1 || atom3 == -1)
           error->one(FLERR,"Angle atom missing in delete_bonds");
         consider = 0;
         if (!any_flag && mask[atom1] & groupbit && mask[atom2] & groupbit &&
             mask[atom3] & groupbit) consider = 1;
         if (any_flag && (mask[atom1] & groupbit || mask[atom2] & groupbit ||
                           mask[atom3] & groupbit)) consider = 1;
         if (consider) {
           flag = 0;
           if (style == MULTI) flag = 1;
           else if (style == ATOM) {
             if (tlist[type[atom1]] || tlist[type[atom2]] ||
                 tlist[type[atom3]]) flag = 1;
           } else if (style == ANGLE) {
-            itype = static_cast<int> (fabs(angle_type[i][m]));
+            itype = abs(angle_type[i][m]);  // type < 0 if turned off
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
             if (undo_flag == 0 && angle_type[i][m] > 0)
               angle_type[i][m] = -angle_type[i][m];
             if (undo_flag == 1 && angle_type[i][m] < 0)
               angle_type[i][m] = -angle_type[i][m];
           }
         }
       }
     }
   }
 
   if (atom->avec->dihedrals_allow &&
       (style == DIHEDRAL || style == MULTI || style == ATOM)) {
     int *num_dihedral = atom->num_dihedral;
     int **dihedral_type = atom->dihedral_type;
 
     for (i = 0; i < nlocal; i++) {
       for (m = 0; m < num_dihedral[i]; m++) {
         atom1 = atom->map(atom->dihedral_atom1[i][m]);
         atom2 = atom->map(atom->dihedral_atom2[i][m]);
         atom3 = atom->map(atom->dihedral_atom3[i][m]);
         atom4 = atom->map(atom->dihedral_atom4[i][m]);
         if (atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1)
           error->one(FLERR,"Dihedral atom missing in delete_bonds");
         consider = 0;
         if (!any_flag && mask[atom1] & groupbit && mask[atom2] & groupbit &&
             mask[atom3] & groupbit && mask[atom4] & groupbit) consider = 1;
         if (any_flag && (mask[atom1] & groupbit || mask[atom2] & groupbit ||
                          mask[atom3] & groupbit || mask[atom4] & groupbit))
           consider = 1;
         if (consider) {
           flag = 0;
           if (style == MULTI) flag = 1;
           else if (style == ATOM) {
               if (tlist[type[atom1]] || tlist[type[atom2]] ||
                   tlist[type[atom3]] || tlist[type[atom4]]) flag = 1;
           } else if (style == DIHEDRAL) {
-            itype = static_cast<int> (fabs(dihedral_type[i][m]));
+            itype = abs(dihedral_type[i][m]);  // type < 0 if turned off
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
             if (undo_flag == 0 && dihedral_type[i][m] > 0)
               dihedral_type[i][m] = -dihedral_type[i][m];
             if (undo_flag == 1 && dihedral_type[i][m] < 0)
               dihedral_type[i][m] = -dihedral_type[i][m];
           }
         }
       }
     }
   }
 
   if (atom->avec->impropers_allow &&
       (style == IMPROPER || style == MULTI || style == ATOM)) {
     int *num_improper = atom->num_improper;
     int **improper_type = atom->improper_type;
 
     for (i = 0; i < nlocal; i++) {
       for (m = 0; m < num_improper[i]; m++) {
         atom1 = atom->map(atom->improper_atom1[i][m]);
         atom2 = atom->map(atom->improper_atom2[i][m]);
         atom3 = atom->map(atom->improper_atom3[i][m]);
         atom4 = atom->map(atom->improper_atom4[i][m]);
         if (atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1)
           error->one(FLERR,"Improper atom missing in delete_bonds");
         consider = 0;
         if (!any_flag && mask[atom1] & groupbit && mask[atom2] & groupbit &&
             mask[atom3] & groupbit && mask[atom4] & groupbit) consider = 1;
         if (any_flag && (mask[atom1] & groupbit || mask[atom2] & groupbit ||
                          mask[atom3] & groupbit || mask[atom4] & groupbit))
           consider = 1;
         if (consider) {
           flag = 0;
           if (style == MULTI) flag = 1;
           else if (style == ATOM) {
               if (tlist[type[atom1]] || tlist[type[atom2]] ||
                   tlist[type[atom3]] || tlist[type[atom4]]) flag = 1;
           } else if (style == IMPROPER) {
-            itype = static_cast<int> (fabs(improper_type[i][m]));
+            itype = abs(improper_type[i][m]);  // type < 0 if turned off
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
             if (undo_flag == 0 && improper_type[i][m] > 0)
               improper_type[i][m] = -improper_type[i][m];
             if (undo_flag == 1 && improper_type[i][m] < 0)
               improper_type[i][m] = -improper_type[i][m];
           }
         }
       }
     }
   }
 
   delete [] tlist;
     
   // induce turn off of angles, dihedral, impropers due to turned off bonds
   // induce turn off of dihedrals due to turned off angles
   // all atoms or any atom in interaction must be in group, based on any_flag
 
   if (induce_flag) {
 
     // circulate list of turned off bonds around ring of procs
 
     // circulate list of turned off angles around ring of procs
 
   }
 
   // remove interactions if requested
   // all atoms or any atom in interaction must be in group, based on any_flag
 
   if (remove_flag) {
 
     if (atom->avec->bonds_allow) {
       for (i = 0; i < nlocal; i++) {
         m = 0;
         while (m < atom->num_bond[i]) {
           if (atom->bond_type[i][m] <= 0) {
             atom1 = atom->map(atom->bond_atom[i][m]);
             flag = 0;
             if (!any_flag && mask[i] & groupbit && mask[atom1] & groupbit)
               flag = 1;
             if (any_flag && (mask[i] & groupbit || mask[atom1] & groupbit))
               flag = 1;
             if (flag) {
               n = atom->num_bond[i];
               atom->bond_type[i][m] = atom->bond_type[i][n-1];
               atom->bond_atom[i][m] = atom->bond_atom[i][n-1];
               atom->num_bond[i]--;
             } else m++;
           } else m++;
         }
       }
     }
 
     if (atom->avec->angles_allow) {
       for (i = 0; i < nlocal; i++) {
         m = 0;
         while (m < atom->num_angle[i]) {
           if (atom->angle_type[i][m] <= 0) {
             atom1 = atom->map(atom->angle_atom1[i][m]);
             atom2 = atom->map(atom->angle_atom2[i][m]);
             atom3 = atom->map(atom->angle_atom3[i][m]);
             flag = 0;
             if (!any_flag && mask[atom1] & groupbit && mask[atom2] & groupbit &&
                 mask[atom3] & groupbit) flag = 1;
             if (any_flag && (mask[atom1] & groupbit || mask[atom2] & groupbit ||
                              mask[atom3] & groupbit)) flag = 1;
             if (flag) {
               n = atom->num_angle[i];
               atom->angle_type[i][m] = atom->angle_type[i][n-1];
               atom->angle_atom1[i][m] = atom->angle_atom1[i][n-1];
               atom->angle_atom2[i][m] = atom->angle_atom2[i][n-1];
               atom->angle_atom3[i][m] = atom->angle_atom3[i][n-1];
               atom->num_angle[i]--;
             } else m++;
           } else m++;
         }
       }
     }
 
     if (atom->avec->dihedrals_allow) {
       for (i = 0; i < nlocal; i++) {
         m = 0;
         while (m < atom->num_dihedral[i]) {
           if (atom->dihedral_type[i][m] <= 0) {
             atom1 = atom->map(atom->dihedral_atom1[i][m]);
             atom2 = atom->map(atom->dihedral_atom2[i][m]);
             atom3 = atom->map(atom->dihedral_atom3[i][m]);
             atom4 = atom->map(atom->dihedral_atom4[i][m]);
             flag = 0;
             if (!any_flag && mask[atom1] & groupbit && mask[atom2] & groupbit &&
                 mask[atom3] & groupbit && mask[atom4] & groupbit) flag = 1;
             if (any_flag && (mask[atom1] & groupbit || mask[atom2] & groupbit ||
                              mask[atom3] & groupbit || mask[atom4] & groupbit))
               flag = 1;
             if (flag) {
               n = atom->num_dihedral[i];
               atom->dihedral_type[i][m] = atom->dihedral_type[i][n-1];
               atom->dihedral_atom1[i][m] = atom->dihedral_atom1[i][n-1];
               atom->dihedral_atom2[i][m] = atom->dihedral_atom2[i][n-1];
               atom->dihedral_atom3[i][m] = atom->dihedral_atom3[i][n-1];
               atom->dihedral_atom4[i][m] = atom->dihedral_atom4[i][n-1];
               atom->num_dihedral[i]--;
             } else m++;
           } else m++;
         }
       }
     }
 
     if (atom->avec->impropers_allow) {
       for (i = 0; i < nlocal; i++) {
         m = 0;
         while (m < atom->num_improper[i]) {
           if (atom->improper_type[i][m] <= 0) {
             atom1 = atom->map(atom->improper_atom1[i][m]);
             atom2 = atom->map(atom->improper_atom2[i][m]);
             atom3 = atom->map(atom->improper_atom3[i][m]);
             atom4 = atom->map(atom->improper_atom4[i][m]);
             flag = 0;
             if (!any_flag && mask[atom1] & groupbit && mask[atom2] & groupbit &&
                 mask[atom3] & groupbit && mask[atom4] & groupbit) flag = 1;
             if (any_flag && (mask[atom1] & groupbit || mask[atom2] & groupbit ||
                              mask[atom3] & groupbit || mask[atom4] & groupbit))
               flag = 1;
             if (flag) {
               n = atom->num_improper[i];
               atom->improper_type[i][m] = atom->improper_type[i][n-1];
               atom->improper_atom1[i][m] = atom->improper_atom1[i][n-1];
               atom->improper_atom2[i][m] = atom->improper_atom2[i][n-1];
               atom->improper_atom3[i][m] = atom->improper_atom3[i][n-1];
               atom->improper_atom4[i][m] = atom->improper_atom4[i][n-1];
               atom->num_improper[i]--;
             } else m++;
           } else m++;
         }
       }
     }
 
   }
 
   // if interactions were removed, recompute global counts
 
   if (remove_flag) {
 
     if (atom->avec->bonds_allow) {
       bigint nbonds = 0;
       for (i = 0; i < nlocal; i++) nbonds += atom->num_bond[i];
       MPI_Allreduce(&nbonds,&atom->nbonds,1,MPI_LMP_BIGINT,
                     MPI_SUM,world);
       if (force->newton_bond == 0) atom->nbonds /= 2;
     }
 
     if (atom->avec->angles_allow) {
       bigint nangles = 0;
       for (i = 0; i < nlocal; i++) nangles += atom->num_angle[i];
       MPI_Allreduce(&nangles,&atom->nangles,1,MPI_LMP_BIGINT,
                     MPI_SUM,world);
       if (force->newton_bond == 0) atom->nangles /= 3;
     }
 
     if (atom->avec->dihedrals_allow) {
       bigint ndihedrals = 0;
       for (i = 0; i < nlocal; i++) ndihedrals += atom->num_dihedral[i];
       MPI_Allreduce(&ndihedrals,&atom->ndihedrals,
                     1,MPI_LMP_BIGINT,MPI_SUM,world);
       if (force->newton_bond == 0) atom->ndihedrals /= 4;
     }
 
     if (atom->avec->impropers_allow) {
       bigint nimpropers = 0;
       for (i = 0; i < nlocal; i++) nimpropers += atom->num_improper[i];
       MPI_Allreduce(&nimpropers,&atom->nimpropers,
                     1,MPI_LMP_BIGINT,MPI_SUM,world);
       if (force->newton_bond == 0) atom->nimpropers /= 4;
     }
 
   }
 
   // compute and print stats
 
   bigint tmp;
   bigint bond_on,bond_off;
   bigint angle_on,angle_off;
   bigint dihedral_on,dihedral_off;
   bigint improper_on,improper_off;
 
   if (atom->avec->bonds_allow) {
     bond_on = bond_off = 0;
     for (i = 0; i < nlocal; i++)
       for (m = 0; m < atom->num_bond[i]; m++)
         if (atom->bond_type[i][m] > 0) bond_on++;
         else bond_off++;
     MPI_Allreduce(&bond_on,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     bond_on = tmp;
     MPI_Allreduce(&bond_off,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     bond_off = tmp;
     if (force->newton_bond == 0) {
       bond_on /= 2;
       bond_off /= 2;
     }
   }
 
   if (atom->avec->angles_allow) {
     angle_on = angle_off = 0;
     for (i = 0; i < nlocal; i++)
       for (m = 0; m < atom->num_angle[i]; m++)
         if (atom->angle_type[i][m] > 0) angle_on++;
         else angle_off++;
     MPI_Allreduce(&angle_on,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     angle_on = tmp;
     MPI_Allreduce(&angle_off,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     angle_off = tmp;
     if (force->newton_bond == 0) {
       angle_on /= 3;
       angle_off /= 3;
     }
   }
 
   if (atom->avec->dihedrals_allow) {
     dihedral_on = dihedral_off = 0;
     for (i = 0; i < nlocal; i++)
       for (m = 0; m < atom->num_dihedral[i]; m++)
         if (atom->dihedral_type[i][m] > 0) dihedral_on++;
         else dihedral_off++;
     MPI_Allreduce(&dihedral_on,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     dihedral_on = tmp;
     MPI_Allreduce(&dihedral_off,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     dihedral_off = tmp;
     if (force->newton_bond == 0) {
       dihedral_on /= 4;
       dihedral_off /= 4;
     }
   }
 
   if (atom->avec->impropers_allow) {
     improper_on = improper_off = 0;
     for (i = 0; i < nlocal; i++)
       for (m = 0; m < atom->num_improper[i]; m++)
         if (atom->improper_type[i][m] > 0) improper_on++;
         else improper_off++;
     MPI_Allreduce(&improper_on,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     improper_on = tmp;
     MPI_Allreduce(&improper_off,&tmp,1,MPI_LMP_BIGINT,MPI_SUM,world);
     improper_off = tmp;
     if (force->newton_bond == 0) {
       improper_on /= 4;
       improper_off /= 4;
     }
   }
 
   if (comm->me == 0) {
     if (atom->avec->bonds_allow) {
       if (screen) fprintf(screen,
                           "  " BIGINT_FORMAT " total bonds, " BIGINT_FORMAT
                           " turned on, " BIGINT_FORMAT " turned off\n",
                           atom->nbonds,bond_on,bond_off);
       if (logfile) fprintf(logfile,
                            "  " BIGINT_FORMAT " total bonds, " BIGINT_FORMAT
                            " turned on, " BIGINT_FORMAT " turned off\n",
                            atom->nbonds,bond_on,bond_off);
     }
     if (atom->avec->angles_allow) {
       if (screen) fprintf(screen,
                           "  " BIGINT_FORMAT " total angles, " BIGINT_FORMAT
                           " turned on, " BIGINT_FORMAT " turned off\n",
                           atom->nangles,angle_on,angle_off);
       if (logfile) fprintf(logfile,
                           "  " BIGINT_FORMAT " total angles, " BIGINT_FORMAT
                            " turned on, " BIGINT_FORMAT " turned off\n",
                            atom->nangles,angle_on,angle_off);
     }
     if (atom->avec->dihedrals_allow) {
       if (screen) fprintf(screen,
                           "  " BIGINT_FORMAT " total dihedrals, "
                           BIGINT_FORMAT " turned on, " BIGINT_FORMAT
                           " turned off\n",
                           atom->ndihedrals,dihedral_on,dihedral_off);
       if (logfile) fprintf(logfile,
                           "  " BIGINT_FORMAT " total dihedrals, "
                           BIGINT_FORMAT " turned on, " BIGINT_FORMAT
                           " turned off\n",
                           atom->ndihedrals,dihedral_on,dihedral_off);
     }
     if (atom->avec->impropers_allow) {
       if (screen) fprintf(screen,
                           "  " BIGINT_FORMAT " total impropers, "
                           BIGINT_FORMAT " turned on, " BIGINT_FORMAT
                           " turned off\n",
                           atom->nimpropers,improper_on,improper_off);
       if (logfile) fprintf(logfile,
                           "  " BIGINT_FORMAT " total impropers, "
                           BIGINT_FORMAT " turned on, " BIGINT_FORMAT
                           " turned off\n",
                           atom->nimpropers,improper_on,improper_off);
     }
   }
 
   // re-compute special list if requested
 
   if (special_flag) {
     Special special(lmp);
     special.build();
   }
 }
diff --git a/src/dihedral_hybrid.cpp b/src/dihedral_hybrid.cpp
index 7b0dea64d..6f8ef5a09 100644
--- a/src/dihedral_hybrid.cpp
+++ b/src/dihedral_hybrid.cpp
@@ -1,350 +1,351 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "string.h"
 #include "ctype.h"
 #include "dihedral_hybrid.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define EXTRA 1000
 
 /* ---------------------------------------------------------------------- */
 
 DihedralHybrid::DihedralHybrid(LAMMPS *lmp) : Dihedral(lmp)
 {
   nstyles = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 DihedralHybrid::~DihedralHybrid()
 {
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] ndihedrallist;
     delete [] maxdihedral;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(dihedrallist[i]);
     delete [] dihedrallist;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralHybrid::compute(int eflag, int vflag)
 {
   int i,j,m,n;
 
   // save ptrs to original dihedrallist
 
   int ndihedrallist_orig = neighbor->ndihedrallist;
   int **dihedrallist_orig = neighbor->dihedrallist;
 
   // if this is re-neighbor step, create sub-style dihedrallists
   // ndihedrallist[] = length of each sub-style list
   // realloc sub-style dihedrallist if necessary
   // load sub-style dihedrallist with 5 values from original dihedrallist
 
   if (neighbor->ago == 0) {
     for (m = 0; m < nstyles; m++) ndihedrallist[m] = 0;
     for (i = 0; i < ndihedrallist_orig; i++) {
       m = map[dihedrallist_orig[i][4]];
       if (m >= 0) ndihedrallist[m]++;
     }
     for (m = 0; m < nstyles; m++) {
       if (ndihedrallist[m] > maxdihedral[m]) {
         memory->destroy(dihedrallist[m]);
         maxdihedral[m] = ndihedrallist[m] + EXTRA;
         memory->create(dihedrallist[m],maxdihedral[m],5,
                        "dihedral_hybrid:dihedrallist");
       }
       ndihedrallist[m] = 0;
     }
     for (i = 0; i < ndihedrallist_orig; i++) {
       m = map[dihedrallist_orig[i][4]];
       if (m < 0) continue;
       n = ndihedrallist[m];
       dihedrallist[m][n][0] = dihedrallist_orig[i][0];
       dihedrallist[m][n][1] = dihedrallist_orig[i][1];
       dihedrallist[m][n][2] = dihedrallist_orig[i][2];
       dihedrallist[m][n][3] = dihedrallist_orig[i][3];
       dihedrallist[m][n][4] = dihedrallist_orig[i][4];
       ndihedrallist[m]++;
     }
   }
 
   // call each sub-style's compute function
   // set neighbor->dihedrallist to sub-style dihedrallist before call
   // accumulate sub-style global/peratom energy/virial in hybrid
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   for (m = 0; m < nstyles; m++) {
     neighbor->ndihedrallist = ndihedrallist[m];
     neighbor->dihedrallist = dihedrallist[m];
 
     styles[m]->compute(eflag,vflag);
 
     if (eflag_global) energy += styles[m]->energy;
     if (vflag_global)
       for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
     if (eflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double *eatom_substyle = styles[m]->eatom;
       for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
     }
     if (vflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double **vatom_substyle = styles[m]->vatom;
       for (i = 0; i < n; i++)
         for (j = 0; j < 6; j++)
           vatom[i][j] += vatom_substyle[i][j];
     }
   }
 
   // restore ptrs to original dihedrallist
 
   neighbor->ndihedrallist = ndihedrallist_orig;
   neighbor->dihedrallist = dihedrallist_orig;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralHybrid::allocate()
 {
   allocated = 1;
   int n = atom->ndihedraltypes;
 
   memory->create(map,n+1,"dihedral:map");
   memory->create(setflag,n+1,"dihedral:setflag");
   for (int i = 1; i <= n; i++) setflag[i] = 0;
 
   ndihedrallist = new int[nstyles];
   maxdihedral = new int[nstyles];
   dihedrallist = new int**[nstyles];
   for (int m = 0; m < nstyles; m++) maxdihedral[m] = 0;
   for (int m = 0; m < nstyles; m++) dihedrallist[m] = NULL;
 }
 
 /* ----------------------------------------------------------------------
    create one dihedral style for each arg in list
 ------------------------------------------------------------------------- */
 
 void DihedralHybrid::settings(int narg, char **arg)
 {
   int i,m,istyle;
 
   if (narg < 1) error->all(FLERR,"Illegal dihedral_style command");
 
   // delete old lists, since cannot just change settings
 
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] ndihedrallist;
     delete [] maxdihedral;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(dihedrallist[i]);
     delete [] dihedrallist;
   }
   allocated = 0;
 
   // count sub-styles by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric word
   // need a better way to skip these exceptions
 
   nstyles = 0;
   i = 0;
   while (i < narg) {
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     nstyles++;
   }
 
   // allocate list of sub-styles
 
   styles = new Dihedral*[nstyles];
   keywords = new char*[nstyles];
 
   // allocate each sub-style and call its settings() with subset of args
   // define subset of args for a sub-style by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;
   nstyles = 0;
   i = 0;
 
   while (i < narg) {
     for (m = 0; m < nstyles; m++)
       if (strcmp(arg[i],keywords[m]) == 0)
         error->all(FLERR,"Dihedral style hybrid cannot use "
                    "same dihedral style twice");
     if (strcmp(arg[i],"hybrid") == 0)
       error->all(FLERR,
                  "Dihedral style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Dihedral style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_dihedral(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+    // store keyword without suffix: coeff() and dup check match on user args
+    styles[nstyles] = force->new_dihedral(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],0);
+
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     styles[nstyles]->settings(i-istyle-1,&arg[istyle+1]);
     nstyles++;
   }
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one type
 ---------------------------------------------------------------------- */
 
 void DihedralHybrid::coeff(int narg, char **arg)
 {
   if (!allocated) allocate();
 
   int ilo,ihi;
   force->bounds(arg[0],atom->ndihedraltypes,ilo,ihi);
 
   // 2nd arg = dihedral sub-style name
   // allow for "none" or "skip" as valid sub-style name
 
   int m;
   for (m = 0; m < nstyles; m++)
     if (strcmp(arg[1],keywords[m]) == 0) break;
 
   int none = 0;
   int skip = 0;
   if (m == nstyles) {
     if (strcmp(arg[1],"none") == 0) none = 1;
     else if (strcmp(arg[1],"skip") == 0) none = skip = 1;
     else error->all(FLERR,"Dihedral coeff for hybrid has invalid style");
   }
 
   // move 1st arg to 2nd arg
   // just copy ptrs, since arg[] points into original input line
 
   arg[1] = arg[0];
 
   // invoke sub-style coeff() starting with 1st arg
 
   if (!none) styles[m]->coeff(narg-1,&arg[1]);
 
   // set setflag and which type maps to which sub-style
   // if sub-style is skip: auxiliary class2 setting in data file so ignore
   // if sub-style is none and not skip: set hybrid setflag, wipe out map
 
   for (int i = ilo; i <= ihi; i++) {
     if (skip) continue;
     else if (none) {
       setflag[i] = 1;
       map[i] = -1;
     } else {
       setflag[i] = styles[m]->setflag[i];
       map[i] = m;
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralHybrid::init_style()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) styles[m]->init_style();
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void DihedralHybrid::write_restart(FILE *fp)
 {
   fwrite(&nstyles,sizeof(int),1,fp);
 
   int n;
   for (int m = 0; m < nstyles; m++) {
     n = strlen(keywords[m]) + 1;
     fwrite(&n,sizeof(int),1,fp);
     fwrite(keywords[m],sizeof(char),n,fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void DihedralHybrid::read_restart(FILE *fp)
 {
   int me = comm->me;
   if (me == 0) fread(&nstyles,sizeof(int),1,fp);
   MPI_Bcast(&nstyles,1,MPI_INT,0,world);
   styles = new Dihedral*[nstyles];
   keywords = new char*[nstyles];
 
   allocate();
 
   int n,dummy;
   for (int m = 0; m < nstyles; m++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_dihedral(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_dihedral(keywords[m],0,dummy);
   }
 }
 
 /* ----------------------------------------------------------------------
    memory usage
 ------------------------------------------------------------------------- */
 
 double DihedralHybrid::memory_usage()
 {
   double bytes = maxeatom * sizeof(double);
   bytes += maxvatom*6 * sizeof(double);
   for (int m = 0; m < nstyles; m++) bytes += maxdihedral[m]*5 * sizeof(int);
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) bytes += styles[m]->memory_usage();
   return bytes;
 }
diff --git a/src/force.cpp b/src/force.cpp
index c316c04a5..832133548 100644
--- a/src/force.cpp
+++ b/src/force.cpp
@@ -1,933 +1,975 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "stdlib.h"
 #include "string.h"
 #include "ctype.h"
 #include "force.h"
 #include "style_bond.h"
 #include "style_angle.h"
 #include "style_dihedral.h"
 #include "style_improper.h"
 #include "style_pair.h"
 #include "style_kspace.h"
 #include "atom.h"
 #include "comm.h"
 #include "pair.h"
 #include "pair_hybrid.h"
 #include "pair_hybrid_overlay.h"
 #include "bond.h"
 #include "bond_hybrid.h"
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
 #include "kspace.h"
 #include "group.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 Force::Force(LAMMPS *lmp) : Pointers(lmp)
 {
   newton = newton_pair = newton_bond = 1;
 
   special_lj[0] = special_coul[0] = 1.0;
   special_lj[1] = special_lj[2] = special_lj[3] = 0.0;
   special_coul[1] = special_coul[2] = special_coul[3] = 0.0;
   special_angle = special_dihedral = 0;
   special_extra = 0;
 
   dielectric = 1.0;
 
   pair = NULL;
   bond = NULL;
   angle = NULL;
   dihedral = NULL;
   improper = NULL;
   kspace = NULL;
 
   char *str = (char *) "none";
   int n = strlen(str) + 1;
   pair_style = new char[n];
   strcpy(pair_style,str);
   bond_style = new char[n];
   strcpy(bond_style,str);
   angle_style = new char[n];
   strcpy(angle_style,str);
   dihedral_style = new char[n];
   strcpy(dihedral_style,str);
   improper_style = new char[n];
   strcpy(improper_style,str);
   kspace_style = new char[n];
   strcpy(kspace_style,str);
 
   // fill pair map with pair styles listed in style_pair.h
 
   pair_map = new std::map<std::string,PairCreator>();
 
 #define PAIR_CLASS
 #define PairStyle(key,Class) \
   (*pair_map)[#key] = &pair_creator<Class>;
 #include "style_pair.h"
 #undef PairStyle
 #undef PAIR_CLASS
 }
 
 /* ---------------------------------------------------------------------- */
 
 Force::~Force()
 {
   delete [] pair_style;
   delete [] bond_style;
   delete [] angle_style;
   delete [] dihedral_style;
   delete [] improper_style;
   delete [] kspace_style;
 
   if (pair) delete pair;
   if (bond) delete bond;
   if (angle) delete angle;
   if (dihedral) delete dihedral;
   if (improper) delete improper;
   if (kspace) delete kspace;
 
   delete pair_map;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Force::init()
 {
   qqrd2e = qqr2e/dielectric;
 
   if (kspace) kspace->init();         // kspace must come before pair
   if (pair) pair->init();             // so g_ewald is defined
   if (bond) bond->init();
   if (angle) angle->init();
   if (dihedral) dihedral->init();
   if (improper) improper->init();
 }
 
 /* ----------------------------------------------------------------------
    create a pair style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_pair(const char *style, const char *suffix)
+void Force::create_pair(const char *style, int trysuffix)
 {
   delete [] pair_style;
   if (pair) delete pair;
 
   int sflag;
-  pair = new_pair(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    pair_style = new char[n];
-    strcpy(pair_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    pair_style = new char[n];
-    strcpy(pair_style,style);
-  }
+  pair = new_pair(style,trysuffix,sflag);
+  store_style(pair_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate a pair class
-   try first with suffix appended
+   if trysuffix = 1, try first with suffix1/2 appended
+   return sflag = 0 for no suffix added, 1 or 2 for suffix1/2 added
 ------------------------------------------------------------------------- */
 
-Pair *Force::new_pair(const char *style, const char *suffix, int &sflag)
+Pair *Force::new_pair(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-
-    if (pair_map->find(estyle) != pair_map->end()) {
-      PairCreator pair_creator = (*pair_map)[estyle];
-      return pair_creator(lmp);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+      if (pair_map->find(estyle) != pair_map->end()) {
+        PairCreator pair_creator = (*pair_map)[estyle];
+        return pair_creator(lmp);
+      }
+    }
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      if (pair_map->find(estyle) != pair_map->end()) {
+        PairCreator pair_creator = (*pair_map)[estyle];
+        return pair_creator(lmp);
+      }
     }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
   if (pair_map->find(style) != pair_map->end()) {
     PairCreator pair_creator = (*pair_map)[style];
     return pair_creator(lmp);
   }
 
   error->all(FLERR,"Invalid pair style");
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    one instance per pair style in style_pair.h
 ------------------------------------------------------------------------- */
 
 template <typename T>
 Pair *Force::pair_creator(LAMMPS *lmp)
 {
   return new T(lmp);
 }
 
 /* ----------------------------------------------------------------------
    return ptr to Pair class if matches word or matches hybrid sub-style
    if exact, then style name must be exact match to word
    if not exact, style name must contain word
    return NULL if no match or multiple sub-styles match
 ------------------------------------------------------------------------- */
 
 Pair *Force::pair_match(const char *word, int exact)
 {
   int iwhich,count;
 
   if (exact && strcmp(pair_style,word) == 0) return pair;
   else if (!exact && strstr(pair_style,word)) return pair;
 
   else if (strstr(pair_style,"hybrid/overlay")) {
     PairHybridOverlay *hybrid = (PairHybridOverlay *) pair;
     count = 0;
     for (int i = 0; i < hybrid->nstyles; i++)
       if ((exact && strcmp(hybrid->keywords[i],word) == 0) ||
           (!exact && strstr(hybrid->keywords[i],word))) {
         iwhich = i;
         count++;
       }
     if (count == 1) return hybrid->styles[iwhich];
 
   } else if (strstr(pair_style,"hybrid")) {
     PairHybrid *hybrid = (PairHybrid *) pair;
     count = 0;
     for (int i = 0; i < hybrid->nstyles; i++)
       if ((exact && strcmp(hybrid->keywords[i],word) == 0) ||
           (!exact && strstr(hybrid->keywords[i],word))) {
         iwhich = i;
         count++;
       }
     if (count == 1) return hybrid->styles[iwhich];
   }
 
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    create a bond style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_bond(const char *style, const char *suffix)
+void Force::create_bond(const char *style, int trysuffix)
 {
   delete [] bond_style;
   if (bond) delete bond;
 
   int sflag;
-  bond = new_bond(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    bond_style = new char[n];
-    strcpy(bond_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    bond_style = new char[n];
-    strcpy(bond_style,style);
-  }
+  bond = new_bond(style,trysuffix,sflag);
+  store_style(bond_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
-   generate a bond class, fist with suffix appended
+   generate a bond class, first with suffix appended
 ------------------------------------------------------------------------- */
 
-Bond *Force::new_bond(const char *style, const char *suffix, int &sflag)
+Bond *Force::new_bond(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+      
+      if (0) return NULL;
+
+#define BOND_CLASS
+#define BondStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_bond.h"
+#undef BondStyle
+#undef BOND_CLASS
+    }
 
-    if (0) return NULL;
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      
+      if (0) return NULL;
 
 #define BOND_CLASS
 #define BondStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_bond.h"
 #undef BondStyle
 #undef BOND_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define BOND_CLASS
 #define BondStyle(key,Class) \
   else if (strcmp(style,#key) == 0) return new Class(lmp);
 #include "style_bond.h"
 #undef BOND_CLASS
 
   else error->all(FLERR,"Invalid bond style");
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    return ptr to current bond class or hybrid sub-class if matches style
 ------------------------------------------------------------------------- */
 
 Bond *Force::bond_match(const char *style)
 {
   if (strcmp(bond_style,style) == 0) return bond;
   else if (strcmp(bond_style,"hybrid") == 0) {
     BondHybrid *hybrid = (BondHybrid *) bond;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strcmp(hybrid->keywords[i],style) == 0) return hybrid->styles[i];
   }
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    create an angle style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_angle(const char *style, const char *suffix)
+void Force::create_angle(const char *style, int trysuffix)
 {
   delete [] angle_style;
   if (angle) delete angle;
 
   int sflag;
-  angle = new_angle(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    angle_style = new char[n];
-    strcpy(angle_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    angle_style = new char[n];
-    strcpy(angle_style,style);
-  }
+  angle = new_angle(style,trysuffix,sflag);
+  store_style(angle_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
-   generate an angle class
+   generate an angle class, first trying suffix then suffix2 if trysuffix = 1
 ------------------------------------------------------------------------- */
 
-Angle *Force::new_angle(const char *style, const char *suffix, int &sflag)
+Angle *Force::new_angle(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-
-    if (0) return NULL;
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {                 // 1st try: style/suffix
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+
+      if (0) return NULL;
 
 #define ANGLE_CLASS
 #define AngleStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_angle.h"
 #undef AngleStyle
 #undef ANGLE_CLASS
+    }
 
+    if (lmp->suffix2) {                // 2nd try: style/suffix2
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+
+      if (0) return NULL;
+
+#define ANGLE_CLASS
+#define AngleStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_angle.h"
+#undef AngleStyle
+#undef ANGLE_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define ANGLE_CLASS
 #define AngleStyle(key,Class) \
   else if (strcmp(style,#key) == 0) return new Class(lmp);
 #include "style_angle.h"
 #undef ANGLE_CLASS
 
   else error->all(FLERR,"Invalid angle style");
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    create a dihedral style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_dihedral(const char *style, const char *suffix)
+void Force::create_dihedral(const char *style, int trysuffix)
 {
   delete [] dihedral_style;
   if (dihedral) delete dihedral;
 
   int sflag;
-  dihedral = new_dihedral(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    dihedral_style = new char[n];
-    strcpy(dihedral_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    dihedral_style = new char[n];
-    strcpy(dihedral_style,style);
-  }
+  dihedral = new_dihedral(style,trysuffix,sflag);
+  store_style(dihedral_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
-   generate a dihedral class
+   generate a dihedral class, first trying suffix then suffix2 if trysuffix = 1
 ------------------------------------------------------------------------- */
 
-Dihedral *Force::new_dihedral(const char *style, const char *suffix, int &sflag)
+Dihedral *Force::new_dihedral(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {                 // 1st try: style/suffix
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define DIHEDRAL_CLASS
 #define DihedralStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_dihedral.h"
 #undef DihedralStyle
 #undef DIHEDRAL_CLASS
+    }
 
+    if (lmp->suffix2) {                // 2nd try: style/suffix2
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+
+      if (0) return NULL;
+
+#define DIHEDRAL_CLASS
+#define DihedralStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_dihedral.h"
+#undef DihedralStyle
+#undef DIHEDRAL_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define DIHEDRAL_CLASS
 #define DihedralStyle(key,Class) \
   else if (strcmp(style,#key) == 0) return new Class(lmp);
 #include "style_dihedral.h"
 #undef DihedralStyle
 #undef DIHEDRAL_CLASS
 
   else error->all(FLERR,"Invalid dihedral style");
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    create an improper style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_improper(const char *style, const char *suffix)
+void Force::create_improper(const char *style, int trysuffix)
 {
   delete [] improper_style;
   if (improper) delete improper;
 
   int sflag;
-  improper = new_improper(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    improper_style = new char[n];
-    strcpy(improper_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    improper_style = new char[n];
-    strcpy(improper_style,style);
-  }
+  improper = new_improper(style,trysuffix,sflag);
+  store_style(improper_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate a improper class
 ------------------------------------------------------------------------- */
 
-Improper *Force::new_improper(const char *style, const char *suffix, int &sflag)
+Improper *Force::new_improper(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define IMPROPER_CLASS
 #define ImproperStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_improper.h"
 #undef ImproperStyle
 #undef IMPROPER_CLASS
+    }
 
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+
+      if (0) return NULL;
+
+#define IMPROPER_CLASS
+#define ImproperStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_improper.h"
+#undef ImproperStyle
+#undef IMPROPER_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define IMPROPER_CLASS
 #define ImproperStyle(key,Class) \
   else if (strcmp(style,#key) == 0) return new Class(lmp);
 #include "style_improper.h"
 #undef IMPROPER_CLASS
 
   else error->all(FLERR,"Invalid improper style");
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    return ptr to current improper class or hybrid sub-class if matches style
 ------------------------------------------------------------------------- */
 
 Improper *Force::improper_match(const char *style)
 {
   if (strcmp(improper_style,style) == 0) return improper;
   else if (strcmp(improper_style,"hybrid") == 0) {
-    ImproperHybrid *hybrid = (ImproperHybrid *) bond;
+    ImproperHybrid *hybrid = (ImproperHybrid *) improper;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strcmp(hybrid->keywords[i],style) == 0) return hybrid->styles[i];
   }
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    new kspace style
 ------------------------------------------------------------------------- */
 
-void Force::create_kspace(int narg, char **arg, const char *suffix)
+void Force::create_kspace(int narg, char **arg, int trysuffix)
 {
   delete [] kspace_style;
   if (kspace) delete kspace;
 
   int sflag;
-  kspace = new_kspace(narg,arg,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[0],suffix);
-    int n = strlen(estyle) + 1;
-    kspace_style = new char[n];
-    strcpy(kspace_style,estyle);
-  } else {
-    int n = strlen(arg[0]) + 1;
-    kspace_style = new char[n];
-    strcpy(kspace_style,arg[0]);
-  }
+  kspace = new_kspace(narg,arg,trysuffix,sflag);
+  store_style(kspace_style,arg[0],sflag);
 
   if (comm->style == 1 && !kspace_match("ewald",0))
     error->all(FLERR,
                "Cannot yet use KSpace solver with grid with comm style tiled");
 }
 
 /* ----------------------------------------------------------------------
-   generate a kspace class
+   generate a kspace class, first trying suffix then suffix2 if trysuffix = 1
 ------------------------------------------------------------------------- */
 
-KSpace *Force::new_kspace(int narg, char **arg, const char *suffix, int &sflag)
+KSpace *Force::new_kspace(int narg, char **arg, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[0],suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {                 // 1st try: style/suffix
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[0],lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define KSPACE_CLASS
 #define KSpaceStyle(key,Class) \
-  else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]);
 #include "style_kspace.h"
 #undef KSpaceStyle
 #undef KSPACE_CLASS
+    }
 
+    if (lmp->suffix2) {                // 2nd try: style/suffix2
+      sflag = 2;                       // so store_style() appends suffix2
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[0],lmp->suffix2);
+
+      if (0) return NULL;
+
+#define KSPACE_CLASS
+#define KSpaceStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]);
+#include "style_kspace.h"
+#undef KSpaceStyle
+#undef KSPACE_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(arg[0],"none") == 0) return NULL;
 
 #define KSPACE_CLASS
 #define KSpaceStyle(key,Class) \
   else if (strcmp(arg[0],#key) == 0) return  new Class(lmp,narg-1,&arg[1]);
 #include "style_kspace.h"
 #undef KSPACE_CLASS
 
   else error->all(FLERR,"Invalid kspace style");
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    return ptr to Kspace class if matches word
    if exact, then style name must be exact match to word
    if not exact, style name must contain word
    return NULL if no match
 ------------------------------------------------------------------------- */
 
 KSpace *Force::kspace_match(const char *word, int exact)
 {
   if (exact && strcmp(kspace_style,word) == 0) return kspace;
   else if (!exact && strstr(kspace_style,word)) return kspace;
   return NULL;
 }
 
+/* ----------------------------------------------------------------------
+   store style name in str, which is allocated here with new[]
+   if sflag = 0, no suffix
+   if sflag = 1/2, append suffix or suffix2 to style as style/suffix
+   str is sized from the actual strings; a NULL suffix stores bare style
+------------------------------------------------------------------------- */
+
+void Force::store_style(char *&str, const char *style, int sflag)
+{
+  const char *sfx = NULL;
+  if (sflag == 1) sfx = lmp->suffix;
+  else if (sflag) sfx = lmp->suffix2;
+
+  if (sfx) {
+    str = new char[strlen(style) + strlen(sfx) + 2];
+    sprintf(str,"%s/%s",style,sfx);
+  } else {
+    str = new char[strlen(style) + 1];
+    strcpy(str,style);
+  }
+}
+
 /* ----------------------------------------------------------------------
    set special bond values
 ------------------------------------------------------------------------- */
 
 void Force::set_special(int narg, char **arg)
 {
   if (narg == 0) error->all(FLERR,"Illegal special_bonds command");
 
   // defaults, but do not reset special_extra
 
   special_lj[1] = special_lj[2] = special_lj[3] = 0.0;
   special_coul[1] = special_coul[2] = special_coul[3] = 0.0;
   special_angle = special_dihedral = 0;
 
   int iarg = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"amber") == 0) {
       if (iarg+1 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_lj[1] = 0.0;
       special_lj[2] = 0.0;
       special_lj[3] = 0.5;
       special_coul[1] = 0.0;
       special_coul[2] = 0.0;
       special_coul[3] = 5.0/6.0;
       iarg += 1;
     } else if (strcmp(arg[iarg],"charmm") == 0) {
       if (iarg+1 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_lj[1] = 0.0;
       special_lj[2] = 0.0;
       special_lj[3] = 0.0;
       special_coul[1] = 0.0;
       special_coul[2] = 0.0;
       special_coul[3] = 0.0;
       iarg += 1;
     } else if (strcmp(arg[iarg],"dreiding") == 0) {
       if (iarg+1 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_lj[1] = 0.0;
       special_lj[2] = 0.0;
       special_lj[3] = 1.0;
       special_coul[1] = 0.0;
       special_coul[2] = 0.0;
       special_coul[3] = 1.0;
       iarg += 1;
     } else if (strcmp(arg[iarg],"fene") == 0) {
       if (iarg+1 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_lj[1] = 0.0;
       special_lj[2] = 1.0;
       special_lj[3] = 1.0;
       special_coul[1] = 0.0;
       special_coul[2] = 1.0;
       special_coul[3] = 1.0;
       iarg += 1;
     } else if (strcmp(arg[iarg],"lj/coul") == 0) {
       if (iarg+4 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_lj[1] = special_coul[1] = atof(arg[iarg+1]);
       special_lj[2] = special_coul[2] = atof(arg[iarg+2]);
       special_lj[3] = special_coul[3] = atof(arg[iarg+3]);
       iarg += 4;
     } else if (strcmp(arg[iarg],"lj") == 0) {
       if (iarg+4 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_lj[1] = atof(arg[iarg+1]);
       special_lj[2] = atof(arg[iarg+2]);
       special_lj[3] = atof(arg[iarg+3]);
       iarg += 4;
     } else if (strcmp(arg[iarg],"coul") == 0) {
       if (iarg+4 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_coul[1] = atof(arg[iarg+1]);
       special_coul[2] = atof(arg[iarg+2]);
       special_coul[3] = atof(arg[iarg+3]);
       iarg += 4;
     } else if (strcmp(arg[iarg],"angle") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal special_bonds command");
       if (strcmp(arg[iarg+1],"no") == 0) special_angle = 0;
       else if (strcmp(arg[iarg+1],"yes") == 0) special_angle = 1;
       else error->all(FLERR,"Illegal special_bonds command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"dihedral") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal special_bonds command");
       if (strcmp(arg[iarg+1],"no") == 0) special_dihedral = 0;
       else if (strcmp(arg[iarg+1],"yes") == 0) special_dihedral = 1;
       else error->all(FLERR,"Illegal special_bonds command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"extra") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal special_bonds command");
       special_extra = atoi(arg[iarg+1]);
       iarg += 2;
     } else error->all(FLERR,"Illegal special_bonds command");
   }
 
   for (int i = 1; i <= 3; i++)
     if (special_lj[i] < 0.0 || special_lj[i] > 1.0 ||
         special_coul[i] < 0.0 || special_coul[i] > 1.0)
       error->all(FLERR,"Illegal special_bonds command");
 
   if (special_extra < 0) error->all(FLERR,"Illegal special_bonds command");
 }
 
 /* ----------------------------------------------------------------------
    compute bounds implied by numeric str with a possible wildcard asterik
    1 = lower bound, nmax = upper bound
    5 possibilities:
      (1) i = i to i, (2) * = nmin to nmax,
      (3) i* = i to nmax, (4) *j = nmin to j, (5) i*j = i to j
    return nlo,nhi
 ------------------------------------------------------------------------- */
 
 void Force::bounds(char *str, int nmax, int &nlo, int &nhi, int nmin)
 {
   char *ptr = strchr(str,'*');
 
   if (ptr == NULL) {
     nlo = nhi = atoi(str);
   } else if (strlen(str) == 1) {
     nlo = nmin;
     nhi = nmax;
   } else if (ptr == str) {
     nlo = nmin;
     nhi = atoi(ptr+1);
   } else if (strlen(ptr+1) == 0) {
     nlo = atoi(str);
     nhi = nmax;
   } else {
     nlo = atoi(str);
     nhi = atoi(ptr+1);
   }
 
   if (nlo < nmin || nhi > nmax) 
     error->all(FLERR,"Numeric index is out of bounds");
 }
 
 /* ----------------------------------------------------------------------
    compute bounds implied by numeric str with a possible wildcard asterik
    1 = lower bound, nmax = upper bound
    5 possibilities:
      (1) i = i to i, (2) * = nmin to nmax,
      (3) i* = i to nmax, (4) *j = nmin to j, (5) i*j = i to j
    return nlo,nhi
 ------------------------------------------------------------------------- */
 
 void Force::boundsbig(char *str, bigint nmax, bigint &nlo, bigint &nhi, 
                       bigint nmin)
 {
   char *ptr = strchr(str,'*');
 
   if (ptr == NULL) {
     nlo = nhi = ATOBIGINT(str);
   } else if (strlen(str) == 1) {
     nlo = nmin;
     nhi = nmax;
   } else if (ptr == str) {
     nlo = nmin;
     nhi = ATOBIGINT(ptr+1);
   } else if (strlen(ptr+1) == 0) {
     nlo = ATOBIGINT(str);
     nhi = nmax;
   } else {
     nlo = ATOBIGINT(str);
     nhi = ATOBIGINT(ptr+1);
   }
 
   if (nlo < nmin || nhi > nmax) 
     error->all(FLERR,"Numeric index is out of bounds");
 }
 
 /* ----------------------------------------------------------------------
    read a floating point value from a string
    generate an error if not a legitimate floating point value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
 double Force::numeric(const char *file, int line, char *str)
 {
   if (!str)
     error->all(file,line,"Expected floating point parameter "
                "in input script or data file");
   int n = strlen(str);
   if (n == 0)
     error->all(file,line,"Expected floating point parameter "
                "in input script or data file");
 
   for (int i = 0; i < n; i++) {
     if (isdigit(str[i])) continue;
     if (str[i] == '-' || str[i] == '+' || str[i] == '.') continue;
     if (str[i] == 'e' || str[i] == 'E') continue;
     error->all(file,line,"Expected floating point parameter "
                "in input script or data file");
   }
 
   return atof(str);
 }
 
 /* ----------------------------------------------------------------------
    read an integer value from a string
    generate an error if not a legitimate integer value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
 int Force::inumeric(const char *file, int line, char *str)
 {
   if (!str) 
     error->all(file,line,
                "Expected integer parameter in input script or data file");
   int n = strlen(str);
   if (n == 0) 
     error->all(file,line,
                "Expected integer parameter in input script or data file");
 
   for (int i = 0; i < n; i++) {
     if (isdigit(str[i]) || str[i] == '-' || str[i] == '+') continue;
     error->all(file,line,
                "Expected integer parameter in input script or data file");
   }
 
   return atoi(str);
 }
 
 /* ----------------------------------------------------------------------
    read a big integer value from a string
    generate an error if not a legitimate integer value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
 bigint Force::bnumeric(const char *file, int line, char *str)
 {
   if (!str) 
     error->all(file,line,
                "Expected integer parameter in input script or data file");
   int n = strlen(str);
   if (n == 0) 
     error->all(file,line,
                "Expected integer parameter in input script or data file");
 
   for (int i = 0; i < n; i++) {
     if (isdigit(str[i]) || str[i] == '-' || str[i] == '+') continue;
     error->all(file,line,
                "Expected integer parameter in input script or data file");
   }
 
   return ATOBIGINT(str);
 }
 
 /* ----------------------------------------------------------------------
    read a tag integer value from a string
    generate an error if not a legitimate integer value
    called by various commands to check validity of their arguments
 ------------------------------------------------------------------------- */
 
 tagint Force::tnumeric(const char *file, int line, char *str)
 {
   if (!str) 
     error->all(file,line,
                "Expected integer parameter in input script or data file");
   int n = strlen(str);
   if (n == 0) 
     error->all(file,line,
                "Expected integer parameter in input script or data file");
 
   for (int i = 0; i < n; i++) {
     if (isdigit(str[i]) || str[i] == '-' || str[i] == '+') continue;
     error->all(file,line,
                "Expected integer parameter in input script or data file");
   }
 
   return ATOTAGINT(str);
 }
 
 /* ----------------------------------------------------------------------
    open a potential file as specified by name; failing that,
    search in dir specified by env variable LAMMPS_POTENTIALS
 ------------------------------------------------------------------------- */
 
 FILE *Force::open_potential(const char *name)
 {
   FILE *fp;
 
   if (name == NULL) return NULL;
 
   // attempt to open file directly
   // if successful, return ptr
 
   fp = fopen(name,"r");
   if (fp) return fp;
 
   // try the environment variable directory
 
   const char *path = getenv("LAMMPS_POTENTIALS");
   if (path == NULL) return NULL;
 
   const char *pot = potname(name);
   if (pot == NULL) return NULL;
 
   size_t len1 = strlen(path);
   size_t len2 = strlen(pot);
   char *newpath = new char[len1+len2+2];
 
   strcpy(newpath,path);
 #if defined(_WIN32)
   newpath[len1] = '\\';
   newpath[len1+1] = 0;
 #else
   newpath[len1] = '/';
   newpath[len1+1] = 0;
 #endif
   strcat(newpath,pot);
 
   fp = fopen(newpath,"r");
   delete[] newpath;
   return fp;
 }
 
 /* ----------------------------------------------------------------------
    strip off leading part of path, return just the filename
 ------------------------------------------------------------------------- */
 
 const char *Force::potname(const char *path)
 {
   const char *pot;
 
   if (path == NULL) return NULL;
 
 #if defined(_WIN32)
   // skip over the disk drive part of windows pathnames
   if (isalpha(path[0]) && path[1] == ':') 
     path += 2;
 #endif
 
   for (pot = path; *path != '\0'; ++path) {
 #if defined(_WIN32)
     if ((*path == '\\') || (*path == '/')) pot = path + 1;
 #else
     if (*path == '/') pot = path + 1;
 #endif
   }
 
   return pot;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of force classes
 ------------------------------------------------------------------------- */
 
 bigint Force::memory_usage()
 {
   bigint bytes = 0;
   if (pair) bytes += static_cast<bigint> (pair->memory_usage());
   if (bond) bytes += static_cast<bigint> (bond->memory_usage());
   if (angle) bytes += static_cast<bigint> (angle->memory_usage());
   if (dihedral) bytes += static_cast<bigint> (dihedral->memory_usage());
   if (improper) bytes += static_cast<bigint> (improper->memory_usage());
   if (kspace) bytes += static_cast<bigint> (kspace->memory_usage());
   return bytes;
 }
diff --git a/src/force.h b/src/force.h
index bf364f253..f857c1a11 100644
--- a/src/force.h
+++ b/src/force.h
@@ -1,161 +1,162 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_FORCE_H
 #define LMP_FORCE_H
 
 #include "pointers.h"
 #include <map>
 #include <string>
 
 namespace LAMMPS_NS {
 
 class Force : protected Pointers {
  public:
   double boltz;                      // Boltzmann constant (eng/degree-K)
   double hplanck;                    // Planck's constant (energy-time)
   double mvv2e;                      // conversion of mv^2 to energy
   double ftm2v;                      // conversion of ft/m to velocity
   double mv2d;                       // conversion of mass/volume to density
   double nktv2p;                     // conversion of NkT/V to pressure
   double qqr2e;                      // conversion of q^2/r to energy
   double qe2f;                       // conversion of qE to force
   double vxmu2f;                     // conversion of vx dynamic-visc to force
   double xxt2kmu;                    // conversion of xx/t to kinematic-visc
   double dielectric;                 // dielectric constant
   double qqrd2e;                     // q^2/r to energy w/ dielectric constant
   double e_mass;                     // electron mass
   double hhmrr2e;                    // conversion of (hbar)^2/(mr^2) to energy
   double mvh2r;                      // conversion of mv/hbar to distance
                                      // hbar = h/(2*pi)
   double angstrom;                   // 1 angstrom in native units
   double femtosecond;                // 1 femtosecond in native units
   double qelectron;                  // 1 electron charge abs() in native units
 
   int newton,newton_pair,newton_bond;   // Newton's 3rd law settings
 
   class Pair *pair;
   char *pair_style;
 
   typedef Pair *(*PairCreator)(LAMMPS *);
   std::map<std::string,PairCreator> *pair_map;
 
   class Bond *bond;
   char *bond_style;
 
   class Angle *angle;
   char *angle_style;
 
   class Dihedral *dihedral;
   char *dihedral_style;
 
   class Improper *improper;
   char *improper_style;
 
   class KSpace *kspace;
   char *kspace_style;
                              // index [0] is not used in these arrays
   double special_lj[4];      // 1-2, 1-3, 1-4 prefactors for LJ
   double special_coul[4];    // 1-2, 1-3, 1-4 prefactors for Coulombics
   int special_angle;         // 0 if defined angles are ignored
                              // 1 if only weight 1,3 atoms if in an angle
   int special_dihedral;      // 0 if defined dihedrals are ignored
                              // 1 if only weight 1,4 atoms if in a dihedral
   int special_extra;         // extra space for added bonds
 
   Force(class LAMMPS *);
   ~Force();
   void init();
 
-  void create_pair(const char *, const char *suffix = NULL);
-  class Pair *new_pair(const char *, const char *, int &);
+  void create_pair(const char *, int);
+  class Pair *new_pair(const char *, int, int &);
   class Pair *pair_match(const char *, int);
 
-  void create_bond(const char *, const char *suffix = NULL);
-  class Bond *new_bond(const char *, const char *, int &);
+  void create_bond(const char *, int);
+  class Bond *new_bond(const char *, int, int &);
   class Bond *bond_match(const char *);
 
-  void create_angle(const char *, const char *suffix = NULL);
-  class Angle *new_angle(const char *, const char *, int &);
+  void create_angle(const char *, int);
+  class Angle *new_angle(const char *, int, int &);
 
-  void create_dihedral(const char *, const char *suffix = NULL);
-  class Dihedral *new_dihedral(const char *, const char *, int &);
+  void create_dihedral(const char *, int);
+  class Dihedral *new_dihedral(const char *, int, int &);
 
-  void create_improper(const char *, const char *suffix = NULL);
-  class Improper *new_improper(const char *, const char *, int &);
+  void create_improper(const char *, int);
+  class Improper *new_improper(const char *, int, int &);
   class Improper *improper_match(const char *);
 
-  void create_kspace(int, char **, const char *suffix = NULL);
-  class KSpace *new_kspace(int, char **, const char *, int &);
+  void create_kspace(int, char **, int);               // narg, arg, trysuffix
+  class KSpace *new_kspace(int, char **, int, int &);  // narg, arg, trysuffix, sflag (output)
   class KSpace *kspace_match(const char *, int);
 
+  void store_style(char *&, const char *, int);        // str (allocated here), style, sflag
   void set_special(int, char **);
   void bounds(char *, int, int &, int &, int nmin=1);
   void boundsbig(char *, bigint, bigint &, bigint &, bigint nmin=1);
   double numeric(const char *, int, char *);
   int inumeric(const char *, int, char *);
   bigint bnumeric(const char *, int, char *);
   tagint tnumeric(const char *, int, char *);
 
   FILE *open_potential(const char *);
   const char *potname(const char *);
 
   bigint memory_usage();
 
  private:
   template <typename T> static Pair *pair_creator(LAMMPS *);
 };
 
 }
 
 #endif
 
 /* ERROR/WARNING messages:
 
 E: Invalid pair style
 
 The choice of pair style is unknown.
 
 E: Invalid bond style
 
 The choice of bond style is unknown.
 
 E: Invalid angle style
 
 The choice of angle style is unknown.
 
 E: Invalid dihedral style
 
 The choice of dihedral style is unknown.
 
 E: Invalid improper style
 
 The choice of improper style is unknown.
 
 E: Invalid kspace style
 
 The choice of kspace style is unknown.
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Numeric index is out of bounds
 
 A command with an argument that specifies an integer or range of
 integers is using a value that is less than 1 or greater than the
 maximum allowed limit.
 
 */
diff --git a/src/improper_hybrid.cpp b/src/improper_hybrid.cpp
index 9212051e3..09e73ac9b 100644
--- a/src/improper_hybrid.cpp
+++ b/src/improper_hybrid.cpp
@@ -1,338 +1,339 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "string.h"
 #include "ctype.h"
 #include "improper_hybrid.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define EXTRA 1000
 
 /* ---------------------------------------------------------------------- */
 
 ImproperHybrid::ImproperHybrid(LAMMPS *lmp) : Improper(lmp)
 {
   nstyles = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 ImproperHybrid::~ImproperHybrid()
 {
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] nimproperlist;
     delete [] maximproper;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(improperlist[i]);
     delete [] improperlist;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ImproperHybrid::compute(int eflag, int vflag)
 {
   int i,j,m,n;
 
   // save ptrs to original improperlist
 
   int nimproperlist_orig = neighbor->nimproperlist;
   int **improperlist_orig = neighbor->improperlist;
 
   // if this is re-neighbor step, create sub-style improperlists
   // nimproperlist[] = length of each sub-style list
   // realloc sub-style improperlist if necessary
   // load sub-style improperlist with 5 values from original improperlist
 
   if (neighbor->ago == 0) {
     for (m = 0; m < nstyles; m++) nimproperlist[m] = 0;
     for (i = 0; i < nimproperlist_orig; i++) {
       m = map[improperlist_orig[i][4]];
-      nimproperlist[m]++;
+      if (m >= 0) nimproperlist[m]++;   // map = -1 for types set to "none": do not index with it
     }
     for (m = 0; m < nstyles; m++) {
       if (nimproperlist[m] > maximproper[m]) {
         memory->destroy(improperlist[m]);
         maximproper[m] = nimproperlist[m] + EXTRA;
         memory->create(improperlist[m],maximproper[m],5,
                        "improper_hybrid:improperlist");
       }
       nimproperlist[m] = 0;
     }
     for (i = 0; i < nimproperlist_orig; i++) {
       m = map[improperlist_orig[i][4]];
       if (m < 0) continue;
       n = nimproperlist[m];
       improperlist[m][n][0] = improperlist_orig[i][0];
       improperlist[m][n][1] = improperlist_orig[i][1];
       improperlist[m][n][2] = improperlist_orig[i][2];
       improperlist[m][n][3] = improperlist_orig[i][3];
       improperlist[m][n][4] = improperlist_orig[i][4];
       nimproperlist[m]++;
     }
   }
 
   // call each sub-style's compute function
   // set neighbor->improperlist to sub-style improperlist before call
   // accumulate sub-style global/peratom energy/virial in hybrid
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   for (m = 0; m < nstyles; m++) {
     neighbor->nimproperlist = nimproperlist[m];
     neighbor->improperlist = improperlist[m];
 
     styles[m]->compute(eflag,vflag);
 
     if (eflag_global) energy += styles[m]->energy;
     if (vflag_global)
       for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
     if (eflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double *eatom_substyle = styles[m]->eatom;
       for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
     }
     if (vflag_atom) {
       n = atom->nlocal;
       if (force->newton_bond) n += atom->nghost;
       double **vatom_substyle = styles[m]->vatom;
       for (i = 0; i < n; i++)
         for (j = 0; j < 6; j++)
           vatom[i][j] += vatom_substyle[i][j];
     }
   }
 
   // restore ptrs to original improperlist
 
   neighbor->nimproperlist = nimproperlist_orig;
   neighbor->improperlist = improperlist_orig;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ImproperHybrid::allocate()
 {
   allocated = 1;
   int n = atom->nimpropertypes;
 
   memory->create(map,n+1,"improper:map");
   memory->create(setflag,n+1,"improper:setflag");
   for (int i = 1; i <= n; i++) setflag[i] = 0;
 
   nimproperlist = new int[nstyles];
   maximproper = new int[nstyles];
   improperlist = new int**[nstyles];
   for (int m = 0; m < nstyles; m++) maximproper[m] = 0;
   for (int m = 0; m < nstyles; m++) improperlist[m] = NULL;
 }
 
 /* ----------------------------------------------------------------------
    create one improper style for each arg in list
 ------------------------------------------------------------------------- */
 
 void ImproperHybrid::settings(int narg, char **arg)
 {
   int i,m,istyle;
 
   if (narg < 1) error->all(FLERR,"Illegal improper_style command");
 
   // delete old lists, since cannot just change settings
 
   if (nstyles) {
     for (int i = 0; i < nstyles; i++) delete styles[i];
     delete [] styles;
     for (int i = 0; i < nstyles; i++) delete [] keywords[i];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(map);
     delete [] nimproperlist;
     delete [] maximproper;
     for (int i = 0; i < nstyles; i++)
       memory->destroy(improperlist[i]);
     delete [] improperlist;
   }
   allocated = 0;
 
   // count sub-styles by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric word
   // need a better way to skip these exceptions
 
   nstyles = 0;
   i = 0;
   while (i < narg) {
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     nstyles++;
   }
 
   // allocate list of sub-styles
 
   styles = new Improper*[nstyles];
   keywords = new char*[nstyles];
 
   // allocate each sub-style and call its settings() with subset of args
   // define subset of args for a sub-style by skipping numeric args
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;   // filled in by new_improper(), consumed by store_style()
   nstyles = 0;
   i = 0;
 
   while (i < narg) {
     for (m = 0; m < nstyles; m++)
       if (strcmp(arg[i],keywords[m]) == 0)
         error->all(FLERR,"Improper style hybrid cannot use "
                    "same improper style twice");
     if (strcmp(arg[i],"hybrid") == 0)
       error->all(FLERR,
                  "Improper style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Improper style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_improper(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+    // create the sub-style (2nd arg 1 = presumably "try suffixes"), then record its keyword
+    styles[nstyles] = force->new_improper(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],sflag);
+    // NOTE(review): keyword gets the suffix when sflag != 0; dup check above and coeff() strcmp against it
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     styles[nstyles]->settings(i-istyle-1,&arg[istyle+1]);
     nstyles++;
   }
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one type
 ---------------------------------------------------------------------- */
 
 void ImproperHybrid::coeff(int narg, char **arg)
 {
   if (!allocated) allocate();
 
   int ilo,ihi;
   force->bounds(arg[0],atom->nimpropertypes,ilo,ihi);
 
   // 2nd arg = improper sub-style name
   // allow for "none" as valid sub-style name
 
   int m;
   for (m = 0; m < nstyles; m++)
     if (strcmp(arg[1],keywords[m]) == 0) break;
 
   int none = 0;
   if (m == nstyles) {
     if (strcmp(arg[1],"none") == 0) none = 1;
     else error->all(FLERR,"Improper coeff for hybrid has invalid style");
   }
 
   // move 1st arg to 2nd arg
   // just copy ptrs, since arg[] points into original input line
 
   arg[1] = arg[0];
 
   // invoke sub-style coeff() starting with 1st arg
 
   if (!none) styles[m]->coeff(narg-1,&arg[1]);
 
   // set setflag and which type maps to which sub-style
   // if sub-style is none: set hybrid setflag, wipe out map
 
   for (int i = ilo; i <= ihi; i++) {
     if (none) {
       setflag[i] = 1;
       map[i] = -1;
     } else {
       setflag[i] = styles[m]->setflag[i];
       map[i] = m;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void ImproperHybrid::write_restart(FILE *fp)
 {
   fwrite(&nstyles,sizeof(int),1,fp);
 
   int n;
   for (int m = 0; m < nstyles; m++) {
     n = strlen(keywords[m]) + 1;
     fwrite(&n,sizeof(int),1,fp);
     fwrite(keywords[m],sizeof(char),n,fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void ImproperHybrid::read_restart(FILE *fp)
 {
   int me = comm->me;
   if (me == 0) fread(&nstyles,sizeof(int),1,fp);
   MPI_Bcast(&nstyles,1,MPI_INT,0,world);
   styles = new Improper*[nstyles];
   keywords = new char*[nstyles];
 
   allocate();
 
   int n,dummy;
   for (int m = 0; m < nstyles; m++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_improper(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_improper(keywords[m],0,dummy);  // 0: presumably no suffix lookup, keyword used as-is; confirm
   }
 }
 
 /* ----------------------------------------------------------------------
    memory usage
 ------------------------------------------------------------------------- */
 
 double ImproperHybrid::memory_usage()
 {
   double bytes = maxeatom * sizeof(double);
   bytes += maxvatom*6 * sizeof(double);
   for (int m = 0; m < nstyles; m++) bytes += maximproper[m]*5 * sizeof(int);
   for (int m = 0; m < nstyles; m++)
     if (styles[m]) bytes += styles[m]->memory_usage();
   return bytes;
 }
diff --git a/src/input.cpp b/src/input.cpp
index 785929bb2..ac91ab546 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -1,1629 +1,1696 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "ctype.h"
 #include "unistd.h"
 #include "sys/stat.h"
 #include "input.h"
 #include "style_command.h"
 #include "universe.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "comm_brick.h"
 #include "comm_tiled.h"
 #include "group.h"
 #include "domain.h"
 #include "output.h"
 #include "thermo.h"
 #include "force.h"
 #include "pair.h"
 #include "min.h"
 #include "modify.h"
 #include "compute.h"
 #include "bond.h"
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
 #include "kspace.h"
 #include "update.h"
 #include "neighbor.h"
 #include "special.h"
 #include "variable.h"
 #include "accelerator_cuda.h"
 #include "accelerator_kokkos.h"
 #include "error.h"
 #include "memory.h"
 
 #ifdef _OPENMP
 #include "omp.h"
 #endif
 
 #ifdef _WIN32
 #include <direct.h>
 #endif
 
 using namespace LAMMPS_NS;
 
 #define DELTALINE 256
 #define DELTA 4
 
 /* ---------------------------------------------------------------------- */
 
 Input::Input(LAMMPS *lmp, int argc, char **argv) : Pointers(lmp)
 {
   // set up empty line/copy/work buffers and arg list, default echo
   //   settings, the input file stack, the Variable instance, and the
   //   map of add-on commands; then process -var and -echo switches
 
   MPI_Comm_rank(world,&me);
 
   maxline = maxcopy = maxwork = 0;
   line = copy = work = NULL;
   narg = maxarg = 0;
   arg = NULL;
 
   echo_screen = 0;
   echo_log = 1;
 
   label_active = 0;
   labelstr = NULL;
   jump_skip = 0;
   ifthenelse_flag = 0;
 
   // only proc 0 keeps the list of open input files
   // slot 0 is the current infile
 
   if (me == 0) {
     nfile = maxfile = 1;
     infiles = (FILE **) memory->smalloc(sizeof(FILE *),"input:infiles");
     infiles[0] = infile;
   } else infiles = NULL;
 
   variable = new Variable(lmp);
 
   // fill map with commands listed in style_command.h
   // each CommandStyle(key,Class) entry expands to one map insertion
   //   keyed by the stringified key
 
   command_map = new std::map<std::string,CommandCreator>();
 
 #define COMMAND_CLASS
 #define CommandStyle(key,Class) \
   (*command_map)[#key] = &command_creator<Class>;
 #include "style_command.h"
 #undef CommandStyle
 #undef COMMAND_CLASS
 
   // process command-line args
   // check for args "-var" and "-echo"
   // caller has already checked that sufficient arguments exist
   // for -var: argv[iarg+1] is the name, argv[iarg+2] the first value
   //   jarg starts at iarg+3 and stops at the next arg that begins
   //   with '-', so jarg-iarg-2 values are passed to variable->set()
 
   int iarg = 0;
   while (iarg < argc) {
     if (strcmp(argv[iarg],"-var") == 0 || strcmp(argv[iarg],"-v") == 0) {
       int jarg = iarg+3;
       while (jarg < argc && argv[jarg][0] != '-') jarg++;
       variable->set(argv[iarg+1],jarg-iarg-2,&argv[iarg+2]);
       iarg = jarg;
     } else if (strcmp(argv[iarg],"-echo") == 0 ||
                strcmp(argv[iarg],"-e") == 0) {
       narg = 1;
       char **tmp = arg;        // trick echo() into using argv instead of arg
       arg = &argv[iarg+1];
       echo();
       arg = tmp;
       iarg += 2;
      } else iarg++;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 Input::~Input()
 {
   // the command string and the individual arg[] strings are not freed:
   //   they only point into memory that is released elsewhere
   // the buffers and the arg pointer array itself are owned here
 
   memory->sfree(line);
   memory->sfree(copy);
   memory->sfree(work);
 
   // delete [] on a NULL pointer is a no-op, so no guard is needed
 
   delete [] labelstr;
 
   memory->sfree(arg);
   memory->sfree(infiles);
 
   delete variable;
   delete command_map;
 }
 
 /* ----------------------------------------------------------------------
    process all input from infile
    infile = stdin or file if command-line arg "-in" was used
 ------------------------------------------------------------------------- */
 
 void Input::file()
 {
   // loop over lines of the current input file until the outermost
   //   file is exhausted
   // proc 0 reads each line, all procs receive it, parse it, execute it
 
   int m,n;
   
   while (1) {
     
     // read a line from input script
     // n = length of line including str terminator, 0 if end of file
     // if line ends in continuation char '&', concatenate next line
     
     if (me == 0) {
       m = 0;
       while (1) {
         // grow line when fewer than 2 bytes remain past offset m
         if (maxline-m < 2) reallocate(line,maxline,0);
         if (fgets(&line[m],maxline-m,infile) == NULL) {
           // end of file: keep any partial text already read (m > 0)
           if (m) n = strlen(line) + 1;
           else n = 0;
           break;
         }
         m = strlen(line);
         // no newline yet, so keep reading into the same line
         if (line[m-1] != '\n') continue;
         
         // back up over newline and trailing whitespace
         // m ends at last non-whitespace char or -1
         m--;
         while (m >= 0 && isspace(line[m])) m--;
         if (m < 0 || line[m] != '&') {
           line[m+1] = '\0';
           n = m+2;
           break;
         }
         // else line[m] is '&': next fgets() starts at m and overwrites it
       }
     }
     
     // bcast the line
     // if n = 0, end-of-file
     // error if label_active is set, since label wasn't encountered
     // if original input file, code is done
     // else go back to previous input file
     
     MPI_Bcast(&n,1,MPI_INT,0,world);
     if (n == 0) {
       if (label_active) error->all(FLERR,"Label wasn't found in input script");
       if (me == 0) {
         if (infile != stdin) {
           fclose(infile);
           infile = NULL;
         }
         nfile--;
       }
       MPI_Bcast(&nfile,1,MPI_INT,0,world);
       if (nfile == 0) break;
       if (me == 0) infile = infiles[nfile-1];
       continue;
     }
     
     // procs other than 0 may need a larger line buffer before receiving
     
     if (n > maxline) reallocate(line,maxline,n);
     MPI_Bcast(line,n,MPI_CHAR,0,world);
     
     // echo the command unless scanning for label
     
     if (me == 0 && label_active == 0) {
       if (echo_screen && screen) fprintf(screen,"%s\n",line);
       if (echo_log && logfile) fprintf(logfile,"%s\n",line);
     }
     
     // parse the line
     // if no command, skip to next line in input script
     
     parse();
     if (command == NULL) continue;
     
     // if scanning for label, skip command unless it's a label command
     
     if (label_active && strcmp(command,"label") != 0) continue;
     
     // execute the command
     // non-zero return means the command was not recognized
     
     if (execute_command()) {
       char *str = new char[maxline+32];
       sprintf(str,"Unknown command: %s",line);
       error->all(FLERR,str);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    process all input from filename
    called from library interface
 ------------------------------------------------------------------------- */
 
 void Input::file(const char *filename)
 {
   // error if another nested file still open, should not be possible
   // open new filename and set infile, infiles[0], nfile
   // call to file() will close filename and decrement nfile
   // only proc 0 opens the file, all procs then call file()
 
   if (me == 0) {
     if (nfile > 1)
       error->one(FLERR,"Invalid use of library file() function");
 
     if (infile && infile != stdin) fclose(infile);
     infile = fopen(filename,"r");
     if (infile == NULL) {
       // filename is caller-supplied and can be longer than the buffer,
       //   so the message is truncated rather than overflowing str
       char str[128];
       snprintf(str,sizeof(str),"Cannot open input script %s",filename);
       error->one(FLERR,str);
     }
     infiles[0] = infile;
     nfile = 1;
   }
 
   file();
 }
 
 /* ----------------------------------------------------------------------
    copy command in single to line, parse and execute it
    return command name to caller
 ------------------------------------------------------------------------- */
 
 char *Input::one(const char *single)
 {
   int n = strlen(single) + 1;
   if (n > maxline) reallocate(line,maxline,n);
   strcpy(line,single);
   
   // echo the command unless scanning for label
   
   if (me == 0 && label_active == 0) {
     if (echo_screen && screen) fprintf(screen,"%s\n",line);
     if (echo_log && logfile) fprintf(logfile,"%s\n",line);
   }
   
   // parse the line
   // if no command, just return NULL
   
   parse();
   if (command == NULL) return NULL;
   
   // if scanning for label, skip command unless it's a label command
   
   if (label_active && strcmp(command,"label") != 0) return NULL;
   
   // execute the command and return its name
   
   if (execute_command()) {
     char *str = new char[maxline+32];
     sprintf(str,"Unknown command: %s",line);
     error->all(FLERR,str);
   }
   
   return command;
 }
 
 /* ----------------------------------------------------------------------
    parse copy of command line by inserting string terminators
    strip comment = all chars from # on
    replace all $ via variable substitution
    command = first word
    narg = # of args
    arg[] = individual args
    treat text between single/double quotes as one arg
 ------------------------------------------------------------------------- */
 
 void Input::parse()
 {
   // on return: command = first word or NULL if line holds no word,
   //   narg and arg[] describe the remaining words
   // command and arg[] point into the copy buffer, line is left untouched
 
   // duplicate line into copy string to break into words
   
   int n = strlen(line) + 1;
   if (n > maxcopy) reallocate(copy,maxcopy,n);
   strcpy(copy,line);
   
   // strip any # comment by replacing it with 0
   // do not strip # inside single/double quotes
   // quote holds the currently open quote char, or 0 when outside quotes
   
   char quote = '\0';
   char *ptr = copy;
   while (*ptr) {
     if (*ptr == '#' && !quote) {
       *ptr = '\0';
       break;
     }
     if (*ptr == quote) quote = '\0';
     else if (*ptr == '"' || *ptr == '\'') quote = *ptr;
     ptr++;
   }
   
   // perform $ variable substitution (print changes)
   // except if searching for a label since earlier variable may not be defined
   
   if (!label_active) substitute(copy,work,maxcopy,maxwork,1);
   
   // command = 1st arg in copy string
   
   char *next;
   command = nextword(copy,&next);
   if (command == NULL) return;
   
   // point arg[] at each subsequent arg in copy string
   // nextword() inserts string terminators into copy string to delimit args
   // nextword() treats text between single/double quotes as one arg
   // arg[] is grown by DELTA entries whenever it is full
   
   narg = 0;
   ptr = next;
   while (ptr) {
     if (narg == maxarg) {
       maxarg += DELTA;
       arg = (char **) memory->srealloc(arg,maxarg*sizeof(char *),"input:arg");
     }
     arg[narg] = nextword(ptr,&next);
     if (!arg[narg]) break;
     narg++;
     ptr = next;
   }
 }
 
 /* ----------------------------------------------------------------------
    find next word in str
    insert 0 at end of word
    ignore leading whitespace
    treat text between single/double quotes as one arg
    matching quote must be followed by whitespace char if not end of string
    strip quotes from returned word
    return ptr to start of word
    return next = ptr after word or NULL if word ended with 0
    return NULL if no word in string
 ------------------------------------------------------------------------- */
 
 char *Input::nextword(char *str, char **next)
 {
   // str = text to scan, modified in place (terminator written after word)
   // next = set to char after the word, or NULL if word ended the string
   // returns start of word (past any opening quote), NULL if no word
 
   char *start,*stop;
 
   // skip leading whitespace
 
   start = &str[strspn(str," \t\n\v\f\r")];
   if (*start == '\0') return NULL;
 
   // quoted word runs to the matching quote char
   // unquoted word runs to the next whitespace char or end of string
   // isspace() requires a value representable as unsigned char,
   //   so cast to avoid undefined behavior for chars with high bit set
 
   if (*start == '"' || *start == '\'') {
     stop = strchr(&start[1],*start);
     if (!stop) error->all(FLERR,"Unbalanced quotes in input line");
     if (stop[1] && !isspace((unsigned char) stop[1]))
       error->all(FLERR,"Input line quote not followed by whitespace");
     start++;
   } else stop = &start[strcspn(start," \t\n\v\f\r")];
 
   // next must be set before stop is overwritten with the terminator
 
   if (*stop == '\0') *next = NULL;
   else *next = stop+1;
   *stop = '\0';
   return start;
 }
 
 /* ----------------------------------------------------------------------
    substitute for $ variables in str using work str2 and return it
    reallocate str/str2 to hold expanded version if necessary & reset max/max2
    print updated string if flag is set and not searching for label
    label_active will be 0 if called from external class
 ------------------------------------------------------------------------- */
 
 void Input::substitute(char *&str, char *&str2, int &max, int &max2, int flag)
 {
   // str = string to expand, holds the expanded result on return
   // str2 = scratch string the expansion is built in
   // max,max2 = allocated sizes of str,str2, updated if they are grown
   // flag = 1 to echo the string after each substitution
 
   // use str2 as scratch space to expand str, then copy back to str
   // reallocate str and str2 as necessary
   // do not replace $ inside single/double quotes
   // var = pts at variable name, ended by NULL
   //   if $ is followed by '{', trailing '}' becomes NULL
   //   else $x becomes x followed by NULL
   // beyond = points to text following variable
   
   int i,n,paren_count;
   char immediate[256];
   char *var,*value,*beyond;
   char quote = '\0';
   char *ptr = str;
   
   n = strlen(str) + 1;
   if (n > max2) reallocate(str2,max2,n);
   *str2 = '\0';
   char *ptr2 = str2;
   
   while (*ptr) {
     // variable substitution
     
     if (*ptr == '$' && !quote) {
       
       // value = ptr to expanded variable
       // variable name between curly braces, e.g. ${a}
       
       if (*(ptr+1) == '{') {
         var = ptr+2;
         i = 0;
         
         while (var[i] != '\0' && var[i] != '}') i++;
         
         if (var[i] == '\0') error->one(FLERR,"Invalid variable name");
         var[i] = '\0';
         // +3 skips '$', '{' and the overwritten '}'
         beyond = ptr + strlen(var) + 3;
         value = variable->retrieve(var);
         
         // immediate variable between parenthesis, e.g. $(1/2)
         
       } else if (*(ptr+1) == '(') {
         var = ptr+2;
         paren_count = 0;
         i = 0;
         
         // stop at the ')' that is not matched by a '(' inside the formula
         
         while (var[i] != '\0' && !(var[i] == ')' && paren_count == 0)) {
           switch (var[i]) {
           case '(': paren_count++; break;
           case ')': paren_count--; break;
           default: ;
           }
           i++;
         }
         
         if (var[i] == '\0') error->one(FLERR,"Invalid immediate variable");
         var[i] = '\0';
         // +3 skips '$', '(' and the overwritten ')'
         beyond = ptr + strlen(var) + 3;
         sprintf(immediate,"%.20g",variable->compute_equal(var));
         value = immediate;
         
         // single character variable name, e.g. $a
         // the name is shifted left over the '$' so it can be terminated
         // NOTE(review): if '$' is the last char of str, beyond points
         //   past the terminator; this looks safe only if retrieve()
         //   returns NULL for an empty name - confirm
         
       } else {
         var = ptr;
         var[0] = var[1];
         var[1] = '\0';
         beyond = ptr + 2;
         value = variable->retrieve(var);
       }
       
       if (value == NULL) error->one(FLERR,"Substitution for illegal variable");
       
       // check if storage in str2 needs to be expanded
       // re-initialize ptr and ptr2 to the point beyond the variable.
       
       n = strlen(str2) + strlen(value) + strlen(beyond) + 1;
       if (n > max2) reallocate(str2,max2,n);
       strcat(str2,value);
       ptr2 = str2 + strlen(str2);
       ptr = beyond;
       
       // output substitution progress if requested
       
       if (flag && me == 0 && label_active == 0) {
         if (echo_screen && screen) fprintf(screen,"%s%s\n",str2,beyond);
         if (echo_log && logfile) fprintf(logfile,"%s%s\n",str2,beyond);
       }
       
       continue;
     }
     
     // track whether ptr is inside a quoted section
     
     if (*ptr == quote) quote = '\0';
     else if (*ptr == '"' || *ptr == '\'') quote = *ptr;
     
     // copy current character into str2
     
     *ptr2++ = *ptr++;
     *ptr2 = '\0';
   }
   
   // set length of input str to length of work str2
   // copy work string back to input str
   
   if (max2 > max) reallocate(str,max,max2);
   strcpy(str,str2);
 }
 
 /* ----------------------------------------------------------------------
    reallocate a string
    if n > 0: set max >= n in increments of DELTALINE
    if n = 0: just increment max by DELTALINE
 ------------------------------------------------------------------------- */
 
 void Input::reallocate(char *&str, int &max, int n)
 {
   // n = 0 asks for one more chunk of DELTALINE chars
   // otherwise add chunks until max is at least n
 
   if (n == 0) {
     max += DELTALINE;
   } else {
     while (max < n) max += DELTALINE;
   }
 
   // str is resized to the new max
 
   str = (char *) memory->srealloc(str,max*sizeof(char),"input:str");
 }
 
 /* ----------------------------------------------------------------------
    process a single parsed command
    return 0 if successful, -1 if did not recognize command
 ------------------------------------------------------------------------- */
 
 int Input::execute_command()
 {
   // builtin stays 1 if command matches one of the names tested below
 
   int builtin = 1;
 
   // commands handled by this class that are not LAMMPS-specific
 
   if (!strcmp(command,"clear")) clear();
   else if (!strcmp(command,"echo")) echo();
   else if (!strcmp(command,"if")) ifthenelse();
   else if (!strcmp(command,"include")) include();
   else if (!strcmp(command,"jump")) jump();
   else if (!strcmp(command,"label")) label();
   else if (!strcmp(command,"log")) log();
   else if (!strcmp(command,"next")) next_command();
   else if (!strcmp(command,"partition")) partition();
   else if (!strcmp(command,"print")) print();
   else if (!strcmp(command,"quit")) quit();
   else if (!strcmp(command,"shell")) shell();
   else if (!strcmp(command,"variable")) variable_command();
 
   // LAMMPS-specific commands, one member function each
 
   else if (!strcmp(command,"angle_coeff")) angle_coeff();
   else if (!strcmp(command,"angle_style")) angle_style();
   else if (!strcmp(command,"atom_modify")) atom_modify();
   else if (!strcmp(command,"atom_style")) atom_style();
   else if (!strcmp(command,"bond_coeff")) bond_coeff();
   else if (!strcmp(command,"bond_style")) bond_style();
   else if (!strcmp(command,"boundary")) boundary();
   else if (!strcmp(command,"box")) box();
   else if (!strcmp(command,"comm_modify")) comm_modify();
   else if (!strcmp(command,"comm_style")) comm_style();
   else if (!strcmp(command,"compute")) compute();
   else if (!strcmp(command,"compute_modify")) compute_modify();
   else if (!strcmp(command,"dielectric")) dielectric();
   else if (!strcmp(command,"dihedral_coeff")) dihedral_coeff();
   else if (!strcmp(command,"dihedral_style")) dihedral_style();
   else if (!strcmp(command,"dimension")) dimension();
   else if (!strcmp(command,"dump")) dump();
   else if (!strcmp(command,"dump_modify")) dump_modify();
   else if (!strcmp(command,"fix")) fix();
   else if (!strcmp(command,"fix_modify")) fix_modify();
   else if (!strcmp(command,"group")) group_command();
   else if (!strcmp(command,"improper_coeff")) improper_coeff();
   else if (!strcmp(command,"improper_style")) improper_style();
   else if (!strcmp(command,"kspace_modify")) kspace_modify();
   else if (!strcmp(command,"kspace_style")) kspace_style();
   else if (!strcmp(command,"lattice")) lattice();
   else if (!strcmp(command,"mass")) mass();
   else if (!strcmp(command,"min_modify")) min_modify();
   else if (!strcmp(command,"min_style")) min_style();
   else if (!strcmp(command,"molecule")) molecule();
   else if (!strcmp(command,"neigh_modify")) neigh_modify();
   else if (!strcmp(command,"neighbor")) neighbor_command();
   else if (!strcmp(command,"newton")) newton();
   else if (!strcmp(command,"package")) package();
   else if (!strcmp(command,"pair_coeff")) pair_coeff();
   else if (!strcmp(command,"pair_modify")) pair_modify();
   else if (!strcmp(command,"pair_style")) pair_style();
   else if (!strcmp(command,"pair_write")) pair_write();
   else if (!strcmp(command,"processors")) processors();
   else if (!strcmp(command,"region")) region();
   else if (!strcmp(command,"reset_timestep")) reset_timestep();
   else if (!strcmp(command,"restart")) restart();
   else if (!strcmp(command,"run_style")) run_style();
   else if (!strcmp(command,"special_bonds")) special_bonds();
   else if (!strcmp(command,"suffix")) suffix();
   else if (!strcmp(command,"thermo")) thermo();
   else if (!strcmp(command,"thermo_modify")) thermo_modify();
   else if (!strcmp(command,"thermo_style")) thermo_style();
   else if (!strcmp(command,"timestep")) timestep();
   else if (!strcmp(command,"uncompute")) uncompute();
   else if (!strcmp(command,"undump")) undump();
   else if (!strcmp(command,"unfix")) unfix();
   else if (!strcmp(command,"units")) units();
 
   else builtin = 0;
 
   // done if command was listed above
 
   if (builtin) return 0;
 
   // otherwise look for a command added via style_command.h
   // no entry in the map means the command is not recognized
 
   std::map<std::string,CommandCreator>::iterator entry =
     command_map->find(command);
   if (entry == command_map->end()) return -1;
 
   CommandCreator creator = entry->second;
   creator(lmp,narg,arg);
   return 0;
 }
 
 /* ----------------------------------------------------------------------
    one instance per command in style_command.h
 ------------------------------------------------------------------------- */
 
 template <typename T>
 void Input::command_creator(LAMMPS *lmp, int narg, char **arg)
 {
   // build a temporary instance of the command class, run it once,
   //   and let it be destroyed when this function returns
 
   T instance(lmp);
   instance.command(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 /* ---------------------------------------------------------------------- */
 /* ---------------------------------------------------------------------- */
 
 /* ---------------------------------------------------------------------- */
 
 void Input::clear()
 {
   if (narg > 0) error->all(FLERR,"Illegal clear command");
   lmp->destroy();
   lmp->create();
   lmp->post_create();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::echo()
 {
   if (narg != 1) error->all(FLERR,"Illegal echo command");
 
   if (strcmp(arg[0],"none") == 0) {
     echo_screen = 0;
     echo_log = 0;
   } else if (strcmp(arg[0],"screen") == 0) {
     echo_screen = 1;
     echo_log = 0;
   } else if (strcmp(arg[0],"log") == 0) {
     echo_screen = 0;
     echo_log = 1;
   } else if (strcmp(arg[0],"both") == 0) {
     echo_screen = 1;
     echo_log = 1;
   } else error->all(FLERR,"Illegal echo command");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::ifthenelse()
 {
   // arg[0] = Boolean expression, arg[1] = "then",
   //   followed by commands and optional "elif"/"else" sections
   // each command is a single arg and is executed via one()
 
   if (narg < 3) error->all(FLERR,"Illegal if command");
 
   // substitute for variables in Boolean expression for "if"
   // in case expression was enclosed in quotes
   // must substitute on copy of arg else will step on subsequent args
 
   int n = strlen(arg[0]) + 1;
   if (n > maxline) reallocate(line,maxline,n);
   strcpy(line,arg[0]);
   substitute(line,work,maxline,maxwork,0);
 
   // evaluate Boolean expression for "if"
 
   double btest = variable->evaluate_boolean(line);
 
   // bound "then" commands
   // first,last = indices of 1st and last command of the section
 
   if (strcmp(arg[1],"then") != 0) error->all(FLERR,"Illegal if command");
 
   int first = 2;
   int iarg = first;
   while (iarg < narg &&
          (strcmp(arg[iarg],"elif") != 0 && strcmp(arg[iarg],"else") != 0))
     iarg++;
   int last = iarg-1;
 
   // execute "then" commands
   // make copies of all arg string commands
   // required because re-parsing a command via one() will wipe out args
 
   if (btest != 0.0) {
     int ncommands = last-first + 1;
     if (ncommands <= 0) error->all(FLERR,"Illegal if command");
 
     char **commands = new char*[ncommands];
     ncommands = 0;
     for (int i = first; i <= last; i++) {
       int n = strlen(arg[i]) + 1;
       // n = 1 means the command is an empty string
       if (n == 1) error->all(FLERR,"Illegal if command");
       commands[ncommands] = new char[n];
       strcpy(commands[ncommands],arg[i]);
       ncommands++;
     }
 
     // ifthenelse_flag is set while the commands run
 
     ifthenelse_flag = 1;
     for (int i = 0; i < ncommands; i++) one(commands[i]);
     ifthenelse_flag = 0;
 
     for (int i = 0; i < ncommands; i++) delete [] commands[i];
     delete [] commands;
 
     return;
   }
 
   // done if no "elif" or "else"
 
   if (iarg == narg) return;
 
   // check "elif" or "else" until find commands to execute
   // substitute for variables and evaluate Boolean expression for "elif"
   // must substitute on copy of arg else will step on subsequent args
   // bound and execute "elif" or "else" commands
 
   while (iarg != narg) {
     if (iarg+2 > narg) error->all(FLERR,"Illegal if command");
     if (strcmp(arg[iarg],"elif") == 0) {
       n = strlen(arg[iarg+1]) + 1;
       if (n > maxline) reallocate(line,maxline,n);
       strcpy(line,arg[iarg+1]);
       substitute(line,work,maxline,maxwork,0);
       btest = variable->evaluate_boolean(line);
       first = iarg+2;
     } else {
       // "else" has no expression, its commands always qualify
       btest = 1.0;
       first = iarg+1;
     }
 
     iarg = first;
     while (iarg < narg &&
            (strcmp(arg[iarg],"elif") != 0 && strcmp(arg[iarg],"else") != 0))
       iarg++;
     last = iarg-1;
 
     // test failed, move on to the next "elif" or "else" section
 
     if (btest == 0.0) continue;
 
     int ncommands = last-first + 1;
     if (ncommands <= 0) error->all(FLERR,"Illegal if command");
 
     char **commands = new char*[ncommands];
     ncommands = 0;
     for (int i = first; i <= last; i++) {
       int n = strlen(arg[i]) + 1;
       if (n == 1) error->all(FLERR,"Illegal if command");
       commands[ncommands] = new char[n];
       strcpy(commands[ncommands],arg[i]);
       ncommands++;
     }
 
     // execute the list of commands
 
     ifthenelse_flag = 1;
     for (int i = 0; i < ncommands; i++) one(commands[i]);
     ifthenelse_flag = 0;
 
     // clean up
 
     for (int i = 0; i < ncommands; i++) delete [] commands[i];
     delete [] commands;
 
     return;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::include()
 {
   // arg[0] = name of input script to switch to
   // the new file is pushed onto infiles and becomes the current infile
 
   if (narg != 1) error->all(FLERR,"Illegal include command");
 
   // do not allow include inside an if command
   // NOTE: this check will fail if a 2nd if command was inside the if command
   //       and came before the include
 
   if (ifthenelse_flag)
     error->all(FLERR,"Cannot use include command within an if command");
 
   // only proc 0 opens files and tracks the list of open files
 
   if (me == 0) {
     if (nfile == maxfile) {
       maxfile++;
       infiles = (FILE **)
         memory->srealloc(infiles,maxfile*sizeof(FILE *),"input:infiles");
     }
     infile = fopen(arg[0],"r");
     if (infile == NULL) {
       // file name comes from the input script and can be longer than
       //   the buffer, so truncate the message instead of overflowing str
       char str[128];
       snprintf(str,sizeof(str),"Cannot open input script %s",arg[0]);
       error->one(FLERR,str);
     }
     infiles[nfile++] = infile;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::jump()
 {
   // arg[0] = file to jump to, or SELF to rewind the current file
   // arg[1] = optional label to scan for before executing commands again
 
   if (narg < 1 || narg > 2) error->all(FLERR,"Illegal jump command");
 
   // jump_skip is set by next_command() and makes this one jump a no-op
 
   if (jump_skip) {
     jump_skip = 0;
     return;
   }
 
   // only proc 0 reads input files
   // a new file replaces the current entry in infiles, it is not pushed
 
   if (me == 0) {
     if (strcmp(arg[0],"SELF") == 0) rewind(infile);
     else {
       if (infile && infile != stdin) fclose(infile);
       infile = fopen(arg[0],"r");
       if (infile == NULL) {
         // file name comes from the input script and can be longer than
         //   the buffer, so truncate the message instead of overflowing str
         char str[128];
         snprintf(str,sizeof(str),"Cannot open input script %s",arg[0]);
         error->one(FLERR,str);
       }
       infiles[nfile-1] = infile;
     }
   }
 
   // store a copy of the label, arg[1] does not outlive the next parse()
 
   if (narg == 2) {
     label_active = 1;
     if (labelstr) delete [] labelstr;
     int n = strlen(arg[1]) + 1;
     labelstr = new char[n];
     strcpy(labelstr,arg[1]);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::label()
 {
   if (narg != 1) error->all(FLERR,"Illegal label command");
 
   // a label only matters while a jump is scanning for it
 
   if (!label_active) return;
 
   // stop scanning once the requested label is reached
 
   if (strcmp(labelstr,arg[0]) == 0) label_active = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::log()
 {
   if (narg > 2) error->all(FLERR,"Illegal log command");
 
   int appendflag = 0;
   if (narg == 2) {
     if (strcmp(arg[1],"append") == 0) appendflag = 1;
     else error->all(FLERR,"Illegal log command");
   }
 
   if (me == 0) {
     if (logfile) fclose(logfile);
     if (strcmp(arg[0],"none") == 0) logfile = NULL;
     else {
       if (appendflag) logfile = fopen(arg[0],"a");
       else logfile = fopen(arg[0],"w");
 
       if (logfile == NULL) {
         char str[128];
         sprintf(str,"Cannot open logfile %s",arg[0]);
         error->one(FLERR,str);
       }
     }
     if (universe->nworlds == 1) universe->ulogfile = logfile;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::next_command()
 {
   // pass the args to variable->next()
   // a non-zero return sets jump_skip, which makes the next jump
   //   command return without jumping
 
   if (variable->next(narg,arg)) jump_skip = 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::partition()
 {
   // arg[0] = yes or no, arg[1] = range of partitions,
   //   rest of line = command to run on the selected partitions
 
   if (narg < 3) error->all(FLERR,"Illegal partition command");
 
   int yesflag;
   if (!strcmp(arg[0],"yes")) yesflag = 1;
   else if (!strcmp(arg[0],"no")) yesflag = 0;
   else error->all(FLERR,"Illegal partition command");
 
   int ilo,ihi;
   force->bounds(arg[1],universe->nworlds,ilo,ihi);
 
   // work on a fresh copy of the original line, since strtok() modifies it
   // skip the first 3 words, then any whitespace, to reach the 4th word
 
   const char *separators = " \t\n\r\f";
 
   strcpy(copy,line);
   char *ptr = strtok(copy,separators);
   ptr = strtok(NULL,separators);
   ptr = strtok(NULL,separators);
   ptr += strlen(ptr) + 1;
   ptr += strspn(ptr,separators);
 
   // partitions are numbered from 1 in the range, iworld counts from 0
   // "yes" runs the command inside the range, "no" runs it outside
 
   int inrange = (universe->iworld+1 >= ilo && universe->iworld+1 <= ihi);
   if (yesflag ? inrange : !inrange) one(ptr);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::print()
 {
   // arg[0] = string to print
   // optional keywords: file name, append name, screen yes/no
 
   if (narg < 1) error->all(FLERR,"Illegal print command");
 
   // copy 1st arg back into line (copy is being used)
   // check maxline since arg[0] could have been expanded by variables
   // substitute for $ variables (no printing) and print arg
 
   int n = strlen(arg[0]) + 1;
   if (n > maxline) reallocate(line,maxline,n);
   strcpy(line,arg[0]);
   substitute(line,work,maxline,maxwork,0);
 
   // parse optional args
   // only proc 0 opens the output file, fp stays NULL on other procs
 
   FILE *fp = NULL;
   int screenflag = 1;
 
   int iarg = 1;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"file") == 0 || strcmp(arg[iarg],"append") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal print command");
       if (me == 0) {
         // a repeated file/append keyword replaces the earlier file,
         //   so close that one instead of leaking its handle
         if (fp != NULL) fclose(fp);
         if (strcmp(arg[iarg],"file") == 0) fp = fopen(arg[iarg+1],"w");
         else fp = fopen(arg[iarg+1],"a");
         if (fp == NULL) {
           // file name comes from the input script and can be longer than
           //   the buffer, so truncate the message instead of overflowing
           char str[128];
           snprintf(str,sizeof(str),"Cannot open print file %s",arg[iarg+1]);
           error->one(FLERR,str);
         }
       }
       iarg += 2;
     } else if (strcmp(arg[iarg],"screen") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal print command");
       if (strcmp(arg[iarg+1],"yes") == 0) screenflag = 1;
       else if (strcmp(arg[iarg+1],"no") == 0) screenflag = 0;
       else error->all(FLERR,"Illegal print command");
       iarg += 2;
     } else error->all(FLERR,"Illegal print command");
   }
 
   // screenflag controls output to both screen and logfile
   // output to the file, if one was opened, is unconditional
 
   if (me == 0) {
     if (screenflag && screen) fprintf(screen,"%s\n",line);
     if (screenflag && logfile) fprintf(logfile,"%s\n",line);
     if (fp) {
       fprintf(fp,"%s\n",line);
       fclose(fp);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::quit()
 {
   if (narg) error->all(FLERR,"Illegal quit command");
   error->done();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::shell()
 {
   // arg[0] = cd, mkdir, mv, rm, rmdir, putenv, or any other string
   // the named sub-commands are mapped to library calls
   // anything else is passed as a command line to system()
   // return values of the library calls are not checked
 
   if (narg < 1) error->all(FLERR,"Illegal shell command");
 
   // cd is invoked on every proc, there is no me == 0 guard
 
   if (strcmp(arg[0],"cd") == 0) {
     if (narg != 2) error->all(FLERR,"Illegal shell cd command");
     chdir(arg[1]);
 
   // mkdir, mv, rm, rmdir act on the file system from proc 0 only
 
   } else if (strcmp(arg[0],"mkdir") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal shell mkdir command");
     if (me == 0)
       for (int i = 1; i < narg; i++) {
 #if defined(_WIN32)
         _mkdir(arg[i]);
 #else
         mkdir(arg[i], S_IRWXU | S_IRGRP | S_IXGRP);
 #endif
       }
 
   } else if (strcmp(arg[0],"mv") == 0) {
     if (narg != 3) error->all(FLERR,"Illegal shell mv command");
     if (me == 0) rename(arg[1],arg[2]);
 
   } else if (strcmp(arg[0],"rm") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal shell rm command");
     if (me == 0)
       for (int i = 1; i < narg; i++) unlink(arg[i]);
 
   } else if (strcmp(arg[0],"rmdir") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal shell rmdir command");
     if (me == 0)
       for (int i = 1; i < narg; i++) rmdir(arg[i]);
 
   // putenv is invoked on every proc
   // each arg is duplicated before being handed to putenv,
   //   the duplicate is not freed here
 
   } else if (strcmp(arg[0],"putenv") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal shell putenv command");
     for (int i = 1; i < narg; i++) {
       char *ptr = strdup(arg[i]);
 #ifdef _WIN32 
       if (ptr != NULL) _putenv(ptr);
 #else
       if (ptr != NULL) putenv(ptr);
 #endif
     }
 
   // use work string to concat args back into one string separated by spaces
   // invoke string in shell via system()
   // n counts each arg plus one char for its separator or the terminator
 
   } else {
     int n = 0;
     for (int i = 0; i < narg; i++) n += strlen(arg[i]) + 1;
     if (n > maxwork) reallocate(work,maxwork,n);
 
     strcpy(work,arg[0]);
     for (int i = 1; i < narg; i++) {
       strcat(work," ");
       strcat(work,arg[i]);
     }
 
     if (me == 0) system(work);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::variable_command()
 {
   // all args are passed unchanged to the Variable class
 
   variable->set(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 /* ---------------------------------------------------------------------- */
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    one function for each LAMMPS-specific input script command
 ------------------------------------------------------------------------- */
 
 /* ---------------------------------------------------------------------- */
 
 void Input::angle_coeff()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Angle_coeff command before simulation box is defined");
   if (force->angle == NULL)
     error->all(FLERR,"Angle_coeff command before angle_style is defined");
   if (atom->avec->angles_allow == 0)
     error->all(FLERR,"Angle_coeff command when no angles allowed");
   force->angle->coeff(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::angle_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal angle_style command");
   if (atom->avec->angles_allow == 0)
     error->all(FLERR,"Angle_style command when no angles allowed");
-  force->create_angle(arg[0],lmp->suffix);
+  force->create_angle(arg[0],1);
   if (force->angle) force->angle->settings(narg-1,&arg[1]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::atom_modify()
 {
   atom->modify_params(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::atom_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal atom_style command");
   if (domain->box_exist)
     error->all(FLERR,"Atom_style command after simulation box is defined");
-  atom->create_avec(arg[0],narg-1,&arg[1],lmp->suffix);
+  atom->create_avec(arg[0],narg-1,&arg[1],1);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::bond_coeff()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Bond_coeff command before simulation box is defined");
   if (force->bond == NULL)
     error->all(FLERR,"Bond_coeff command before bond_style is defined");
   if (atom->avec->bonds_allow == 0)
     error->all(FLERR,"Bond_coeff command when no bonds allowed");
   force->bond->coeff(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::bond_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal bond_style command");
   if (atom->avec->bonds_allow == 0)
     error->all(FLERR,"Bond_style command when no bonds allowed");
-  force->create_bond(arg[0],lmp->suffix);
+  force->create_bond(arg[0],1);
   if (force->bond) force->bond->settings(narg-1,&arg[1]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::boundary()
 {
   if (domain->box_exist)
     error->all(FLERR,"Boundary command after simulation box is defined");
   domain->set_boundary(narg,arg,0);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::box()
 {
   if (domain->box_exist)
     error->all(FLERR,"Box command after simulation box is defined");
   domain->set_box(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::comm_modify()
 {
   comm->modify_params(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::comm_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal comm_style command");
   if (strcmp(arg[0],"brick") == 0) {
     if (comm->style == 0) return;
     Comm *oldcomm = comm;
     comm = new CommBrick(lmp,oldcomm);
     delete oldcomm;
   } else if (strcmp(arg[0],"tiled") == 0) {
     if (comm->style == 1) return;
     Comm *oldcomm = comm;
     comm = new CommTiled(lmp,oldcomm);
     delete oldcomm;
   } else error->all(FLERR,"Illegal comm_style command");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::compute()
 {
-  modify->add_compute(narg,arg,lmp->suffix);
+  modify->add_compute(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::compute_modify()
 {
   modify->modify_compute(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::dielectric()
 {
   if (narg != 1) error->all(FLERR,"Illegal dielectric command");
   force->dielectric = force->numeric(FLERR,arg[0]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::dihedral_coeff()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Dihedral_coeff command before simulation box is defined");
   if (force->dihedral == NULL)
     error->all(FLERR,"Dihedral_coeff command before dihedral_style is defined");
   if (atom->avec->dihedrals_allow == 0)
     error->all(FLERR,"Dihedral_coeff command when no dihedrals allowed");
   force->dihedral->coeff(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::dihedral_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal dihedral_style command");
   if (atom->avec->dihedrals_allow == 0)
     error->all(FLERR,"Dihedral_style command when no dihedrals allowed");
-  force->create_dihedral(arg[0],lmp->suffix);
+  force->create_dihedral(arg[0],1);
   if (force->dihedral) force->dihedral->settings(narg-1,&arg[1]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::dimension()
 {
   if (narg != 1) error->all(FLERR,"Illegal dimension command");
   if (domain->box_exist)
     error->all(FLERR,"Dimension command after simulation box is defined");
   domain->dimension = force->inumeric(FLERR,arg[0]);
   if (domain->dimension != 2 && domain->dimension != 3)
     error->all(FLERR,"Illegal dimension command");
 
   // must reset default extra_dof of all computes
   // since some were created before dimension command is encountered
 
   for (int i = 0; i < modify->ncompute; i++)
     modify->compute[i]->reset_extra_dof();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::dump()
 {
   output->add_dump(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::dump_modify()
 {
   output->modify_dump(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::fix()
 {
-  modify->add_fix(narg,arg,lmp->suffix);
+  modify->add_fix(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::fix_modify()
 {
   modify->modify_fix(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::group_command()
 {
   group->assign(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::improper_coeff()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Improper_coeff command before simulation box is defined");
   if (force->improper == NULL)
     error->all(FLERR,"Improper_coeff command before improper_style is defined");
   if (atom->avec->impropers_allow == 0)
     error->all(FLERR,"Improper_coeff command when no impropers allowed");
   force->improper->coeff(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::improper_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal improper_style command");
   if (atom->avec->impropers_allow == 0)
     error->all(FLERR,"Improper_style command when no impropers allowed");
-  force->create_improper(arg[0],lmp->suffix);
+  force->create_improper(arg[0],1);
   if (force->improper) force->improper->settings(narg-1,&arg[1]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::kspace_modify()
 {
   if (force->kspace == NULL)
     error->all(FLERR,"KSpace style has not yet been set");
   force->kspace->modify_params(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::kspace_style()
 {
-  force->create_kspace(narg,arg,lmp->suffix);
+  force->create_kspace(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::lattice()
 {
   domain->set_lattice(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::mass()
 {
   if (narg != 2) error->all(FLERR,"Illegal mass command");
   if (domain->box_exist == 0)
     error->all(FLERR,"Mass command before simulation box is defined");
   atom->set_mass(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::min_modify()
 {
   update->minimize->modify_params(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::min_style()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Min_style command before simulation box is defined");
   update->create_minimize(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::molecule()
 {
   atom->add_molecule(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::neigh_modify()
 {
   neighbor->modify_params(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::neighbor_command()
 {
   neighbor->set(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::newton()
 {
   int newton_pair=1,newton_bond=1;
 
   if (narg == 1) {
     if (strcmp(arg[0],"off") == 0) newton_pair = newton_bond = 0;
     else if (strcmp(arg[0],"on") == 0) newton_pair = newton_bond = 1;
     else error->all(FLERR,"Illegal newton command");
   } else if (narg == 2) {
     if (strcmp(arg[0],"off") == 0) newton_pair = 0;
     else if (strcmp(arg[0],"on") == 0) newton_pair= 1;
     else error->all(FLERR,"Illegal newton command");
     if (strcmp(arg[1],"off") == 0) newton_bond = 0;
     else if (strcmp(arg[1],"on") == 0) newton_bond = 1;
     else error->all(FLERR,"Illegal newton command");
   } else error->all(FLERR,"Illegal newton command");
 
   force->newton_pair = newton_pair;
 
   if (domain->box_exist && (newton_bond != force->newton_bond))
     error->all(FLERR,"Newton bond change after simulation box is defined");
   force->newton_bond = newton_bond;
 
   if (newton_pair || newton_bond) force->newton = 1;
   else force->newton = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::package()
 {
   if (domain->box_exist)
     error->all(FLERR,"Package command after simulation box is defined");
   if (narg < 1) error->all(FLERR,"Illegal package command");
 
   if (strcmp(arg[0],"cuda") == 0) {
     if (!lmp->cuda)
       error->all(FLERR,"Package cuda command without USER-CUDA installed");
     lmp->cuda->accelerator(narg-1,&arg[1]);
 
   } else if (strcmp(arg[0],"gpu") == 0) {
     char **fixarg = new char*[2+narg];
     fixarg[0] = (char *) "package_gpu";
     fixarg[1] = (char *) "all";
     fixarg[2] = (char *) "GPU";
     for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i];
-    modify->add_fix(2+narg,fixarg,NULL);
+    modify->add_fix(2+narg,fixarg);
     delete [] fixarg;
     force->newton_pair = 0;
 
   } else if (strcmp(arg[0],"kokkos") == 0) {
     if (!lmp->kokkos)
       error->all(FLERR,"Package kokkos command without KOKKOS installed");
     lmp->kokkos->accelerator(narg-1,&arg[1]);
 
   } else if (strcmp(arg[0],"omp") == 0) {
     char **fixarg = new char*[2+narg];
     fixarg[0] = (char *) "package_omp";
     fixarg[1] = (char *) "all";
     fixarg[2] = (char *) "OMP";
     for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i];
-    modify->add_fix(2+narg,fixarg,NULL);
+    modify->add_fix(2+narg,fixarg);
     delete [] fixarg;
 
+  } else if (strcmp(arg[0],"intel") == 0) {
+
+    // DISABLED: add omp package for non-pair routines
+
+    /*
+    char **fixarg = new char*[2+narg];
+    fixarg[0] = (char *) "package_omp";
+    fixarg[1] = (char *) "all";
+    fixarg[2] = (char *) "OMP";
+    int omp_narg = 3;
+    if (narg > 1) {
+      fixarg[3] = arg[1];
+      omp_narg++;
+      if (narg > 2)
+        for (int i = 2; i < narg; i++)
+          if (strcmp(arg[i],"mixed") == 0) {
+            fixarg[4] = arg[i];
+            omp_narg++;
+          }
+    }
+    modify->add_fix(omp_narg,fixarg);
+    */
+
+    // add intel package for neighbor and pair routines
+
+    char **fixarg = new char*[2+narg];
+    fixarg[0] = (char *) "package_intel";
+    fixarg[1] = (char *) "all";
+    fixarg[2] = (char *) "Intel";
+    for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i];
+    modify->add_fix(2+narg,fixarg);
+    delete [] fixarg;
+
+    /*
+    // DISABLED: if running with offload, set run_style to verlet/intel
+
+    #ifdef LMP_INTEL_OFFLOAD
+    #ifdef __INTEL_OFFLOAD
+    char *str;
+    str = (char *) "verlet/intel";
+    update->create_integrate(1,&str,0);
+    #endif
+    #endif
+    */
+
   } else error->all(FLERR,"Illegal package command");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::pair_coeff()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Pair_coeff command before simulation box is defined");
   if (force->pair == NULL)
     error->all(FLERR,"Pair_coeff command before pair_style is defined");
   force->pair->coeff(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::pair_modify()
 {
   if (force->pair == NULL)
     error->all(FLERR,"Pair_modify command before pair_style is defined");
   force->pair->modify_params(narg,arg);
 }
 
 /* ----------------------------------------------------------------------
    if old pair style exists and new style is same, just change settings
    else create new pair class
 ------------------------------------------------------------------------- */
 
 void Input::pair_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal pair_style command");
-  if (force->pair && strcmp(arg[0],force->pair_style) == 0) {
-    force->pair->settings(narg-1,&arg[1]);
-    return;
+  if (force->pair) {
+    int match = 0;  // set if arg[0], plain or suffixed, is the current style
+    if (strcmp(arg[0],force->pair_style) == 0) match = 1;
+    if (!match && lmp->suffix_enable) {
+      char estyle[256];  // holds "style/suffix"; snprintf keeps it bounded
+      if (lmp->suffix) {
+        snprintf(estyle,sizeof(estyle),"%s/%s",arg[0],lmp->suffix);
+        if (strcmp(estyle,force->pair_style) == 0) match = 1;
+      }
+      if (lmp->suffix2) {
+        snprintf(estyle,sizeof(estyle),"%s/%s",arg[0],lmp->suffix2);
+        if (strcmp(estyle,force->pair_style) == 0) match = 1;
+      }
+    }
+    if (match) {
+      force->pair->settings(narg-1,&arg[1]);
+      return;
+    }
   }
-  force->create_pair(arg[0],lmp->suffix);
+
+  force->create_pair(arg[0],1);
   if (force->pair) force->pair->settings(narg-1,&arg[1]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::pair_write()
 {
   if (force->pair == NULL)
     error->all(FLERR,"Pair_write command before pair_style is defined");
   force->pair->write_file(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::processors()
 {
   if (domain->box_exist)
     error->all(FLERR,"Processors command after simulation box is defined");
   comm->set_processors(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::region()
 {
   domain->add_region(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::reset_timestep()
 {
   update->reset_timestep(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::restart()
 {
   output->create_restart(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::run_style()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Run_style command before simulation box is defined");
-  update->create_integrate(narg,arg,lmp->suffix);
+  update->create_integrate(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::special_bonds()
 {
   // store 1-3,1-4 and dihedral/extra flag values before change
   // change in 1-2 coeffs will not change the special list
 
   double lj2 = force->special_lj[2];
   double lj3 = force->special_lj[3];
   double coul2 = force->special_coul[2];
   double coul3 = force->special_coul[3];
   int angle = force->special_angle;
   int dihedral = force->special_dihedral;
   int extra = force->special_extra;
 
   force->set_special(narg,arg);
 
   // if simulation box defined and saved values changed, redo special list
 
   if (domain->box_exist && atom->molecular == 1) {
     if (lj2 != force->special_lj[2] || lj3 != force->special_lj[3] ||
         coul2 != force->special_coul[2] || coul3 != force->special_coul[3] ||
         angle != force->special_angle ||
         dihedral != force->special_dihedral ||
         extra != force->special_extra) {
       Special special(lmp);
       special.build();
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::suffix()
 {
   if (narg != 1) error->all(FLERR,"Illegal suffix command");
 
   if (strcmp(arg[0],"off") == 0) lmp->suffix_enable = 0;
   else if (strcmp(arg[0],"on") == 0) lmp->suffix_enable = 1;
   else {
     delete [] lmp->suffix;
     int n = strlen(arg[0]) + 1;
     lmp->suffix = new char[n];
     strcpy(lmp->suffix,arg[0]);
+    // 2nd suffix = "omp" only when suffix = "intel"; drop any stale value
+    delete [] lmp->suffix2;
+    lmp->suffix2 = NULL;
+    if (strcmp(lmp->suffix,"intel") == 0) {
+      lmp->suffix2 = strcpy(new char[4],"omp");
+    }
     lmp->suffix_enable = 1;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::thermo()
 {
   output->set_thermo(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::thermo_modify()
 {
   output->thermo->modify_params(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::thermo_style()
 {
   output->create_thermo(narg,arg);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::timestep()
 {
   if (narg != 1) error->all(FLERR,"Illegal timestep command");
   update->dt = force->numeric(FLERR,arg[0]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::uncompute()
 {
   if (narg != 1) error->all(FLERR,"Illegal uncompute command");
   modify->delete_compute(arg[0]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::undump()
 {
   if (narg != 1) error->all(FLERR,"Illegal undump command");
   output->delete_dump(arg[0]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::unfix()
 {
   if (narg != 1) error->all(FLERR,"Illegal unfix command");
   modify->delete_fix(arg[0]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Input::units()
 {
   if (narg != 1) error->all(FLERR,"Illegal units command");
   if (domain->box_exist)
     error->all(FLERR,"Units command after simulation box is defined");
   update->set_units(arg[0]);
 }
diff --git a/src/lammps.cpp b/src/lammps.cpp
index d1e84cf9b..69945a805 100644
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@@ -1,821 +1,835 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "string.h"
 #include "ctype.h"
 #include "lammps.h"
 #include "style_angle.h"
 #include "style_atom.h"
 #include "style_bond.h"
 #include "style_command.h"
 #include "style_compute.h"
 #include "style_dihedral.h"
 #include "style_dump.h"
 #include "style_fix.h"
 #include "style_improper.h"
 #include "style_integrate.h"
 #include "style_kspace.h"
 #include "style_minimize.h"
 #include "style_pair.h"
 #include "style_region.h"
 #include "universe.h"
 #include "input.h"
 #include "atom.h"
 #include "update.h"
 #include "neighbor.h"
 #include "comm.h"
 #include "comm_brick.h"
 #include "domain.h"
 #include "force.h"
 #include "modify.h"
 #include "group.h"
 #include "output.h"
 #include "citeme.h"
 #include "accelerator_cuda.h"
 #include "accelerator_kokkos.h"
 #include "accelerator_omp.h"
+#include "accelerator_intel.h"
 #include "timer.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ----------------------------------------------------------------------
    start up LAMMPS
    allocate fundamental classes (memory, error, universe, input)
    parse input switches
    initialize communicators, screen & logfile output
    input is allocated at end after MPI info is setup
 ------------------------------------------------------------------------- */
 
 LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
 {
   memory = new Memory(this);
   error = new Error(this);
   universe = new Universe(this,communicator);
   output = NULL;
 
   screen = NULL;
   logfile = NULL;
   infile = NULL;
 
   // parse input switches
 
   int inflag = 0;
   int screenflag = 0;
   int logflag = 0;
   int partscreenflag = 0;
   int partlogflag = 0;
   int cudaflag = 0;
   int kokkosflag = 0;
   int restartflag = 0;
   int restartremapflag = 0;
   int citeflag = 1;
   int helpflag = 0;
 
-  suffix = NULL;
+  suffix = suffix2 = NULL;
   suffix_enable = 0;
   char *rfile = NULL;
   char *dfile = NULL;
   int wdfirst,wdlast;
   int kkfirst,kklast;
 
   int iarg = 1;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"-partition") == 0 ||
         strcmp(arg[iarg],"-p") == 0) {
       universe->existflag = 1;
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       iarg++;
       while (iarg < narg && arg[iarg][0] != '-') {
         universe->add_world(arg[iarg]);
         iarg++;
       }
     } else if (strcmp(arg[iarg],"-in") == 0 ||
                strcmp(arg[iarg],"-i") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       inflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-screen") == 0 ||
                strcmp(arg[iarg],"-sc") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       screenflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-log") == 0 ||
                strcmp(arg[iarg],"-l") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       logflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-var") == 0 ||
                strcmp(arg[iarg],"-v") == 0) {
       if (iarg+3 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       iarg += 3;
       while (iarg < narg && arg[iarg][0] != '-') iarg++;
     } else if (strcmp(arg[iarg],"-echo") == 0 ||
                strcmp(arg[iarg],"-e") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       iarg += 2;
     } else if (strcmp(arg[iarg],"-pscreen") == 0 ||
                strcmp(arg[iarg],"-ps") == 0) {
       if (iarg+2 > narg)
        error->universe_all(FLERR,"Invalid command-line argument");
       partscreenflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-plog") == 0 ||
                strcmp(arg[iarg],"-pl") == 0) {
       if (iarg+2 > narg)
        error->universe_all(FLERR,"Invalid command-line argument");
       partlogflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-cuda") == 0 ||
                strcmp(arg[iarg],"-c") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       if (strcmp(arg[iarg+1],"on") == 0) cudaflag = 1;
       else if (strcmp(arg[iarg+1],"off") == 0) cudaflag = 0;
       else error->universe_all(FLERR,"Invalid command-line argument");
       iarg += 2;
     } else if (strcmp(arg[iarg],"-kokkos") == 0 ||
                strcmp(arg[iarg],"-k") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       if (strcmp(arg[iarg+1],"on") == 0) kokkosflag = 1;
       else if (strcmp(arg[iarg+1],"off") == 0) kokkosflag = 0;
       else error->universe_all(FLERR,"Invalid command-line argument");
       iarg += 2;
       // delimit any extra args for the Kokkos instantiation
       kkfirst = iarg;
       while (iarg < narg && arg[iarg][0] != '-') iarg++;
       kklast = iarg;
     } else if (strcmp(arg[iarg],"-suffix") == 0 ||
                strcmp(arg[iarg],"-sf") == 0) {
       if (iarg+2 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       delete [] suffix;
       int n = strlen(arg[iarg+1]) + 1;
       suffix = new char[n];
       strcpy(suffix,arg[iarg+1]);
+      // set 2nd suffix = "omp" when suffix = "intel"
+      if (strcmp(suffix,"intel") == 0) {
+        suffix2 = new char[4];
+        strcpy(suffix2,"omp");
+      }
       suffix_enable = 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-reorder") == 0 ||
                strcmp(arg[iarg],"-ro") == 0) {
       if (iarg+3 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       if (universe->existflag)
         error->universe_all(FLERR,"Cannot use -reorder after -partition");
       universe->reorder(arg[iarg+1],arg[iarg+2]);
       iarg += 3;
     } else if (strcmp(arg[iarg],"-restart") == 0 ||
                strcmp(arg[iarg],"-r") == 0) {
       if (iarg+3 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       restartflag = 1;
       rfile = arg[iarg+1];
       dfile = arg[iarg+2];
       // check for restart remap flag
       if (strcmp(dfile,"remap") == 0) {
         if (iarg+4 > narg)
           error->universe_all(FLERR,"Invalid command-line argument");
         restartremapflag = 1;
         dfile = arg[iarg+3];
         iarg++;
       }
       iarg += 3;
       // delimit any extra args for the write_data command
       wdfirst = iarg;
       while (iarg < narg && arg[iarg][0] != '-') iarg++;
       wdlast = iarg;
     } else if (strcmp(arg[iarg],"-nocite") == 0 ||
                strcmp(arg[iarg],"-nc") == 0) {
       citeflag = 0;
       iarg++;
     } else if (strcmp(arg[iarg],"-help") == 0 ||
                strcmp(arg[iarg],"-h") == 0) {
       if (iarg+1 > narg)
         error->universe_all(FLERR,"Invalid command-line argument");
       helpflag = 1;
       citeflag = 0;
       iarg += 1;
     } else error->universe_all(FLERR,"Invalid command-line argument");
   }
 
   // if no partition command-line switch, universe is one world with all procs
 
   if (universe->existflag == 0) universe->add_world(NULL);
 
   // sum of procs in all worlds must equal total # of procs
 
   if (!universe->consistent())
     error->universe_all(FLERR,"Processor partitions are inconsistent");
 
   // universe cannot use stdin for input file
 
   if (universe->existflag && inflag == 0)
     error->universe_all(FLERR,"Must use -in switch with multiple partitions");
 
   // if no partition command-line switch, cannot use -pscreen option
 
   if (universe->existflag == 0 && partscreenflag)
     error->universe_all(FLERR,"Can only use -pscreen with multiple partitions");
 
   // if no partition command-line switch, cannot use -plog option
 
   if (universe->existflag == 0 && partlogflag)
     error->universe_all(FLERR,"Can only use -plog with multiple partitions");
 
   // set universe screen and logfile
 
   if (universe->me == 0) {
     if (screenflag == 0)
       universe->uscreen = stdout;
     else if (strcmp(arg[screenflag],"none") == 0)
       universe->uscreen = NULL;
     else {
       universe->uscreen = fopen(arg[screenflag],"w");
       if (universe->uscreen == NULL)
         error->universe_one(FLERR,"Cannot open universe screen file");
     }
     if (logflag == 0) {
       if (helpflag == 0) {
         universe->ulogfile = fopen("log.lammps","w");
         if (universe->ulogfile == NULL)
           error->universe_warn(FLERR,"Cannot open log.lammps for writing");
       }
     } else if (strcmp(arg[logflag],"none") == 0)
       universe->ulogfile = NULL;
     else {
       universe->ulogfile = fopen(arg[logflag],"w");
       if (universe->ulogfile == NULL)
         error->universe_one(FLERR,"Cannot open universe log file");
     }
   }
 
   if (universe->me > 0) {
     if (screenflag == 0) universe->uscreen = stdout;
     else universe->uscreen = NULL;
     universe->ulogfile = NULL;
   }
 
   // make universe and single world the same, since no partition switch
   // world inherits settings from universe
   // set world screen, logfile, communicator, infile
   // open input script if from file
 
   if (universe->existflag == 0) {
     screen = universe->uscreen;
     logfile = universe->ulogfile;
     world = universe->uworld;
 
     if (universe->me == 0) {
       if (inflag == 0) infile = stdin;
       else infile = fopen(arg[inflag],"r");
       if (infile == NULL) {
         char str[128];
         sprintf(str,"Cannot open input script %s",arg[inflag]);
         error->one(FLERR,str);
       }
     }
 
     if (universe->me == 0) {
       if (screen) fprintf(screen,"LAMMPS (%s)\n",universe->version);
       if (logfile) fprintf(logfile,"LAMMPS (%s)\n",universe->version);
     }
 
   // universe is one or more worlds, as setup by partition switch
   // split universe communicator into separate world communicators
   // set world screen, logfile, communicator, infile
   // open input script
 
   } else {
     int me;
     MPI_Comm_split(universe->uworld,universe->iworld,0,&world);
     MPI_Comm_rank(world,&me);
 
     if (me == 0)
       if (partscreenflag == 0)
        if (screenflag == 0) {
          char str[32];
          sprintf(str,"screen.%d",universe->iworld);
          screen = fopen(str,"w");
          if (screen == NULL) error->one(FLERR,"Cannot open screen file");
        } else if (strcmp(arg[screenflag],"none") == 0)
          screen = NULL;
        else {
          char str[128];
          sprintf(str,"%s.%d",arg[screenflag],universe->iworld);
          screen = fopen(str,"w");
          if (screen == NULL) error->one(FLERR,"Cannot open screen file");
        }
       else if (strcmp(arg[partscreenflag],"none") == 0)
         screen = NULL;
       else {
         char str[128];
         sprintf(str,"%s.%d",arg[partscreenflag],universe->iworld);
         screen = fopen(str,"w");
         if (screen == NULL) error->one(FLERR,"Cannot open screen file");
       } else screen = NULL;
 
     if (me == 0)
       if (partlogflag == 0)
        if (logflag == 0) {
          char str[32];
          sprintf(str,"log.lammps.%d",universe->iworld);
          logfile = fopen(str,"w");
          if (logfile == NULL) error->one(FLERR,"Cannot open logfile");
        } else if (strcmp(arg[logflag],"none") == 0)
          logfile = NULL;
        else {
          char str[128];
          sprintf(str,"%s.%d",arg[logflag],universe->iworld);
          logfile = fopen(str,"w");
          if (logfile == NULL) error->one(FLERR,"Cannot open logfile");
        }
       else if (strcmp(arg[partlogflag],"none") == 0)
         logfile = NULL;
       else {
         char str[128];
         sprintf(str,"%s.%d",arg[partlogflag],universe->iworld);
         logfile = fopen(str,"w");
         if (logfile == NULL) error->one(FLERR,"Cannot open logfile");
       } else logfile = NULL;
 
     if (me == 0) {
       infile = fopen(arg[inflag],"r");
       if (infile == NULL) {
         char str[128];
         sprintf(str,"Cannot open input script %s",arg[inflag]);
         error->one(FLERR,str);
       }
     } else infile = NULL;
 
     // screen and logfile messages for universe and world
 
     if (universe->me == 0) {
       if (universe->uscreen) {
         fprintf(universe->uscreen,"LAMMPS (%s)\n",universe->version);
         fprintf(universe->uscreen,"Running on %d partitions of processors\n",
                 universe->nworlds);
       }
       if (universe->ulogfile) {
         fprintf(universe->ulogfile,"LAMMPS (%s)\n",universe->version);
         fprintf(universe->ulogfile,"Running on %d partitions of processors\n",
                 universe->nworlds);
       }
     }
 
     if (me == 0) {
       if (screen) {
         fprintf(screen,"LAMMPS (%s)\n",universe->version);
         fprintf(screen,"Processor partition = %d\n",universe->iworld);
       }
       if (logfile) {
         fprintf(logfile,"LAMMPS (%s)\n",universe->version);
         fprintf(logfile,"Processor partition = %d\n",universe->iworld);
       }
     }
   }
 
   // check consistency of datatype settings in lmptype.h
 
   if (sizeof(smallint) != sizeof(int))
     error->all(FLERR,"Smallint setting in lmptype.h is invalid");
   if (sizeof(imageint) < sizeof(smallint))
     error->all(FLERR,"Imageint setting in lmptype.h is invalid");
   if (sizeof(tagint) < sizeof(smallint))
     error->all(FLERR,"Tagint setting in lmptype.h is invalid");
   if (sizeof(bigint) < sizeof(imageint) || sizeof(bigint) < sizeof(tagint))
     error->all(FLERR,"Bigint setting in lmptype.h is invalid");
 
   int mpisize;
   MPI_Type_size(MPI_LMP_TAGINT,&mpisize);
   if (mpisize != sizeof(tagint))
       error->all(FLERR,"MPI_LMP_TAGINT and tagint in "
                  "lmptype.h are not compatible");
   MPI_Type_size(MPI_LMP_BIGINT,&mpisize);
   if (mpisize != sizeof(bigint))
       error->all(FLERR,"MPI_LMP_BIGINT and bigint in "
                  "lmptype.h are not compatible");
 
 #ifdef LAMMPS_SMALLBIG
   if (sizeof(smallint) != 4 || sizeof(imageint) != 4 || 
       sizeof(tagint) != 4 || sizeof(bigint) != 8)
     error->all(FLERR,"Small to big integers are not sized correctly");
 #endif
 #ifdef LAMMPS_BIGBIG
   if (sizeof(smallint) != 4 || sizeof(imageint) != 8 || 
       sizeof(tagint) != 8 || sizeof(bigint) != 8)
     error->all(FLERR,"Small to big integers are not sized correctly");
 #endif
 #ifdef LAMMPS_SMALLSMALL
   if (sizeof(smallint) != 4 || sizeof(imageint) != 4 || 
       sizeof(tagint) != 4 || sizeof(bigint) != 4)
     error->all(FLERR,"Small to big integers are not sized correctly");
 #endif
 
   // error check on accelerator packages
 
   if (cudaflag == 1 && kokkosflag == 1) 
     error->all(FLERR,"Cannot use -cuda on and -kokkos on together");
 
   // create Cuda class if USER-CUDA installed, unless explicitly switched off
   // instantiation creates dummy Cuda class if USER-CUDA is not installed
 
   cuda = NULL;
   if (cudaflag == 1) {
     cuda = new Cuda(this);
     if (!cuda->cuda_exists)
       error->all(FLERR,"Cannot use -cuda on without USER-CUDA installed");
   }
 
   int me;
   MPI_Comm_rank(world,&me);
   if (cuda && me == 0) error->message(FLERR,"USER-CUDA mode is enabled");
 
   // create Kokkos class if KOKKOS installed, unless explicitly switched off
   // instantiation creates dummy Kokkos class if KOKKOS is not installed
   // add args between kkfirst and kklast to Kokkos instantiation
 
   kokkos = NULL;
   if (kokkosflag == 1) {
     kokkos = new KokkosLMP(this,kklast-kkfirst,&arg[kkfirst]);
     if (!kokkos->kokkos_exists)
       error->all(FLERR,"Cannot use -kokkos on without KOKKOS installed");
   }
 
   MPI_Comm_rank(world,&me);
   if (kokkos && me == 0) error->message(FLERR,"KOKKOS mode is enabled");
 
   // allocate CiteMe class if enabled
 
   if (citeflag) citeme = new CiteMe(this);
   else citeme = NULL;
 
   // allocate input class now that MPI is fully setup
 
   input = new Input(this,narg,arg);
 
   // allocate top-level classes
 
   create();
   post_create();
 
   // if helpflag set, print help and quit
 
   if (helpflag) {
     if (universe->me == 0 && screen) help();
     error->done();
   }
 
   // if restartflag set, invoke 2 commands and quit
   // add args between wdfirst and wdlast to write_data command
   // also add "noinit" to prevent write_data from doing system init
 
   if (restartflag) {
     char cmd[128];
     sprintf(cmd,"read_restart %s\n",rfile);
     if (restartremapflag) strcat(cmd," remap\n");
     input->one(cmd);
     sprintf(cmd,"write_data %s",dfile);
     for (iarg = wdfirst; iarg < wdlast; iarg++)
       sprintf(&cmd[strlen(cmd)]," %s",arg[iarg]);
     strcat(cmd," noinit\n");
     input->one(cmd);
     error->done();
   }
 }
 
 /* ----------------------------------------------------------------------
    tear down a LAMMPS instance
    top-level classes are deleted first via destroy()
    output files were already closed in destroy()
    then close world screen/log files, universe log file, input file
    free the world communicator if it is not the universe communicator
    fundamental classes (input, universe, error, memory) are deleted last
 ------------------------------------------------------------------------- */
 
 LAMMPS::~LAMMPS()
 {
   destroy();
 
   delete citeme;
 
   // world screen: stdout itself is never closed nor cleared
 
   if (screen != stdout) {
     if (screen) fclose(screen);
     screen = NULL;
   }
 
   // world logfile
 
   if (logfile) {
     fclose(logfile);
     logfile = NULL;
   }
 
   // universe logfile is only closed here when there is more than one world
 
   if (universe->nworlds != 1 && universe->ulogfile)
     fclose(universe->ulogfile);
 
   if (infile != NULL && infile != stdin) fclose(infile);
 
   if (universe->uworld != world) MPI_Comm_free(&world);
 
   delete cuda;
   delete kokkos;
   delete [] suffix;
+  delete [] suffix2;
 
   delete input;
   delete universe;
   delete error;
   delete memory;
 }
 
 /* ----------------------------------------------------------------------
    allocate single instance of top-level classes
    fundamental classes are allocated in constructor
    some classes have package variants
    for every class with variants, the Cuda variant is chosen when cuda
      is set, else the Kokkos variant when kokkos is set, else the default
    statement order matters, see the "must be after" notes below
 ------------------------------------------------------------------------- */
 
 void LAMMPS::create()
 {
   // Comm class must be created before Atom class
   // so that nthreads is defined when create_avec invokes grow()
 
   if (cuda) comm = new CommCuda(this);
   else if (kokkos) comm = new CommKokkos(this);
   else comm = new CommBrick(this);
 
   if (cuda) neighbor = new NeighborCuda(this);
   else if (kokkos) neighbor = new NeighborKokkos(this);
   else neighbor = new Neighbor(this);
 
   // default Domain is replaced by DomainOMP when LMP_USER_OMP is defined
 
   if (cuda) domain = new DomainCuda(this);
   else if (kokkos) domain = new DomainKokkos(this);
 #ifdef LMP_USER_OMP
   else domain = new DomainOMP(this);
 #else
   else domain = new Domain(this);
 #endif
 
   // Atom has only a Kokkos variant here, no Cuda one
   // initial atom style is "atomic" with no extra arguments
   // NOTE(review): last create_avec() argument is presumably a flag that
   //   enables suffix handling - confirm against Atom::create_avec()
 
   if (kokkos) atom = new AtomKokkos(this);
   else atom = new Atom(this);
-  atom->create_avec("atomic",0,NULL,suffix);
+  atom->create_avec("atomic",0,NULL,1);
 
   group = new Group(this);
   force = new Force(this);    // must be after group, to create temperature
 
   if (cuda) modify = new ModifyCuda(this);
   else if (kokkos) modify = new ModifyKokkos(this);
   else modify = new Modify(this);
 
   output = new Output(this);  // must be after group, so "all" exists
                               // must be after modify so can create Computes
   update = new Update(this);  // must be after output, force, neighbor
   timer = new Timer(this);
 }
 
 /* ----------------------------------------------------------------------
    invoke package-specific setup commands
    called from LAMMPS constructor and after clear() command
    only invoke if suffix is set and enabled
+   suffix2 is tested against its own value, independently of suffix
 ------------------------------------------------------------------------- */
 
 void LAMMPS::post_create()
 {
-  if (suffix && suffix_enable) {
+  if (!suffix_enable) return;        // suffixes disabled: issue no commands
+  if (suffix) {
     if (strcmp(suffix,"gpu") == 0) input->one("package gpu force/neigh 0 0 1");
     if (strcmp(suffix,"omp") == 0) input->one("package omp *");
+    if (strcmp(suffix,"intel") == 0)
+      input->one("package intel * mixed balance -1");
+  }
+  if (suffix2) {                     // must compare suffix2 here, not suffix
+    if (strcmp(suffix2,"omp") == 0) input->one("package omp *");
   }
 }
 
 /* ----------------------------------------------------------------------
    initialize top-level classes
    do not initialize Timer class, other classes like Run() do that explicitly
    accelerator classes, if present, are invoked before any init() call
    the init() calls are order-dependent, see the per-line notes
 ------------------------------------------------------------------------- */
 
 void LAMMPS::init()
 {
   // NOTE(review): accelerator(0,NULL) is presumably a no-argument
   //   (re)configuration call - confirm in the Cuda and KokkosLMP classes
 
   if (cuda) cuda->accelerator(0,NULL);
   if (kokkos) kokkos->accelerator(0,NULL);
 
   update->init();
   force->init();         // pair must come after update due to minimizer
   domain->init();
   atom->init();          // atom must come after force and domain
                          //   atom deletes extra array
                          //   used by fix shear_history::unpack_restart()
                          //   when force->pair->gran_history creates fix ??
                          //   atom_vec init uses deform_vremap
   modify->init();        // modify must come after update, force, atom, domain
   neighbor->init();      // neighbor must come after force, modify
   comm->init();          // comm must come after force, modify, neighbor, atom
   output->init();        // output must come after domain, force, modify
 }
 
 /* ----------------------------------------------------------------------
    delete single instance of top-level classes
    fundamental classes are deleted in destructor
    invoked first thing by the LAMMPS destructor
    deletion order is significant, see the per-line notes
    only the modify pointer is reset to NULL afterwards,
      all other top-level pointers are left holding their old values
 ------------------------------------------------------------------------- */
 
 void LAMMPS::destroy()
 {
   delete update;
   delete neighbor;
   delete comm;
   delete force;
   delete group;
   delete output;
   delete modify;          // modify must come after output, force, update
                           //   since they delete fixes
   delete domain;          // domain must come after modify
                           //   since fix destructors access domain
   delete atom;            // atom must come after modify, neighbor
                           //   since fixes delete callbacks in atom
   delete timer;
 
   modify = NULL;          // necessary since input->variable->varreader
                           // will be destructed later
 }
 
 /* ----------------------------------------------------------------------
    help message for command line options and styles present in executable
    writes everything to screen, caller must ensure screen is not NULL
    each style section defines its XXX_CLASS guard and XxxStyle() macro,
      then includes the matching style_xxx.h so that, presumably, every
      style registered in that header expands to one print_style() call
    pos is reset to 80 before each section so that print_style()
      starts the section's first entry on a fresh line
    only the XXX_CLASS guards are #undef'd, the XxxStyle macros stay defined
 ------------------------------------------------------------------------- */
 
 void LAMMPS::help()
 {
   // NOTE(review): the -suffix line below does not list "intel",
   //   although post_create() handles an "intel" suffix
 
   fprintf(screen,
           "\nCommand line options:\n\n"
           "-cuda on/off                : turn CUDA mode on or off (-c)\n"
           "-echo none/screen/log/both  : echoing of input script (-e)\n"
           "-in filename                : read input from file, not stdin (-i)\n"
           "-help                       : print this help message (-h)\n"
           "-kokkos on/off ...          : turn KOKKOS mode on or off (-k)\n"
           "-log none/filename          : where to send log output (-l)\n"
           "-nocite                     : disable writing log.cite file (-nc)\n"
           "-partition size1 size2 ...  : assign partition sizes (-p)\n"
           "-plog basename              : basename for partition logs (-pl)\n"
           "-pscreen basename           : basename for partition screens (-ps)\n"
           "-reorder topology-specs     : processor reordering (-r)\n"
           "-screen none/filename       : where to send screen output (-sc)\n"
           "-suffix cuda/gpu/opt/omp    : style suffix to apply (-sf)\n"
           "-var varname value          : set index style variable (-v)\n\n");
   
   fprintf(screen,"Style options compiled with this executable\n\n");
 
   // pos = current output column, passed by reference to print_style()
 
   int pos = 80;
   fprintf(screen,"* Atom styles:\n");
 #define ATOM_CLASS
 #define AtomStyle(key,Class) print_style(#key,pos);
 #include "style_atom.h"
 #undef ATOM_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Integrate styles:\n");
 #define INTEGRATE_CLASS
 #define IntegrateStyle(key,Class) print_style(#key,pos);
 #include "style_integrate.h"
 #undef INTEGRATE_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Minimize styles:\n");
 #define MINIMIZE_CLASS
 #define MinimizeStyle(key,Class) print_style(#key,pos);
 #include "style_minimize.h"
 #undef MINIMIZE_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Pair styles:\n");
 #define PAIR_CLASS
 #define PairStyle(key,Class) print_style(#key,pos);
 #include "style_pair.h"
 #undef PAIR_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Bond styles:\n");
 #define BOND_CLASS
 #define BondStyle(key,Class) print_style(#key,pos);
 #include "style_bond.h"
 #undef BOND_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Angle styles:\n");
 #define ANGLE_CLASS
 #define AngleStyle(key,Class) print_style(#key,pos);
 #include "style_angle.h"
 #undef ANGLE_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Dihedral styles:\n");
 #define DIHEDRAL_CLASS
 #define DihedralStyle(key,Class) print_style(#key,pos);
 #include "style_dihedral.h"
 #undef DIHEDRAL_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Improper styles:\n");
 #define IMPROPER_CLASS
 #define ImproperStyle(key,Class) print_style(#key,pos);
 #include "style_improper.h"
 #undef IMPROPER_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* KSpace styles:\n");
 #define KSPACE_CLASS
 #define KSpaceStyle(key,Class) print_style(#key,pos);
 #include "style_kspace.h"
 #undef KSPACE_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Fix styles\n");
 #define FIX_CLASS
 #define FixStyle(key,Class) print_style(#key,pos);
 #include "style_fix.h"
 #undef FIX_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Compute styles:\n");
 #define COMPUTE_CLASS
 #define ComputeStyle(key,Class) print_style(#key,pos);
 #include "style_compute.h"
 #undef COMPUTE_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Region styles:\n");
 #define REGION_CLASS
 #define RegionStyle(key,Class) print_style(#key,pos);
 #include "style_region.h"
 #undef REGION_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Dump styles:\n");
 #define DUMP_CLASS
 #define DumpStyle(key,Class) print_style(#key,pos);
 #include "style_dump.h"
 #undef DUMP_CLASS
   fprintf(screen,"\n\n");
 
   pos = 80;
   fprintf(screen,"* Command styles\n");
 #define COMMAND_CLASS
 #define CommandStyle(key,Class) print_style(#key,pos);
 #include "style_command.h"
 #undef COMMAND_CLASS
   fprintf(screen,"\n");
 }
 
 /* ----------------------------------------------------------------------
    print style names in columns
    skip any style that starts with upper-case letter, since internal
    str = style name to print to screen
    pos = current output column, updated in place
    a newline is emitted first if the name would extend past column 80
    each name is left-justified in a field that is a multiple of 16 wide,
      with a maximum field width of 80
 ------------------------------------------------------------------------- */
 
 void LAMMPS::print_style(const char *str, int &pos)
 {
   // cast is required: passing a negative plain char to isupper() is undefined
 
   if (isupper((unsigned char) str[0])) return;
 
   int len = strlen(str);
   if (pos+len > 80) {
     fprintf(screen,"\n");
     pos = 0;
   }
 
   // field width = len rounded up to the next multiple of 16, capped at 80
   // always leaves at least one blank after names shorter than 64 chars
 
   int width = 16*(len/16 + 1);
   if (width > 80) width = 80;
 
   fprintf(screen,"%-*s",width,str);
   pos += width;
 }
diff --git a/src/lammps.h b/src/lammps.h
index 44c7921bd..8ff0eca06 100644
--- a/src/lammps.h
+++ b/src/lammps.h
@@ -1,168 +1,171 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_LAMMPS_H
 #define LMP_LAMMPS_H
 
 #include "stdio.h"
 
 namespace LAMMPS_NS {
 
 // top-level class: one instance holds pointers to every fundamental and
 // top-level class, plus the communicator and I/O streams they share
 // NOTE(review): MPI_Comm is used below but this header only includes
 //   stdio.h, so the MPI header presumably must be included first - confirm
 class LAMMPS {
  public:
                                  // ptrs to fundamental LAMMPS classes
   class Memory *memory;          // memory allocation functions
   class Error *error;            // error handling
   class Universe *universe;      // universe of processors
   class Input *input;            // input script processing
                                  // ptrs to top-level LAMMPS-specific classes
   class Atom *atom;              // atom-based quantities
   class Update *update;          // integrators/minimizers
   class Neighbor *neighbor;      // neighbor lists
   class Comm *comm;              // inter-processor communication
   class Domain *domain;          // simulation box
   class Force *force;            // inter-particle forces
   class Modify *modify;          // fixes and computes
   class Group *group;            // groups of atoms
   class Output *output;          // thermo/dump/restart
   class Timer *timer;            // CPU timing info
 
   MPI_Comm world;                // MPI communicator
   FILE *infile;                  // infile
   FILE *screen;                  // screen output
   FILE *logfile;                 // logfile
 
-  char *suffix;                  // suffix to add to input script style names
-  int suffix_enable;             // 1 if suffix enabled, 0 if disabled
+  char *suffix,*suffix2;         // suffixes to add to input script style names
+  int suffix_enable;             // 1 if suffixes are enabled, 0 if disabled
   int cite_enable;               // 1 if generating log.cite, 0 if disabled
 
   class Cuda *cuda;              // CUDA accelerator class
+  //class GPU *gpu;                // GPU accelerator class
+  //class Intel *intel;            // Intel accelerator class
+  //class OMP *omp;                // OMP accelerator class
   class KokkosLMP *kokkos;       // KOKKOS accelerator class
 
   class CiteMe *citeme;          // citation info
 
   // NOTE(review): constructor args are presumably the command-line
   //   argument count, argument vector and communicator - confirm
   LAMMPS(int, char **, MPI_Comm);
   ~LAMMPS();                     // calls destroy(), closes files, frees rest
   void create();                 // allocate top-level classes
   void post_create();            // issue package commands for active suffixes
   void init();                   // call init() of top-level classes
   void destroy();                // delete top-level classes
 
  private:
   void help();                   // print command-line options and styles
   void print_style(const char *, int &);  // print one style name in columns
   LAMMPS() {};                   // prohibit using the default constructor
   LAMMPS(const LAMMPS &) {};     // prohibit using the copy constructor
 };
 
 }
 
 #endif
 
 /* ERROR/WARNING messages:
 
 E: Invalid command-line argument
 
 One or more command-line arguments is invalid.  Check the syntax of
 the command you are using to launch LAMMPS.
 
 E: Cannot use -reorder after -partition
 
 Self-explanatory.  See doc page discussion of command-line switches.
 
 E: Processor partitions are inconsistent
 
 The total number of processors in all partitions must match the number
 of processors LAMMPS is running on.
 
 E: Must use -in switch with multiple partitions
 
 A multi-partition simulation cannot read the input script from stdin.
 The -in command-line option must be used to specify a file.
 
 E: Can only use -pscreen with multiple partitions
 
 Self-explanatory.  See doc page discussion of command-line switches.
 
 E: Can only use -plog with multiple partitions
 
 Self-explanatory.  See doc page discussion of command-line switches.
 
 E: Cannot open universe screen file
 
 For a multi-partition run, the master screen file cannot be opened.
 Check that the directory you are running in allows for files to be
 created.
 
 E: Cannot open log.lammps for writing
 
 The default LAMMPS log file cannot be opened.  Check that the
 directory you are running in allows for files to be created.
 
 E: Cannot open universe log file
 
 For a multi-partition run, the master log file cannot be opened.
 Check that the directory you are running in allows for files to be
 created.
 
 E: Cannot open input script %s
 
 Self-explanatory.
 
 E: Cannot open screen file
 
 The screen file specified as a command-line argument cannot be
 opened.  Check that the directory you are running in allows for files
 to be created.
 
 E: Cannot open logfile
 
 The LAMMPS log file named in a command-line argument cannot be opened.
 Check that the path and name are correct.
 
 E: Smallint setting in lmptype.h is invalid
 
 It has to be the size of an integer.
 
 E: Imageint setting in lmptype.h is invalid
 
 Imageint must be as large or larger than smallint.
 
 E: Tagint setting in lmptype.h is invalid
 
 Tagint must be as large or larger than smallint.
 
 E: Bigint setting in lmptype.h is invalid
 
 Size of bigint is less than size of tagint.
 
 E: MPI_LMP_TAGINT and tagint in lmptype.h are not compatible
 
 The size of the MPI datatype does not match the size of a tagint.
 
 E: MPI_LMP_BIGINT and bigint in lmptype.h are not compatible
 
 The size of the MPI datatype does not match the size of a bigint.
 
 E: Small to big integers are not sized correctly
 
 This error occurs when the sizes of smallint, imageint, tagint, bigint,
 as defined in src/lmptype.h are not what is expected.  Contact
 the developers if this occurs.
 
 E: Cannot use -cuda on without USER-CUDA installed
 
 The USER-CUDA package must be installed via "make yes-user-cuda"
 before LAMMPS is built.
 
 */
diff --git a/src/modify.cpp b/src/modify.cpp
index b55f368fe..065838f4c 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -1,1241 +1,1277 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "stdio.h"
 #include "string.h"
 #include "modify.h"
 #include "style_compute.h"
 #include "style_fix.h"
 #include "atom.h"
 #include "comm.h"
 #include "fix.h"
 #include "compute.h"
 #include "group.h"
 #include "update.h"
 #include "domain.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 #define DELTA 4
 #define BIG 1.0e20
-#define NEXCEPT 4       // change when add to exceptions in add_fix()
+#define NEXCEPT 5       // change when add to exceptions in add_fix()
 
 /* ----------------------------------------------------------------------
    construct an empty Modify instance
    zero all fix/compute counters, NULL all per-stage fix lists
    build the style-name -> creator maps for fixes and computes
      from the entries listed in style_fix.h and style_compute.h
 ------------------------------------------------------------------------- */
 
 Modify::Modify(LAMMPS *lmp) : Pointers(lmp)
 {
   nfix = maxfix = 0;
   n_initial_integrate = n_post_integrate = 0;
   n_pre_exchange = n_pre_neighbor = 0;
   n_pre_force = n_post_force = 0;
   n_final_integrate = n_end_of_step = n_thermo_energy = 0;
   n_initial_integrate_respa = n_post_integrate_respa = 0;
   n_pre_force_respa = n_post_force_respa = n_final_integrate_respa = 0;
 
   // n_min_pre_neighbor must be zeroed as well, since
   // setup_pre_neighbor() uses it as a loop bound
 
   n_min_pre_exchange = n_min_pre_neighbor = 0;
   n_min_pre_force = n_min_post_force = n_min_energy = 0;
 
   fix = NULL;
   fmask = NULL;
   list_initial_integrate = list_post_integrate = NULL;
   list_pre_exchange = list_pre_neighbor = NULL;
   list_pre_force = list_post_force = NULL;
   list_final_integrate = list_end_of_step = NULL;
   list_thermo_energy = NULL;
   list_initial_integrate_respa = list_post_integrate_respa = NULL;
   list_pre_force_respa = list_post_force_respa = NULL;
   list_final_integrate_respa = NULL;
   list_min_pre_exchange = list_min_pre_neighbor = NULL;
   list_min_pre_force = list_min_post_force = NULL;
   list_min_energy = NULL;
 
   end_of_step_every = NULL;
 
   list_timeflag = NULL;
 
   nfix_restart_global = 0;
   id_restart_global = style_restart_global = state_restart_global = NULL;
   nfix_restart_peratom = 0;
   id_restart_peratom = style_restart_peratom = NULL;
   index_restart_peratom = NULL;
 
   ncompute = maxcompute = 0;
   compute = NULL;
 
   // fill map with fixes listed in style_fix.h
   // each FixStyle(key,Class) entry maps "key" to fix_creator<Class>
 
   fix_map = new std::map<std::string,FixCreator>();
 
 #define FIX_CLASS
 #define FixStyle(key,Class) \
   (*fix_map)[#key] = &fix_creator<Class>;
 #include "style_fix.h"
 #undef FixStyle
 #undef FIX_CLASS
 
   // fill map with computes listed in style_compute.h
   // each ComputeStyle(key,Class) entry maps "key" to compute_creator<Class>
 
   compute_map = new std::map<std::string,ComputeCreator>();
 
 #define COMPUTE_CLASS
 #define ComputeStyle(key,Class) \
   (*compute_map)[#key] = &compute_creator<Class>;
 #include "style_compute.h"
 #undef ComputeStyle
 #undef COMPUTE_CLASS
 }
 
 /* ----------------------------------------------------------------------
    delete all fixes and computes, then the storage that indexed them:
      fix/compute arrays, per-stage fix lists, restart info, style maps
 ------------------------------------------------------------------------- */
 
 Modify::~Modify()
 {
   // delete all fixes
   // do it via delete_fix() so callbacks in Atom are also updated correctly
   // loop always removes fix[0] and ends when nfix is 0,
   //   so it relies on delete_fix() decrementing nfix
 
   while (nfix) delete_fix(fix[0]->id);
   memory->sfree(fix);
   memory->destroy(fmask);
 
   // delete all computes
 
   for (int i = 0; i < ncompute; i++) delete compute[i];
   memory->sfree(compute);
 
   // per-stage lists of fix indices
 
   delete [] list_initial_integrate;
   delete [] list_post_integrate;
   delete [] list_pre_exchange;
   delete [] list_pre_neighbor;
   delete [] list_pre_force;
   delete [] list_post_force;
   delete [] list_final_integrate;
   delete [] list_end_of_step;
   delete [] list_thermo_energy;
   delete [] list_initial_integrate_respa;
   delete [] list_post_integrate_respa;
   delete [] list_pre_force_respa;
   delete [] list_post_force_respa;
   delete [] list_final_integrate_respa;
   delete [] list_min_pre_exchange;
   delete [] list_min_pre_neighbor;
   delete [] list_min_pre_force;
   delete [] list_min_post_force;
   delete [] list_min_energy;
 
   delete [] end_of_step_every;
   delete [] list_timeflag;
 
   restart_deallocate();
 
   // style-name -> creator maps allocated in the constructor
 
   delete compute_map;
   delete fix_map;
 }
 
 /* ----------------------------------------------------------------------
    initialize all fixes and computes
    discard restart info, rebuild the per-stage fix lists,
      init every fix and compute, reset compute invocation flags
    error if a fix or compute uses a dynamic group it does not allow
    warn if any local atom is time integrated by more than one fix
 ------------------------------------------------------------------------- */
 
 void Modify::init()
 {
   int i,j;
 
   // delete storage of restart info since it is not valid after 1st run
 
   restart_deallocate();
 
   // create lists of fixes to call at each stage of run
 
   list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate);
   list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate);
   list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange);
   list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor);
   list_init(PRE_FORCE,n_pre_force,list_pre_force);
   list_init(POST_FORCE,n_post_force,list_post_force);
   list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate);
   list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step);
   list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy);
 
   list_init(INITIAL_INTEGRATE_RESPA,
             n_initial_integrate_respa,list_initial_integrate_respa);
   list_init(POST_INTEGRATE_RESPA,
             n_post_integrate_respa,list_post_integrate_respa);
   list_init(POST_FORCE_RESPA,
             n_post_force_respa,list_post_force_respa);
   list_init(PRE_FORCE_RESPA,
             n_pre_force_respa,list_pre_force_respa);
   list_init(FINAL_INTEGRATE_RESPA,
             n_final_integrate_respa,list_final_integrate_respa);
 
   list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange);
   list_init(MIN_PRE_NEIGHBOR,n_min_pre_neighbor,list_min_pre_neighbor);
   list_init(MIN_PRE_FORCE,n_min_pre_force,list_min_pre_force);
   list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force);
   list_init(MIN_ENERGY,n_min_energy,list_min_energy);
 
   // init each fix
   // not sure if now needs to come before compute init
   // used to b/c temperature computes called fix->dof() in their init,
   // and fix rigid required its own init before its dof() could be called,
   // but computes now do their DOF in setup()
 
   for (i = 0; i < nfix; i++) fix[i]->init();
 
   // set global flag if any fix has its restart_pbc flag set
 
   restart_pbc_any = 0;
   for (i = 0; i < nfix; i++)
     if (fix[i]->restart_pbc) restart_pbc_any = 1;
 
   // create list of computes that store invocation times
 
   list_init_compute();
 
   // init each compute
   // set invoked_scalar,vector,etc to -1 to force new run to re-compute them
   // add initial timestep to all computes that store invocation times
   //   since any of them may be invoked by initial thermo
   // do not clear out invocation times stored within a compute,
   //   b/c some may be holdovers from previous run, like for ave fixes
 
   for (i = 0; i < ncompute; i++) {
     compute[i]->init();
     compute[i]->invoked_scalar = -1;
     compute[i]->invoked_vector = -1;
     compute[i]->invoked_array = -1;
     compute[i]->invoked_peratom = -1;
     compute[i]->invoked_local = -1;
   }
   addstep_compute_all(update->ntimestep);
 
   // error if any fix or compute is using a dynamic group when not allowed
 
   for (i = 0; i < nfix; i++)
     if (!fix[i]->dynamic_group_allow && group->dynamic[fix[i]->igroup]) {
       char str[128];
       sprintf(str,"Fix %s does not allow use of dynamic group",fix[i]->id);
       error->all(FLERR,str);
     }
 
   // i indexes compute[] in this loop, so the ID in the message must be
   // taken from compute[i], not from fix[i]
 
   for (i = 0; i < ncompute; i++)
     if (!compute[i]->dynamic_group_allow &&
         group->dynamic[compute[i]->igroup]) {
       char str[128];
       sprintf(str,"Compute %s does not allow use of dynamic group",
               compute[i]->id);
       error->all(FLERR,str);
     }
 
   // warn if any particle is time integrated more than once
   // flag[j] = number of time-integrating fixes whose group contains atom j
 
   int nlocal = atom->nlocal;
   int *mask = atom->mask;
 
   int *flag = new int[nlocal];
   for (i = 0; i < nlocal; i++) flag[i] = 0;
 
   int groupbit;
   for (i = 0; i < nfix; i++) {
     if (fix[i]->time_integrate == 0) continue;
     groupbit = fix[i]->groupbit;
     for (j = 0; j < nlocal; j++)
       if (mask[j] & groupbit) flag[j]++;
   }
 
   int check = 0;
   for (i = 0; i < nlocal; i++)
     if (flag[i] > 1) check = 1;
 
   delete [] flag;
 
   // sum the per-processor flags so proc 0 warns if any proc found a case
 
   int checkall;
   MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world);
   if (comm->me == 0 && checkall)
     error->warning(FLERR,
                    "One or more atoms are time integrated more than once");
 }
 
 /* ----------------------------------------------------------------------
    setup for run, calls setup() of all fixes and computes
    called from Verlet, RESPA, Min
 ------------------------------------------------------------------------- */
 
 void Modify::setup(int vflag)
 {
   // compute setup needs to come before fix setup
   // b/c NH fixes need DOF of temperature computes
 
   for (int i = 0; i < ncompute; i++) compute[i]->setup();
 
   if (update->whichflag == 1)
     for (int i = 0; i < nfix; i++) fix[i]->setup(vflag);
   else if (update->whichflag == 2)
     for (int i = 0; i < nfix; i++) fix[i]->min_setup(vflag);
 }
 
 /* ----------------------------------------------------------------------
    setup pre_exchange call, only for fixes that define pre_exchange
    called from Verlet, RESPA, Min, and WriteRestart with whichflag = 0
    whichflag <= 1 uses the regular list, whichflag == 2 the min list
 ------------------------------------------------------------------------- */
 
 void Modify::setup_pre_exchange()
 {
   const int mode = update->whichflag;
 
   if (mode == 2) {
     for (int k = 0; k < n_min_pre_exchange; k++) {
       fix[list_min_pre_exchange[k]]->min_setup_pre_exchange();
     }
     return;
   }
 
   if (mode > 1) return;
 
   for (int k = 0; k < n_pre_exchange; k++) {
     fix[list_pre_exchange[k]]->setup_pre_exchange();
   }
 }
 
 /* ----------------------------------------------------------------------
    setup pre_neighbor call, only for fixes that define pre_neighbor
    called from Verlet, RESPA
 ------------------------------------------------------------------------- */
 
 void Modify::setup_pre_neighbor()
 {
   // whichflag == 1 walks the regular pre_neighbor list,
   // whichflag == 2 walks the minimizer pre_neighbor list
 
   if (update->whichflag == 1) {
     for (int k = 0; k < n_pre_neighbor; k++) {
       fix[list_pre_neighbor[k]]->setup_pre_neighbor();
     }
   } else if (update->whichflag == 2) {
     for (int k = 0; k < n_min_pre_neighbor; k++) {
       fix[list_min_pre_neighbor[k]]->min_setup_pre_neighbor();
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    setup pre_force call, only for fixes that define pre_force
    called from Verlet, RESPA, Min
 ------------------------------------------------------------------------- */
 
 void Modify::setup_pre_force(int vflag)
 {
   // whichflag == 1 walks the regular pre_force list,
   // whichflag == 2 walks the minimizer pre_force list
   // vflag is handed through unchanged to each fix
 
   if (update->whichflag == 1) {
     for (int k = 0; k < n_pre_force; k++) {
       fix[list_pre_force[k]]->setup_pre_force(vflag);
     }
   } else if (update->whichflag == 2) {
     for (int k = 0; k < n_min_pre_force; k++) {
       fix[list_min_pre_force[k]]->min_setup_pre_force(vflag);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    1st half of integrate call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::initial_integrate(int vflag)
 {
   // invoke each fix indexed by list_initial_integrate, in list order
   int k = 0;
   while (k < n_initial_integrate) {
     fix[list_initial_integrate[k]]->initial_integrate(vflag);
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    post_integrate call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::post_integrate()
 {
   // invoke each fix indexed by list_post_integrate, in list order
   int k = 0;
   while (k < n_post_integrate) {
     fix[list_post_integrate[k]]->post_integrate();
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    pre_exchange call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::pre_exchange()
 {
   // invoke each fix indexed by list_pre_exchange, in list order
   int k = 0;
   while (k < n_pre_exchange) {
     fix[list_pre_exchange[k]]->pre_exchange();
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    pre_neighbor call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::pre_neighbor()
 {
   // invoke each fix indexed by list_pre_neighbor, in list order
   int k = 0;
   while (k < n_pre_neighbor) {
     fix[list_pre_neighbor[k]]->pre_neighbor();
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    pre_force call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::pre_force(int vflag)
 {
   for (int i = 0; i < n_pre_force; i++)
     fix[list_pre_force[i]]->pre_force(vflag);
 }
 
 /* ----------------------------------------------------------------------
    post_force call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::post_force(int vflag)
 {
   for (int i = 0; i < n_post_force; i++)
     fix[list_post_force[i]]->post_force(vflag);
 }
 
 /* ----------------------------------------------------------------------
    2nd half of integrate call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::final_integrate()
 {
   for (int i = 0; i < n_final_integrate; i++)
     fix[list_final_integrate[i]]->final_integrate();
 }
 
 /* ----------------------------------------------------------------------
    end-of-timestep call, only for relevant fixes
    only call fix->end_of_step() on timesteps that are multiples of nevery
 ------------------------------------------------------------------------- */
 
 void Modify::end_of_step()
 {
   // end_of_step_every[k] is the interval paired with list_end_of_step[k]
   // a fix is skipped unless the current timestep is a multiple of it
   for (int k = 0; k < n_end_of_step; k++) {
     if (update->ntimestep % end_of_step_every[k] != 0) continue;
     fix[list_end_of_step[k]]->end_of_step();
   }
 }
 
 /* ----------------------------------------------------------------------
    thermo energy call, only for relevant fixes
    called by Thermo class
    compute_scalar() is fix call to return energy
 ------------------------------------------------------------------------- */
 
 double Modify::thermo_energy()
 {
   // sum compute_scalar() over the fixes indexed by list_thermo_energy
   double total = 0.0;
   int k = 0;
   while (k < n_thermo_energy) {
     total += fix[list_thermo_energy[k]]->compute_scalar();
     k++;
   }
   return total;
 }
 
 /* ----------------------------------------------------------------------
    post_run call
 ------------------------------------------------------------------------- */
 
 void Modify::post_run()
 {
   for (int i = 0; i < nfix; i++) fix[i]->post_run();
 }
 
 /* ----------------------------------------------------------------------
    setup rRESPA pre_force call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::setup_pre_force_respa(int vflag, int ilevel)
 {
   for (int i = 0; i < n_pre_force_respa; i++)
     fix[list_pre_force_respa[i]]->setup_pre_force_respa(vflag,ilevel);
 }
 
 /* ----------------------------------------------------------------------
    1st half of rRESPA integrate call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::initial_integrate_respa(int vflag, int ilevel, int iloop)
 {
   for (int i = 0; i < n_initial_integrate_respa; i++)
     fix[list_initial_integrate_respa[i]]->
       initial_integrate_respa(vflag,ilevel,iloop);
 }
 
 /* ----------------------------------------------------------------------
    rRESPA post_integrate call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::post_integrate_respa(int ilevel, int iloop)
 {
   for (int i = 0; i < n_post_integrate_respa; i++)
     fix[list_post_integrate_respa[i]]->post_integrate_respa(ilevel,iloop);
 }
 
 /* ----------------------------------------------------------------------
    rRESPA pre_force call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::pre_force_respa(int vflag, int ilevel, int iloop)
 {
   for (int i = 0; i < n_pre_force_respa; i++)
     fix[list_pre_force_respa[i]]->pre_force_respa(vflag,ilevel,iloop);
 }
 
 /* ----------------------------------------------------------------------
    rRESPA post_force call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::post_force_respa(int vflag, int ilevel, int iloop)
 {
   for (int i = 0; i < n_post_force_respa; i++)
     fix[list_post_force_respa[i]]->post_force_respa(vflag,ilevel,iloop);
 }
 
 /* ----------------------------------------------------------------------
    2nd half of rRESPA integrate call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::final_integrate_respa(int ilevel, int iloop)
 {
   for (int i = 0; i < n_final_integrate_respa; i++)
     fix[list_final_integrate_respa[i]]->final_integrate_respa(ilevel,iloop);
 }
 
 /* ----------------------------------------------------------------------
    minimizer pre-exchange call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_pre_exchange()
 {
   // invoke each fix indexed by list_min_pre_exchange, in list order
   int k = 0;
   while (k < n_min_pre_exchange) {
     fix[list_min_pre_exchange[k]]->min_pre_exchange();
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    minimizer pre-neighbor call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_pre_neighbor()
 {
   // invoke each fix indexed by list_min_pre_neighbor, in list order
   int k = 0;
   while (k < n_min_pre_neighbor) {
     fix[list_min_pre_neighbor[k]]->min_pre_neighbor();
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    minimizer pre-force call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_pre_force(int vflag)
 {
   // invoke each fix indexed by list_min_pre_force, in list order
   int k = 0;
   while (k < n_min_pre_force) {
     fix[list_min_pre_force[k]]->min_pre_force(vflag);
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    minimizer force adjustment call, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_post_force(int vflag)
 {
   // invoke each fix indexed by list_min_post_force, in list order
   int k = 0;
   while (k < n_min_post_force) {
     fix[list_min_post_force[k]]->min_post_force(vflag);
     k++;
   }
 }
 
 /* ----------------------------------------------------------------------
    minimizer energy/force evaluation, only for relevant fixes
    return energy and forces on extra degrees of freedom
 ------------------------------------------------------------------------- */
 
 double Modify::min_energy(double *fextra)
 {
   int ifix,index;
 
   index = 0;
   double eng = 0.0;
   for (int i = 0; i < n_min_energy; i++) {
     ifix = list_min_energy[i];
     eng += fix[ifix]->min_energy(&fextra[index]);
     index += fix[ifix]->min_dof();
   }
   return eng;
 }
 
 /* ----------------------------------------------------------------------
    store current state of extra minimizer dof, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_store()
 {
   // forwarded to every fix in list_min_energy
   for (int k = 0; k < n_min_energy; k++) {
     const int which = list_min_energy[k];
     fix[which]->min_store();
   }
 }
 
 /* ----------------------------------------------------------------------
    manage state of extra minimizer dof on a stack, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_clearstore()
 {
   // forwarded to every fix in list_min_energy
   for (int k = 0; k < n_min_energy; k++) {
     const int which = list_min_energy[k];
     fix[which]->min_clearstore();
   }
 }
 
 void Modify::min_pushstore()
 {
   // forwarded to every fix in list_min_energy
   for (int k = 0; k < n_min_energy; k++) {
     const int which = list_min_energy[k];
     fix[which]->min_pushstore();
   }
 }
 
 void Modify::min_popstore()
 {
   // forwarded to every fix in list_min_energy
   for (int k = 0; k < n_min_energy; k++) {
     const int which = list_min_energy[k];
     fix[which]->min_popstore();
   }
 }
 
 /* ----------------------------------------------------------------------
    displace extra minimizer dof along vector hextra, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 void Modify::min_step(double alpha, double *hextra)
 {
   int ifix,index;
 
   index = 0;
   for (int i = 0; i < n_min_energy; i++) {
     ifix = list_min_energy[i];
     fix[ifix]->min_step(alpha,&hextra[index]);
     index += fix[ifix]->min_dof();
   }
 }
 
 /* ----------------------------------------------------------------------
    compute max allowed step size along vector hextra, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 double Modify::max_alpha(double *hextra)
 {
   int ifix,index;
 
   double alpha = BIG;
   index = 0;
   for (int i = 0; i < n_min_energy; i++) {
     ifix = list_min_energy[i];
     double alpha_one = fix[ifix]->max_alpha(&hextra[index]);
     alpha = MIN(alpha,alpha_one);
     index += fix[ifix]->min_dof();
   }
   return alpha;
 }
 
 /* ----------------------------------------------------------------------
    extract extra minimizer dof, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 int Modify::min_dof()
 {
   // total of min_dof() over the fixes in list_min_energy
   int total = 0;
   int k = 0;
   while (k < n_min_energy) {
     total += fix[list_min_energy[k]]->min_dof();
     k++;
   }
   return total;
 }
 
 /* ----------------------------------------------------------------------
    reset minimizer reference state of fix, only for relevant fixes
 ------------------------------------------------------------------------- */
 
 int Modify::min_reset_ref()
 {
   // every fix in list_min_energy is called, even after one returns non-zero
   // result is 1 if any call returned non-zero, else 0
 
   int any = 0;
   for (int k = 0; k < n_min_energy; k++) {
     const int one = fix[list_min_energy[k]]->min_reset_ref();
     if (one != 0) any = 1;
   }
   return any;
 }
 
 /* ----------------------------------------------------------------------
    add a new fix or replace one with same ID
 ------------------------------------------------------------------------- */
 
-void Modify::add_fix(int narg, char **arg, char *suffix)
+void Modify::add_fix(int narg, char **arg, int trysuffix)
 {
+  // arg[0] = fix ID, arg[1] = group ID, arg[2] = fix style
+  // if trysuffix is non-zero and lmp->suffix_enable is set,
+  //   style/suffix and then style/suffix2 are tried before the plain style
+
   if (narg < 3) error->all(FLERR,"Illegal fix command");
 
   // cannot define fix before box exists unless style is in exception list
   // don't like this way of checking for exceptions by adding fixes to list,
   //   but can't think of better way
   // too late if instantiate fix, then check flag set in fix constructor,
   //   since some fixes access domain settings in their constructor
-  // change NEXCEPT above when add new fix to this list
+  // MUST change NEXCEPT above when adding a new fix to this list
 
-  const char *exceptions[NEXCEPT] = {"GPU","OMP","property/atom","cmap"};
+  const char *exceptions[NEXCEPT] =
+    {"GPU","OMP","Intel","property/atom","cmap"};
 
   if (domain->box_exist == 0) {
     int m;
     for (m = 0; m < NEXCEPT; m++)
       if (strcmp(arg[2],exceptions[m]) == 0) break;
     if (m == NEXCEPT)
       error->all(FLERR,"Fix command before simulation box is defined");
   }
 
   // check group ID
 
   int igroup = group->find(arg[1]);
   if (igroup == -1) error->all(FLERR,"Could not find fix group ID");
 
   // if fix ID exists:
   //   set newflag = 0 so create new fix in same location in fix list
   //   error if new style does not match old style
   //     since can't replace it (all when-to-invoke ptrs would be invalid)
   //   warn if new group != old group
   //   delete old fix, but do not call update_callback(),
   //     since will replace this fix and thus other fix locs will not change
   //   set ptr to NULL in case new fix scans list of fixes,
   //     e.g. scan will occur in add_callback() if called by new fix
   // if fix ID does not exist:
   //   set newflag = 1 so create new fix
   //   extend fix and fmask lists as necessary
 
   int ifix,newflag;
   for (ifix = 0; ifix < nfix; ifix++)
     if (strcmp(arg[0],fix[ifix]->id) == 0) break;
 
   if (ifix < nfix) {
     newflag = 0;
-    if (strcmp(arg[2],fix[ifix]->style) != 0)
-      error->all(FLERR,"Replacing a fix, but new style != old style");
+
+    // styles match if identical, or if old style = new style/suffix
+    //   or new style/suffix2 when suffixes are being tried
+    // snprintf bounds the write: an over-long style name is truncated
+    //   instead of overflowing estyle
+
+    int match = 0;
+    if (strcmp(arg[2],fix[ifix]->style) == 0) match = 1;
+    if (!match && trysuffix && lmp->suffix_enable) {
+      char estyle[256];
+      if (lmp->suffix) {
+        snprintf(estyle,sizeof(estyle),"%s/%s",arg[2],lmp->suffix);
+        if (strcmp(estyle,fix[ifix]->style) == 0) match = 1;
+      }
+      if (lmp->suffix2) {
+        snprintf(estyle,sizeof(estyle),"%s/%s",arg[2],lmp->suffix2);
+        if (strcmp(estyle,fix[ifix]->style) == 0) match = 1;
+      }
+    }
+    if (!match) error->all(FLERR,"Replacing a fix, but new style != old style");
+
     if (fix[ifix]->igroup != igroup && comm->me == 0)
       error->warning(FLERR,"Replacing a fix, but new group != old group");
     delete fix[ifix];
     fix[ifix] = NULL;
+
   } else {
     newflag = 1;
     if (nfix == maxfix) {
       maxfix += DELTA;
       fix = (Fix **) memory->srealloc(fix,maxfix*sizeof(Fix *),"modify:fix");
       memory->grow(fmask,maxfix,"modify:fmask");
     }
   }
 
   // create the Fix
   // try first with suffix appended
 
   fix[ifix] = NULL;
 
-  if (suffix && lmp->suffix_enable) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[2],suffix);
-    if (fix_map->find(estyle) != fix_map->end()) {
-      FixCreator fix_creator = (*fix_map)[estyle];
-      fix[ifix] = fix_creator(lmp,narg,arg);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      char estyle[256];
+      snprintf(estyle,sizeof(estyle),"%s/%s",arg[2],lmp->suffix);
+      if (fix_map->find(estyle) != fix_map->end()) {
+        FixCreator fix_creator = (*fix_map)[estyle];
+        fix[ifix] = fix_creator(lmp,narg,arg);
+      }
+    }
+    if (fix[ifix] == NULL && lmp->suffix2) {
+      char estyle[256];
+      snprintf(estyle,sizeof(estyle),"%s/%s",arg[2],lmp->suffix2);
+      if (fix_map->find(estyle) != fix_map->end()) {
+        FixCreator fix_creator = (*fix_map)[estyle];
+        fix[ifix] = fix_creator(lmp,narg,arg);
+      }
     }
   }
 
   if (fix[ifix] == NULL && fix_map->find(arg[2]) != fix_map->end()) {
     FixCreator fix_creator = (*fix_map)[arg[2]];
     fix[ifix] = fix_creator(lmp,narg,arg);
   }
 
   if (fix[ifix] == NULL) error->all(FLERR,"Invalid fix style");
 
   // check if Fix is in restart_global list
   // if yes, pass state info to the Fix so it can reset itself
 
   for (int i = 0; i < nfix_restart_global; i++)
     if (strcmp(id_restart_global[i],fix[ifix]->id) == 0 &&
         strcmp(style_restart_global[i],fix[ifix]->style) == 0) {
       fix[ifix]->restart(state_restart_global[i]);
       if (comm->me == 0) {
         char *str = (char *) ("Resetting global state of Fix %s Style %s "
                               "from restart file info\n");
         if (screen) fprintf(screen,str,fix[ifix]->id,fix[ifix]->style);
         if (logfile) fprintf(logfile,str,fix[ifix]->id,fix[ifix]->style);
       }
     }
 
   // check if Fix is in restart_peratom list
   // if yes, loop over atoms so they can extract info from atom->extra array
 
   for (int i = 0; i < nfix_restart_peratom; i++)
     if (strcmp(id_restart_peratom[i],fix[ifix]->id) == 0 &&
         strcmp(style_restart_peratom[i],fix[ifix]->style) == 0) {
       for (int j = 0; j < atom->nlocal; j++)
         fix[ifix]->unpack_restart(j,index_restart_peratom[i]);
       fix[ifix]->restart_reset = 1;
       if (comm->me == 0) {
         char *str = (char *) ("Resetting per-atom state of Fix %s Style %s "
                      "from restart file info\n");
         if (screen) fprintf(screen,str,fix[ifix]->id,fix[ifix]->style);
         if (logfile) fprintf(logfile,str,fix[ifix]->id,fix[ifix]->style);
       }
     }
 
   // increment nfix (if new)
   // set fix mask values
   // post_construct() allows new fix to create other fixes
   // nfix increment comes first so that recursive call to add_fix within
   //   post_constructor() will see updated nfix
 
   if (newflag) nfix++;
   fmask[ifix] = fix[ifix]->setmask();
   fix[ifix]->post_constructor();
 }
 
 /* ----------------------------------------------------------------------
    one instance per fix in style_fix.h
 ------------------------------------------------------------------------- */
 
 template <typename T>
 Fix *Modify::fix_creator(LAMMPS *lmp, int narg, char **arg)
 {
   // heap-allocate a T from the command arguments; returned as a Fix pointer
   T *created = new T(lmp,narg,arg);
   return created;
 }
 
 /* ----------------------------------------------------------------------
    modify a Fix's parameters
 ------------------------------------------------------------------------- */
 
 void Modify::modify_fix(int narg, char **arg)
 {
   if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
 
   // arg[0] is the fix ID: scan until it matches or the list is exhausted
 
   int ifix = 0;
   while (ifix < nfix && strcmp(arg[0],fix[ifix]->id) != 0) ifix++;
   if (ifix == nfix) error->all(FLERR,"Could not find fix_modify ID");
 
   // remaining arguments go to the fix itself
 
   fix[ifix]->modify_params(narg-1,&arg[1]);
 }
 
 /* ----------------------------------------------------------------------
    delete a Fix from list of Fixes
    Atom class must update indices in its list of callbacks to fixes
 ------------------------------------------------------------------------- */
 
 void Modify::delete_fix(const char *id)
 {
   int ifix = find_fix(id);
   if (ifix < 0) error->all(FLERR,"Could not find fix ID to delete");
   delete fix[ifix];
   atom->update_callback(ifix);
 
   // move other Fixes and fmask down in list one slot
 
   for (int i = ifix+1; i < nfix; i++) fix[i-1] = fix[i];
   for (int i = ifix+1; i < nfix; i++) fmask[i-1] = fmask[i];
   nfix--;
 }
 
 /* ----------------------------------------------------------------------
    find a fix by ID
    return index of fix or -1 if not found
 ------------------------------------------------------------------------- */
 
 int Modify::find_fix(const char *id)
 {
   // first fix whose ID string equals id wins
   for (int ifix = 0; ifix < nfix; ifix++) {
     if (strcmp(id,fix[ifix]->id) == 0) return ifix;
   }
   return -1;
 }
 
 /* ----------------------------------------------------------------------
    add a new compute
 ------------------------------------------------------------------------- */
 
-void Modify::add_compute(int narg, char **arg, char *suffix)
+void Modify::add_compute(int narg, char **arg, int trysuffix)
 {
+  // arg[0] = compute ID, arg[2] = compute style
+  // if trysuffix is non-zero and lmp->suffix_enable is set,
+  //   style/suffix and then style/suffix2 are tried before the plain style
+
   if (narg < 3) error->all(FLERR,"Illegal compute command");
 
   // error check
 
   for (int icompute = 0; icompute < ncompute; icompute++)
     if (strcmp(arg[0],compute[icompute]->id) == 0)
       error->all(FLERR,"Reuse of compute ID");
 
   // extend Compute list if necessary
 
   if (ncompute == maxcompute) {
     maxcompute += DELTA;
     compute = (Compute **)
       memory->srealloc(compute,maxcompute*sizeof(Compute *),"modify:compute");
   }
 
   // create the Compute
   // try first with suffix appended
 
   compute[ncompute] = NULL;
 
-  if (suffix && lmp->suffix_enable) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[2],suffix);
-    if (compute_map->find(estyle) != compute_map->end()) {
-      ComputeCreator compute_creator = (*compute_map)[estyle];
-      compute[ncompute] = compute_creator(lmp,narg,arg);
+  // snprintf bounds the write: an over-long style name is truncated
+  //   instead of overflowing estyle
+
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      char estyle[256];
+      snprintf(estyle,sizeof(estyle),"%s/%s",arg[2],lmp->suffix);
+      if (compute_map->find(estyle) != compute_map->end()) {
+        ComputeCreator compute_creator = (*compute_map)[estyle];
+        compute[ncompute] = compute_creator(lmp,narg,arg);
+      }
+    }
+    if (compute[ncompute] == NULL && lmp->suffix2) {
+      char estyle[256];
+      snprintf(estyle,sizeof(estyle),"%s/%s",arg[2],lmp->suffix2);
+      if (compute_map->find(estyle) != compute_map->end()) {
+        ComputeCreator compute_creator = (*compute_map)[estyle];
+        compute[ncompute] = compute_creator(lmp,narg,arg);
+      }
     }
   }
 
   if (compute[ncompute] == NULL && 
       compute_map->find(arg[2]) != compute_map->end()) {
     ComputeCreator compute_creator = (*compute_map)[arg[2]];
     compute[ncompute] = compute_creator(lmp,narg,arg);
   }
 
   if (compute[ncompute] == NULL) error->all(FLERR,"Invalid compute style");
 
   ncompute++;
 }
 
 /* ----------------------------------------------------------------------
    one instance per compute in style_compute.h
 ------------------------------------------------------------------------- */
 
 template <typename T>
 Compute *Modify::compute_creator(LAMMPS *lmp, int narg, char **arg)
 {
   // heap-allocate a T from the command arguments; returned as a Compute pointer
   T *created = new T(lmp,narg,arg);
   return created;
 }
 
 /* ----------------------------------------------------------------------
    modify a Compute's parameters
 ------------------------------------------------------------------------- */
 
 void Modify::modify_compute(int narg, char **arg)
 {
   if (narg < 2) error->all(FLERR,"Illegal compute_modify command");
 
   // arg[0] is the compute ID: scan until it matches or the list is exhausted
 
   int icompute = 0;
   while (icompute < ncompute && strcmp(arg[0],compute[icompute]->id) != 0)
     icompute++;
   if (icompute == ncompute)
     error->all(FLERR,"Could not find compute_modify ID");
 
   // remaining arguments go to the compute itself
 
   compute[icompute]->modify_params(narg-1,&arg[1]);
 }
 
 /* ----------------------------------------------------------------------
    delete a Compute from list of Computes
 ------------------------------------------------------------------------- */
 
 void Modify::delete_compute(const char *id)
 {
   int icompute = find_compute(id);
   if (icompute < 0) error->all(FLERR,"Could not find compute ID to delete");
   delete compute[icompute];
 
   // move other Computes down in list one slot
 
   for (int i = icompute+1; i < ncompute; i++) compute[i-1] = compute[i];
   ncompute--;
 }
 
 /* ----------------------------------------------------------------------
    find a compute by ID
    return index of compute or -1 if not found
 ------------------------------------------------------------------------- */
 
 int Modify::find_compute(const char *id)
 {
   // first compute whose ID string equals id wins
   for (int icompute = 0; icompute < ncompute; icompute++) {
     if (strcmp(id,compute[icompute]->id) == 0) return icompute;
   }
   return -1;
 }
 
 /* ----------------------------------------------------------------------
    clear invoked flag of all computes
    called everywhere that computes are used, before computes are invoked
    invoked flag used to avoid re-invoking same compute multiple times
    and to flag computes that store invocation times as having been invoked
 ------------------------------------------------------------------------- */
 
 void Modify::clearstep_compute()
 {
   for (int icompute = 0; icompute < ncompute; icompute++)
     compute[icompute]->invoked_flag = 0;
 }
 
 /* ----------------------------------------------------------------------
    loop over computes that store invocation times
    if its invoked flag set on this timestep, schedule next invocation
    called everywhere that computes are used, after computes are invoked
 ------------------------------------------------------------------------- */
 
 void Modify::addstep_compute(bigint newstep)
 {
   // only computes indexed by list_timeflag are examined
   // addstep() is called on those whose invoked_flag is set
   for (int k = 0; k < n_timeflag; k++) {
     if (!compute[list_timeflag[k]]->invoked_flag) continue;
     compute[list_timeflag[k]]->addstep(newstep);
   }
 }
 
 /* ----------------------------------------------------------------------
    loop over all computes
    schedule next invocation for those that store invocation times
    called when not sure what computes will be needed on newstep
    do not loop only over n_timeflag, since may not be set yet
 ------------------------------------------------------------------------- */
 
 void Modify::addstep_compute_all(bigint newstep)
 {
   // walk the full compute list; timeflag decides who gets addstep()
   for (int k = 0; k < ncompute; k++) {
     if (!compute[k]->timeflag) continue;
     compute[k]->addstep(newstep);
   }
 }
 
 /* ----------------------------------------------------------------------
    write to restart file for all Fixes with restart info
    (1) fixes that have global state
    (2) fixes that store per-atom quantities
 ------------------------------------------------------------------------- */
 
 void Modify::write_restart(FILE *fp)
 {
   // layout written here, in order:
   //   count of fixes with restart_global set
   //   per such fix: id length + id, style length + style,
   //     then whatever the fix's own write_restart() emits
   //   count of fixes with restart_peratom set
   //   per such fix: id length + id, style length + style, maxsize_restart
   // all fwrite calls in this function are done by proc 0 only
 
   int me = comm->me;
 
   int count = 0;
   for (int i = 0; i < nfix; i++)
     if (fix[i]->restart_global) count++;
 
   if (me == 0) fwrite(&count,sizeof(int),1,fp);
 
   int n;
   for (int i = 0; i < nfix; i++)
     if (fix[i]->restart_global) {
       if (me == 0) {
         // stored length includes the terminating null char
         n = strlen(fix[i]->id) + 1;
         fwrite(&n,sizeof(int),1,fp);
         fwrite(fix[i]->id,sizeof(char),n,fp);
         n = strlen(fix[i]->style) + 1;
         fwrite(&n,sizeof(int),1,fp);
         fwrite(fix[i]->style,sizeof(char),n,fp);
       }
       // called on every proc, not just proc 0
       fix[i]->write_restart(fp);
     }
 
   count = 0;
   for (int i = 0; i < nfix; i++)
     if (fix[i]->restart_peratom) count++;
 
   if (me == 0) fwrite(&count,sizeof(int),1,fp);
 
   for (int i = 0; i < nfix; i++)
     if (fix[i]->restart_peratom) {
       // maxsize_restart() is queried on every proc, written by proc 0
       int maxsize_restart = fix[i]->maxsize_restart();
       if (me == 0) {
         n = strlen(fix[i]->id) + 1;
         fwrite(&n,sizeof(int),1,fp);
         fwrite(fix[i]->id,sizeof(char),n,fp);
         n = strlen(fix[i]->style) + 1;
         fwrite(&n,sizeof(int),1,fp);
         fwrite(fix[i]->style,sizeof(char),n,fp);
         fwrite(&maxsize_restart,sizeof(int),1,fp);
       }
     }
 }
 
 /* ----------------------------------------------------------------------
    read in restart file data on all previously defined Fixes with restart info
    (1) fixes that have global state
    (2) fixes that store per-atom quantities
    return maxsize of extra info that will be stored with any atom
 ------------------------------------------------------------------------- */
 
 int Modify::read_restart(FILE *fp)
 {
   // pattern used throughout: proc 0 reads a value from fp,
   //   then MPI_Bcast from proc 0 gives it to all procs
   // NOTE(review): fread return values are not checked, and lengths read
   //   from the file are used directly as allocation sizes
 
   // nfix_restart_global = # of restart entries with global state info
 
   int me = comm->me;
   if (me == 0) fread(&nfix_restart_global,sizeof(int),1,fp);
   MPI_Bcast(&nfix_restart_global,1,MPI_INT,0,world);
 
   // allocate space for each entry
 
   if (nfix_restart_global) {
     id_restart_global = new char*[nfix_restart_global];
     style_restart_global = new char*[nfix_restart_global];
     state_restart_global = new char*[nfix_restart_global];
   }
 
   // read each entry and Bcast to all procs
   // each entry has id string, style string, chunk of state data
   // each of the three is stored as an int length followed by that many chars
 
   int n;
   for (int i = 0; i < nfix_restart_global; i++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     id_restart_global[i] = new char[n];
     if (me == 0) fread(id_restart_global[i],sizeof(char),n,fp);
     MPI_Bcast(id_restart_global[i],n,MPI_CHAR,0,world);
 
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     style_restart_global[i] = new char[n];
     if (me == 0) fread(style_restart_global[i],sizeof(char),n,fp);
     MPI_Bcast(style_restart_global[i],n,MPI_CHAR,0,world);
 
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     state_restart_global[i] = new char[n];
     if (me == 0) fread(state_restart_global[i],sizeof(char),n,fp);
     MPI_Bcast(state_restart_global[i],n,MPI_CHAR,0,world);
   }
 
   // nfix_restart_peratom = # of restart entries with peratom info
 
   // maxsize = running sum of the per-entry sizes, returned to caller
   int maxsize = 0;
 
   if (me == 0) fread(&nfix_restart_peratom,sizeof(int),1,fp);
   MPI_Bcast(&nfix_restart_peratom,1,MPI_INT,0,world);
 
   // allocate space for each entry
 
   if (nfix_restart_peratom) {
     id_restart_peratom = new char*[nfix_restart_peratom];
     style_restart_peratom = new char*[nfix_restart_peratom];
     index_restart_peratom = new int[nfix_restart_peratom];
   }
 
   // read each entry and Bcast to all procs
   // each entry has id string, style string, maxsize of one atom's data
   // set index = which set of extra data this fix represents
 
   for (int i = 0; i < nfix_restart_peratom; i++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     id_restart_peratom[i] = new char[n];
     if (me == 0) fread(id_restart_peratom[i],sizeof(char),n,fp);
     MPI_Bcast(id_restart_peratom[i],n,MPI_CHAR,0,world);
 
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     style_restart_peratom[i] = new char[n];
     if (me == 0) fread(style_restart_peratom[i],sizeof(char),n,fp);
     MPI_Bcast(style_restart_peratom[i],n,MPI_CHAR,0,world);
 
     // third field is a single int, accumulated rather than stored per entry
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     maxsize += n;
 
     index_restart_peratom[i] = i;
   }
 
   return maxsize;
 }
 
 /* ----------------------------------------------------------------------
    delete all lists of restart file Fix info
 ------------------------------------------------------------------------- */
 
 void Modify::restart_deallocate()
 {
   // global-state entries: id, style and state strings, then the 3 tables
   // nothing is freed when the count is zero
 
   if (nfix_restart_global) {
     int k = 0;
     while (k < nfix_restart_global) {
       delete [] id_restart_global[k];
       delete [] style_restart_global[k];
       delete [] state_restart_global[k];
       k++;
     }
     delete [] id_restart_global;
     delete [] style_restart_global;
     delete [] state_restart_global;
   }
 
   // per-atom entries: id and style strings, then the 3 tables
   // index_restart_peratom holds plain ints, so only the table is freed
 
   if (nfix_restart_peratom) {
     int k = 0;
     while (k < nfix_restart_peratom) {
       delete [] id_restart_peratom[k];
       delete [] style_restart_peratom[k];
       k++;
     }
     delete [] id_restart_peratom;
     delete [] style_restart_peratom;
     delete [] index_restart_peratom;
   }
 
   // both counts reset so a later call frees nothing
 
   nfix_restart_peratom = 0;
   nfix_restart_global = 0;
 }
 
 /* ----------------------------------------------------------------------
    create list of fix indices for fixes which match mask
 ------------------------------------------------------------------------- */
 
 void Modify::list_init(int mask, int &n, int *&list)
 {
   delete [] list;
 
   n = 0;
   for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
   list = new int[n];
 
   n = 0;
   for (int i = 0; i < nfix; i++) if (fmask[i] & mask) list[n++] = i;
 }
 
 /* ----------------------------------------------------------------------
    create list of fix indices for end_of_step fixes
    also create end_of_step_every[]
 ------------------------------------------------------------------------- */
 
 void Modify::list_init_end_of_step(int mask, int &n, int *&list)
 {
   delete [] list;
   delete [] end_of_step_every;
 
   n = 0;
   for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
   list = new int[n];
   end_of_step_every = new int[n];
 
   n = 0;
   for (int i = 0; i < nfix; i++)
     if (fmask[i] & mask) {
       list[n] = i;
       end_of_step_every[n++] = fix[i]->nevery;
     }
 }
 
 /* ----------------------------------------------------------------------
    create list of fix indices for thermo energy fixes
    only added to list if fix has THERMO_ENERGY mask
    and its thermo_energy flag was set via fix_modify
 ------------------------------------------------------------------------- */
 
 void Modify::list_init_thermo_energy(int mask, int &n, int *&list)
 {
   delete [] list;
 
   n = 0;
   for (int i = 0; i < nfix; i++)
     if (fmask[i] & mask && fix[i]->thermo_energy) n++;
   list = new int[n];
 
   n = 0;
   for (int i = 0; i < nfix; i++)
     if (fmask[i] & mask && fix[i]->thermo_energy) list[n++] = i;
 }
 
 /* ----------------------------------------------------------------------
    create list of compute indices for computes which store invocation times
 ------------------------------------------------------------------------- */
 
 void Modify::list_init_compute()
 {
   delete [] list_timeflag;
 
   n_timeflag = 0;
   for (int i = 0; i < ncompute; i++)
     if (compute[i]->timeflag) n_timeflag++;
   list_timeflag = new int[n_timeflag];
 
   n_timeflag = 0;
   for (int i = 0; i < ncompute; i++)
     if (compute[i]->timeflag) list_timeflag[n_timeflag++] = i;
 }
 
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory from all fixes
 ------------------------------------------------------------------------- */
 
 bigint Modify::memory_usage()
 {
   // each memory_usage() result is cast to bigint before being added
   // fixes are summed first, then computes
 
   bigint total = 0;
   int k = 0;
   while (k < nfix) {
     total += static_cast<bigint> (fix[k]->memory_usage());
     k++;
   }
   k = 0;
   while (k < ncompute) {
     total += static_cast<bigint> (compute[k]->memory_usage());
     k++;
   }
   return total;
 }
diff --git a/src/modify.h b/src/modify.h
index 422c77d2f..ba7101a93 100644
--- a/src/modify.h
+++ b/src/modify.h
@@ -1,219 +1,219 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_MODIFY_H
 #define LMP_MODIFY_H
 
 #include "stdio.h"
 #include "pointers.h"
 #include <map>
 #include <string>
 
 namespace LAMMPS_NS {
 
 class Modify : protected Pointers {
   // holds the list of fixes and the list of computes, plus index lists
   //   of fixes to apply at different stages of the timestep
  public:
   // counts that go with the list_* arrays in the protected section
   //   (presumably one count per stage list - confirm in modify.cpp)
 
   int nfix,maxfix;
   int n_initial_integrate,n_post_integrate,n_pre_exchange,n_pre_neighbor;
   int n_pre_force,n_post_force;
   int n_final_integrate,n_end_of_step,n_thermo_energy;
   int n_initial_integrate_respa,n_post_integrate_respa;
   int n_pre_force_respa,n_post_force_respa,n_final_integrate_respa;
   int n_min_pre_exchange,n_min_pre_neighbor;
   int n_min_pre_force,n_min_post_force,n_min_energy;
 
   int restart_pbc_any;       // 1 if any fix sets restart_pbc
   int nfix_restart_global;   // stored fix global info from restart file
   int nfix_restart_peratom;  // stored fix peratom info from restart file
 
   class Fix **fix;           // list of fixes
   int *fmask;                // bit mask for when each fix is applied
 
   int ncompute,maxcompute;   // list of computes
   class Compute **compute;
 
   Modify(class LAMMPS *);
   virtual ~Modify();
 
   // per-stage entry points, all virtual so a derived class can override
 
   virtual void init();
   virtual void setup(int);
   virtual void setup_pre_exchange();
   virtual void setup_pre_neighbor();
   virtual void setup_pre_force(int);
   virtual void initial_integrate(int);
   virtual void post_integrate();
   virtual void pre_exchange();
   virtual void pre_neighbor();
   virtual void pre_force(int);
   virtual void post_force(int);
   virtual void final_integrate();
   virtual void end_of_step();
   virtual double thermo_energy();
   virtual void post_run();
 
   // rRESPA variants of the stage entry points
 
   virtual void setup_pre_force_respa(int, int);
   virtual void initial_integrate_respa(int, int, int);
   virtual void post_integrate_respa(int, int);
   virtual void pre_force_respa(int, int, int);
   virtual void post_force_respa(int, int, int);
   virtual void final_integrate_respa(int, int);
 
   // minimizer variants of the stage entry points
 
   virtual void min_pre_exchange();
   virtual void min_pre_neighbor();
   virtual void min_pre_force(int);
   virtual void min_post_force(int);
 
   virtual double min_energy(double *);
   virtual void min_store();
   virtual void min_step(double, double *);
   virtual void min_clearstore();
   virtual void min_pushstore();
   virtual void min_popstore();
   virtual double max_alpha(double *);
   virtual int min_dof();
   virtual int min_reset_ref();
 
   // add, modify, delete, and look up fixes
   // third argument of add_fix is an int trysuffix flag, default 0
 
-  void add_fix(int, char **, char *suffix = NULL);
+  void add_fix(int, char **, int trysuffix=0);
   void modify_fix(int, char **);
   void delete_fix(const char *);
   int find_fix(const char *);
 
   // add, modify, delete, and look up computes
   // third argument of add_compute is an int trysuffix flag, default 0
 
-  void add_compute(int, char **, char *suffix = NULL);
+  void add_compute(int, char **, int trysuffix=0);
   void modify_compute(int, char **);
   void delete_compute(const char *);
   int find_compute(const char *);
 
   void clearstep_compute();
   void addstep_compute(bigint);
   void addstep_compute_all(bigint);
 
   void write_restart(FILE *);
   int read_restart(FILE *);
   void restart_deallocate();
 
   // sum of memory_usage() over all fixes and all computes
 
   bigint memory_usage();
 
  protected:
 
   // lists of fixes to apply at different stages of timestep
 
   int *list_initial_integrate,*list_post_integrate;
   int *list_pre_exchange,*list_pre_neighbor;
   int *list_pre_force,*list_post_force;
   int *list_final_integrate,*list_end_of_step,*list_thermo_energy;
   int *list_initial_integrate_respa,*list_post_integrate_respa;
   int *list_pre_force_respa,*list_post_force_respa;
   int *list_final_integrate_respa;
   int *list_min_pre_exchange,*list_min_pre_neighbor;
   int *list_min_pre_force,*list_min_post_force;
   int *list_min_energy;
 
   // nevery value of each fix stored in the end_of_step list
 
   int *end_of_step_every;
 
   int n_timeflag;            // list of computes that store time invocation
   int *list_timeflag;
 
   char **id_restart_global;           // stored fix global info
   char **style_restart_global;        // from read-in restart file
   char **state_restart_global;
 
   char **id_restart_peratom;          // stored fix peratom info
   char **style_restart_peratom;       // from read-in restart file
   int *index_restart_peratom;
 
   int index_permanent;        // fix/compute index returned to library call
 
   // helpers that (re)build an index list and its count
   // args = mask, count (output), list (output)
 
   void list_init(int, int &, int *&);
   void list_init_end_of_step(int, int &, int *&);
   void list_init_thermo_energy(int, int &, int *&);
   void list_init_compute();
 
  private:
   // maps from a string key to a function that creates a Compute or Fix
   //   (presumably keyed by style name - confirm in modify.cpp)
 
   typedef Compute *(*ComputeCreator)(LAMMPS *, int, char **);
   std::map<std::string,ComputeCreator> *compute_map;
 
   typedef Fix *(*FixCreator)(LAMMPS *, int, char **);
   std::map<std::string,FixCreator> *fix_map;
 
   template <typename T> static Compute *compute_creator(LAMMPS *, int, char **);
   template <typename T> static Fix *fix_creator(LAMMPS *, int, char **);
 };
 
 }
 
 #endif
 
 /* ERROR/WARNING messages:
 
 W: One or more atoms are time integrated more than once
 
 This is probably an error since you typically do not want to
 advance the positions or velocities of an atom more than once
 per timestep.
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Fix command before simulation box is defined
 
 The fix command cannot be used before a read_data, read_restart, or
 create_box command.
 
 E: Could not find fix group ID
 
 A group ID used in the fix command does not exist.
 
 E: Replacing a fix, but new style != old style
 
 A fix ID can be used a 2nd time, but only if the style matches the
 previous fix.  In this case it is assumed you wish to reset a fix's
 parameters.  This error may mean you are mistakenly re-using a fix ID
 when you do not intend to.
 
 W: Replacing a fix, but new group != old group
 
 The ID and style of a fix match for a fix you are changing with a fix
 command, but the new group you are specifying does not match the old
 group.
 
 E: Invalid fix style
 
 The choice of fix style is unknown.
 
 E: Could not find fix_modify ID
 
 A fix ID used in the fix_modify command does not exist.
 
 E: Could not find fix ID to delete
 
 Self-explanatory.
 
 E: Reuse of compute ID
 
 A compute ID cannot be used twice.
 
 E: Invalid compute style
 
 Self-explanatory.
 
 E: Could not find compute_modify ID
 
 Self-explanatory.
 
 E: Could not find compute ID to delete
 
 Self-explanatory.
 
 */
diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index dc925a6f9..0de4c0cb5 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -1,295 +1,296 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "neigh_list.h"
 #include "atom.h"
 #include "comm.h"
 #include "update.h"
 #include "neighbor.h"
 #include "neigh_request.h"
 #include "my_page.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define PGDELTA 1
 
 enum{NSQ,BIN,MULTI};     // also in neighbor.cpp
 
 /* ---------------------------------------------------------------------- */
 
 NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
 {
   // no per-atom storage has been allocated yet
 
   maxatoms = 0;
 
   inum = 0;
   gnum = 0;
   ilist = NULL;
   numneigh = NULL;
   firstneigh = NULL;
   firstdouble = NULL;
 
   // no auxiliary double values per neighbor by default
 
   dnum = 0;
 
   last_build = -1;
 
   // no skip info
 
   iskip = NULL;
   ijskip = NULL;
 
   // no granular history partner
 
   listgranhistory = NULL;
   fix_history = NULL;
 
   // no links to other lists
 
   respamiddle = 0;
   listinner = NULL;
   listmiddle = NULL;
   listfull = NULL;
   listcopy = NULL;
   listskip = NULL;
 
   // stencils start empty, both the single and the per-type variants
 
   maxstencil = 0;
   stencil = NULL;
   stencilxyz = NULL;
 
   maxstencil_multi = 0;
   nstencil_multi = NULL;
   stencil_multi = NULL;
   distsq_multi = NULL;
 
   // pages are created later by setup_pages()
 
   ipage = NULL;
   dpage = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 NeighList::~NeighList()
 {
   // per-atom arrays and pages are released only when listcopy is not set
 
   if (!listcopy) {
     memory->destroy(ilist);
     memory->destroy(numneigh);
     memory->sfree(firstneigh);
     memory->sfree(firstdouble);
 
     delete [] ipage;
     if (dnum) {
       delete [] dpage;
     }
   }
 
   // skip info
 
   delete [] iskip;
   memory->destroy(ijskip);
 
   // single stencil and its xyz companion
 
   if (maxstencil) {
     memory->destroy(stencil);
   }
   if (ghostflag) {
     memory->destroy(stencilxyz);
   }
 
   // per-type stencils exist only if maxstencil_multi is nonzero
 
   if (!maxstencil_multi) return;
 
   const int ntypes = atom->ntypes;
   for (int itype = 1; itype <= ntypes; itype++) {
     memory->destroy(stencil_multi[itype]);
     memory->destroy(distsq_multi[itype]);
   }
   delete [] nstencil_multi;
   delete [] stencil_multi;
   delete [] distsq_multi;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void NeighList::setup_pages(int pgsize_caller, int oneatom_caller, 
                             int dnum_caller)
 {
   pgsize = pgsize_caller;
   oneatom = oneatom_caller;
   dnum = dnum_caller;
 
   int nmypage = comm->nthreads;
   ipage = new MyPage<int>[nmypage];
   for (int i = 0; i < nmypage; i++)
     ipage[i].init(oneatom,pgsize,PGDELTA);
 
   if (dnum) {
     dpage = new MyPage<double>[nmypage];
     for (int i = 0; i < nmypage; i++)
       dpage[i].init(dnum*oneatom,dnum*pgsize,PGDELTA);
   }
   else dpage = NULL;
 }
 
 /* ----------------------------------------------------------------------
    grow atom arrays to allow for nmax atoms
    triggered by more atoms on a processor
    caller knows if this list stores neighs of local atoms or local+ghost
 ------------------------------------------------------------------------- */
 
 void NeighList::grow(int nmax)
 {
   // nothing to do when current capacity already covers nmax atoms
 
   if (maxatoms >= nmax) return;
   maxatoms = nmax;
 
   // free the old per-atom arrays; contents are not preserved
 
   memory->destroy(ilist);
   memory->destroy(numneigh);
   memory->sfree(firstneigh);
   memory->sfree(firstdouble);
 
   // reallocate at the new size
 
   memory->create(ilist,maxatoms,"neighlist:ilist");
   memory->create(numneigh,maxatoms,"neighlist:numneigh");
   firstneigh = (int **)
     memory->smalloc(maxatoms*sizeof(int *),"neighlist:firstneigh");
 
   // pointer array for doubles is only needed when dnum is nonzero
 
   if (!dnum) return;
 
   firstdouble = (double **)
     memory->smalloc(maxatoms*sizeof(double *),"neighlist:firstdouble");
 }
 
 /* ----------------------------------------------------------------------
    ensure stencils are large enough for smax bins
    style = BIN or MULTI
 ------------------------------------------------------------------------- */
 
 void NeighList::stencil_allocate(int smax, int style)
 {
   // BIN style: a single stencil, regrown only when smax exceeds it
 
   if (style == BIN) {
     if (smax <= maxstencil) return;
 
     maxstencil = smax;
     memory->destroy(stencil);
     memory->create(stencil,maxstencil,"neighlist:stencil");
     if (ghostflag) {
       memory->destroy(stencilxyz);
       memory->create(stencilxyz,maxstencil,3,"neighlist:stencilxyz");
     }
     return;
   }
 
   // any other style: one stencil per atom type, indexed 1 to ntypes
 
   const int ntypes = atom->ntypes;
 
   // while maxstencil_multi is still 0, create the per-type arrays
   //   and clear every slot
 
   if (maxstencil_multi == 0) {
     nstencil_multi = new int[ntypes+1];
     stencil_multi = new int*[ntypes+1];
     distsq_multi = new double*[ntypes+1];
     for (int itype = 1; itype <= ntypes; itype++) {
       nstencil_multi[itype] = 0;
       stencil_multi[itype] = NULL;
       distsq_multi[itype] = NULL;
     }
   }
 
   // regrow every per-type stencil only when smax exceeds current size
 
   if (smax <= maxstencil_multi) return;
 
   maxstencil_multi = smax;
   for (int itype = 1; itype <= ntypes; itype++) {
     memory->destroy(stencil_multi[itype]);
     memory->destroy(distsq_multi[itype]);
     memory->create(stencil_multi[itype],maxstencil_multi,
                    "neighlist:stencil_multi");
     memory->create(distsq_multi[itype],maxstencil_multi,
                    "neighlist:distsq_multi");
   }
 }
 
 /* ----------------------------------------------------------------------
    copy skip info from request rq into list's iskip,ijskip
 ------------------------------------------------------------------------- */
 
 void NeighList::copy_skip_info(int *rq_iskip, int **rq_ijskip)
 {
   const int ntypes = atom->ntypes;
 
   // arrays are sized ntypes+1 because type indices run from 1 to ntypes
 
   iskip = new int[ntypes+1];
   memory->create(ijskip,ntypes+1,ntypes+1,"neigh_list:ijskip");
 
   // copy the per-type flag and its row of per-pair flags together
 
   for (int itype = 1; itype <= ntypes; itype++) {
     iskip[itype] = rq_iskip[itype];
     for (int jtype = 1; jtype <= ntypes; jtype++) {
       ijskip[itype][jtype] = rq_ijskip[itype][jtype];
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    print attributes of this list and associated request
 ------------------------------------------------------------------------- */
 
 void NeighList::print_attributes()
 {
   // only the process with comm->me == 0 prints
 
   if (comm->me != 0) return;
 
   // request that has the same index as this list
 
   NeighRequest *rq = neighbor->requests[index];
 
   // flags stored in the list itself
 
   printf("Neighbor list/request %d:\n",index);
   printf("  %d = build flag\n",buildflag);
   printf("  %d = grow flag\n",growflag);
   printf("  %d = stencil flag\n",stencilflag);
   printf("  %d = ghost flag\n",ghostflag);
   printf("\n");
 
   // which kind of class made the request
 
   printf("  %d = pair\n",rq->pair);
   printf("  %d = fix\n",rq->fix);
   printf("  %d = compute\n",rq->compute);
   printf("  %d = command\n",rq->command);
   printf("\n");
 
   // kind of list requested
 
   printf("  %d = half\n",rq->half);
   printf("  %d = full\n",rq->full);
   printf("  %d = gran\n",rq->gran);
   printf("  %d = granhistory\n",rq->granhistory);
   printf("  %d = respainner\n",rq->respainner);
   printf("  %d = respamiddle\n",rq->respamiddle);
   printf("  %d = respaouter\n",rq->respaouter);
   printf("  %d = half_from_full\n",rq->half_from_full);
   printf("\n");
 
   // remaining request options, each printed exactly once
 
   printf("  %d = occasional\n",rq->occasional);
   printf("  %d = dnum\n",rq->dnum);
   printf("  %d = omp\n",rq->omp);
+  printf("  %d = intel\n",rq->intel);
   printf("  %d = ghost\n",rq->ghost);
   printf("  %d = cudable\n",rq->cudable);
   printf("  %d = copy\n",rq->copy);
   printf("  %d = skip\n",rq->skip);
   printf("  %d = otherlist\n",rq->otherlist);
 
   // %p requires a void pointer argument
 
   printf("  %p = listskip\n",(void *) listskip);
   printf("\n");
 }
 
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory
    if growflag = 0, maxatoms & maxpage will also be 0
    if stencilflag = 0, maxstencil * maxstencil_multi will also be 0
 ------------------------------------------------------------------------- */
 
 bigint NeighList::memory_usage()
 {
   bigint bytes = 0;
 
   // per-atom arrays: ilist, numneigh, and the firstneigh pointer array
 
   bytes += memory->usage(ilist,maxatoms);
   bytes += memory->usage(numneigh,maxatoms);
   bytes += maxatoms * sizeof(int *);
 
   // pages are allocated one per thread, see setup_pages()
 
   int nmypage = comm->nthreads;
 
   if (ipage) {
     for (int i = 0; i < nmypage; i++)
       bytes += ipage[i].size();
   }
 
   // firstdouble is a single maxatoms-long pointer array for the whole
   //   list, allocated once in grow(), so it is counted once here
   //   and not once per thread; only the double pages are per-thread
 
   if (dnum && dpage) {
     bytes += maxatoms * sizeof(double *);
     for (int i = 0; i < nmypage; i++)
       bytes += dpage[i].size();
   }
 
   // single stencil and its xyz companion
 
   if (maxstencil) bytes += memory->usage(stencil,maxstencil);
   if (ghostflag) bytes += memory->usage(stencilxyz,maxstencil,3);
 
   // per-type stencils
 
   if (maxstencil_multi) {
     bytes += memory->usage(stencil_multi,atom->ntypes,maxstencil_multi);
     bytes += memory->usage(distsq_multi,atom->ntypes,maxstencil_multi);
   }
 
   return bytes;
 }
diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp
index 643d11b3b..95ddc0151 100644
--- a/src/neigh_request.cpp
+++ b/src/neigh_request.cpp
@@ -1,208 +1,212 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "neigh_request.h"
 #include "atom.h"
 #include "memory.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 NeighRequest::NeighRequest(LAMMPS *lmp) : Pointers(lmp)
 {
   // initialize every request setting to its default value
   // requestor and the *_original members are not set here
 
   // default ID = 0
 
   id = 0;
 
   // request starts out flagged as unprocessed
 
   unprocessed = 1;
 
   // default is pair request
 
   pair = 1;
   fix = compute = command = 0;
 
   // default is half neighbor list
 
   half = 1;
   full = 0;
   full_cluster = 0;
   gran = granhistory = 0;
   respainner = respamiddle = respaouter = 0;
   half_from_full = 0;
 
   // default is every reneighboring
   // default is use newton_pair setting in force
   // default is encode special bond flags
   // default is no auxiliary floating point values
   // default is no neighbors of ghosts
   // default is no CUDA neighbor list build
   // default is no multi-threaded neighbor list build
   // default is no Kokkos neighbor list build
 
   occasional = 0;
   newton = 0;
   special = 1;
   dnum = 0;
   ghost = 0;
   cudable = 0;
   omp = 0;
+  intel = 0;
   kokkos_host = kokkos_device = 0;
 
   // default is no copy or skip
   // otherlist = -1 presumably means no other list is referenced
 
   copy = 0;
   skip = 0;
   iskip = NULL;
   ijskip = NULL;
   otherlist = -1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 NeighRequest::~NeighRequest()
 {
   // release the skip arrays; the two frees are independent of each other
 
   memory->destroy(ijskip);
   delete [] iskip;
 }
 
 /* ----------------------------------------------------------------------
    archive request params that Neighbor may change after call to identical()
 ------------------------------------------------------------------------- */
 
 void NeighRequest::archive()
 {
   // snapshot the copy-related settings
 
   otherlist_original = otherlist;
   copy_original = copy;
 
   // snapshot the half-list settings
 
   half_from_full_original = half_from_full;
   half_original = half;
 }
 
 /* ----------------------------------------------------------------------
    compare this request to other request
    identical means all params set by requester are the same
    compare to original values in other if Neighbor may have changed them
    return 1 if identical, 0 if not
 ------------------------------------------------------------------------- */
 
 int NeighRequest::identical(NeighRequest *other)
 {
   // every check below can only clear the flag, never set it again
 
   int same = 1;
 
   // set same = 0 if old list was never processed
 
   if (other->unprocessed) same = 0;
 
   // requesting class and request ID must match
 
   if (requestor != other->requestor) same = 0;
   if (id != other->id) same = 0;
 
   // kind of requesting class must match
 
   if (pair != other->pair) same = 0;
   if (fix != other->fix) same = 0;
   if (compute != other->compute) same = 0;
   if (command != other->command) same = 0;
 
   // kind of list must match
   // half and half_from_full are compared to other's archived originals
 
   if (half != other->half_original) same = 0;
   if (full != other->full) same = 0;
   if (gran != other->gran) same = 0;
   if (granhistory != other->granhistory) same = 0;
   if (respainner != other->respainner) same = 0;
   if (respamiddle != other->respamiddle) same = 0;
   if (respaouter != other->respaouter) same = 0;
   if (half_from_full != other->half_from_full_original) same = 0;
 
   // remaining options must match
   // NOTE(review): full_cluster, kokkos_host, kokkos_device are not
   //   compared in this function - confirm that is intended
 
   if (newton != other->newton) same = 0;
   if (occasional != other->occasional) same = 0;
   if (special != other->special) same = 0;
   if (dnum != other->dnum) same = 0;
   if (ghost != other->ghost) same = 0;
   if (cudable != other->cudable) same = 0;
   if (omp != other->omp) same = 0;
+  if (intel != other->intel) same = 0;
 
   // copy and skip settings must match
   // copy and otherlist are compared to other's archived originals
 
   if (copy != other->copy_original) same = 0;
   if (same_skip(other) == 0) same = 0;
   if (otherlist != other->otherlist_original) same = 0;
 
   return same;
 }
 
 /* ----------------------------------------------------------------------
    compare kind of this request to other request
    return 1 if same, 0 if different
 ------------------------------------------------------------------------- */
 
 int NeighRequest::same_kind(NeighRequest *other)
 {
   // every check below can only clear the flag, never set it again
 
   int same = 1;
 
   // kind-of-list flags, compared to other's current values
   //   (not the archived originals that identical() uses)
 
   if (half != other->half) same = 0;
   if (full != other->full) same = 0;
   if (gran != other->gran) same = 0;
   if (granhistory != other->granhistory) same = 0;
   if (respainner != other->respainner) same = 0;
   if (respamiddle != other->respamiddle) same = 0;
   if (respaouter != other->respaouter) same = 0;
   if (half_from_full != other->half_from_full) same = 0;
 
   // options that also have to agree
 
   if (newton != other->newton) same = 0;
   if (ghost != other->ghost) same = 0;
   if (cudable != other->cudable) same = 0;
   if (omp != other->omp) same = 0;
+  if (intel != other->intel) same = 0;
 
   return same;
 }
 
 /* ----------------------------------------------------------------------
    compare skip attributes of this request to other request
    return 1 if same, 0 if different
 ------------------------------------------------------------------------- */
 
 int NeighRequest::same_skip(NeighRequest *other)
 {
   // differing skip flags already make the two requests different
 
   int match = (skip == other->skip) ? 1 : 0;
 
   // when both requests skip, every per-type and per-pair entry must agree
 
   if (skip && other->skip) {
     const int ntypes = atom->ntypes;
     for (int itype = 1; itype <= ntypes; itype++) {
       if (iskip[itype] != other->iskip[itype]) match = 0;
     }
     for (int itype = 1; itype <= ntypes; itype++) {
       for (int jtype = 1; jtype <= ntypes; jtype++) {
         if (ijskip[itype][jtype] != other->ijskip[itype][jtype]) match = 0;
       }
     }
   }
 
   return match;
 }
 
 /* ----------------------------------------------------------------------
    set kind and other values of this request to that of other request
 ------------------------------------------------------------------------- */
 
 void NeighRequest::copy_request(NeighRequest *other)
 {
   // half is the only kind flag that is cleared first
   //   so it ends up equal to whether other->half is set
 
   half = 0;
 
   // remaining kind flags are only ever turned on here, never cleared,
   //   so a flag already set in this request stays set
 
   if (other->half) half = 1;
   if (other->full) full = 1;
   if (other->gran) gran = 1;
   if (other->granhistory) granhistory = 1;
   if (other->respainner) respainner = 1;
   if (other->respamiddle) respamiddle = 1;
   if (other->respaouter) respaouter = 1;
   if (other->half_from_full) half_from_full = 1;
 
   // these options are overwritten with other's values
 
   newton = other->newton;
   dnum = other->dnum;
   ghost = other->ghost;
   cudable = other->cudable;
   omp = other->omp;
+  intel = other->intel;
 }
diff --git a/src/neigh_request.h b/src/neigh_request.h
index 769d5354b..41fa951fe 100644
--- a/src/neigh_request.h
+++ b/src/neigh_request.h
@@ -1,123 +1,124 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_NEIGH_REQUEST_H
 #define LMP_NEIGH_REQUEST_H
 
 #include "pointers.h"
 
 namespace LAMMPS_NS {
 
 class NeighRequest : protected Pointers {
   // settings describing one requested neighbor list
   // the constructor fills in defaults, the requesting class changes them
  public:
   void *requestor;       // class that made request
   int id;                // ID of request
                          // used to track multiple requests from one class
   int unprocessed;       // 1 when first requested
                          // 0 after processed by Neighbor class
 
   // which class is requesting the list, one flag is 1, others are 0
 
   int pair;              // set by default
   int fix;
   int compute;
   int command;
 
   // kind of list requested, one flag is 1, others are 0
   // set by requesting class
 
   int half;              // 1 if half neigh list (set by default)
   int full;              // 1 if full neigh list
   int full_cluster;      // only used by Kokkos pair styles
 
   int gran;              // 1 if granular list 
   int granhistory;       // 1 if granular history list
 
   int respainner;        // 1 if a rRESPA inner list
   int respamiddle;       // 1 if a rRESPA middle list
   int respaouter;        // 1 if a rRESPA outer list
 
   int half_from_full;    // 1 if half list computed from previous full list
 
   // 0 if needed every reneighboring during run
   // 1 if occasionally needed by a fix, compute, etc
   // set by requesting class
 
   int occasional;
 
   // 0 if use force::newton_pair setting
   // 1 if override with pair newton on
   // 2 if override with pair newton off
 
   int newton;
 
   // 0 if user of list wants no encoding of special bond flags and all neighs
   // 1 if user of list wants special bond flags encoded, set by default
 
   int special;
 
   // number of auxiliary floating point values to store, 0 if none
   // set by requesting class
 
   int dnum;
 
   // 1 if also need neighbors of ghosts
 
   int ghost;
 
   // 1 if neighbor list build will be done on GPU
 
   int cudable;
 
-  // 1 if using multi-threaded neighbor list build
+  // 1 if using multi-threaded neighbor list build for USER-OMP or USER-INTEL
 
   int omp;
+  int intel;
 
   // 1 if using Kokkos neighbor build
 
   int kokkos_host;
   int kokkos_device;
  
   // set by neighbor and pair_hybrid after all requests are made
   // these settings do not change kind value
 
   int copy;              // 1 if this list copied from another list
 
   int skip;              // 1 if this list skips atom types from another list
   int *iskip;            // iskip[i] if atoms of type I are not in list
   int **ijskip;          // ijskip[i][j] if pairs of type I,J are not in list
 
   int otherlist;         // index of other list to copy or skip from
 
   // original params by requester
   // stored to compare against in identical() in case Neighbor changes them
 
   int half_original;
   int half_from_full_original;
   int copy_original;
   int otherlist_original;
 
   // methods
   // archive() = save half, half_from_full, copy, otherlist as *_original
   // identical() = 1 if all requester-set params match other, else 0
   // same_kind() = 1 if kind of list matches other, else 0
   // same_skip() = 1 if skip attributes match other, else 0
   // copy_request() = set kind and other values from another request
 
   NeighRequest(class LAMMPS *);
   ~NeighRequest();
   void archive();
   int identical(NeighRequest *);
   int same_kind(NeighRequest *);
   int same_skip(NeighRequest *);
   void copy_request(NeighRequest *);
 };
 
 }
 
 #endif
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index 705887ba0..28c051313 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -1,2045 +1,2057 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author (triclinic and multi-neigh) : Pieter in 't Veld (SNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "mpi.h"
 #include "math.h"
 #include "stdlib.h"
 #include "string.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
 #include "pair.h"
 #include "domain.h"
 #include "group.h"
 #include "modify.h"
 #include "fix.h"
 #include "compute.h"
 #include "update.h"
 #include "respa.h"
 #include "output.h"
 #include "citeme.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define RQDELTA 1
 #define EXDELTA 1
 
 #define LB_FACTOR 1.5
 #define SMALL 1.0e-6
 #define BIG 1.0e20
 #define CUT2BIN_RATIO 100
 
 enum{NSQ,BIN,MULTI};     // also in neigh_list.cpp
 
 static const char cite_neigh_multi[] =
   "neighbor multi command:\n\n"
   "@Article{Intveld08,\n"
   " author =  {P.{\\,}J.~in{\\,}'t~Veld and S.{\\,}J.~Plimpton"
   " and G.{\\,}S.~Grest},\n"
   " title =   {Accurate and Efficient Methods for Modeling Colloidal\n"
   "            Mixtures in an Explicit Solvent using Molecular Dynamics},\n"
   " journal = {Comp.~Phys.~Comm.},\n"
   " year =    2008,\n"
   " volume =  179,\n"
   " pages =   {320--329}\n"
   "}\n\n";
 
 //#define NEIGH_LIST_DEBUG 1
 
 /* ---------------------------------------------------------------------- */
 
 Neighbor::Neighbor(LAMMPS *lmp) : Pointers(lmp)
 {
   // this process's rank and the communicator size
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
   // default neighbor settings
 
   style = BIN;
   every = 1;
   delay = 10;
   dist_check = 1;
   pgsize = 100000;
   oneatom = 2000;
   binsizeflag = 0;
   build_once = 0;
   cluster_check = 0;
   binatomflag = 1;
 
   // cutoff tables and fix check list are allocated later
 
   cutneighsq = NULL;
   cutneighghostsq = NULL;
   cuttype = NULL;
   cuttypesq = NULL;
   fixchecklist = NULL;
 
   // coords at last neighboring
 
   maxhold = 0;
   xhold = NULL;
 
   // binning
 
   maxhead = 0;
   binhead = NULL;
   maxbin = 0;
   bins = NULL;
 
   // pair exclusion list info, by type, by group, and by molecule
 
   includegroup = 0;
 
   nex_type = 0;
   maxex_type = 0;
   ex1_type = NULL;
   ex2_type = NULL;
   ex_type = NULL;
 
   nex_group = 0;
   maxex_group = 0;
   ex1_group = NULL;
   ex2_group = NULL;
   ex1_bit = NULL;
   ex2_bit = NULL;
 
   nex_mol = 0;
   maxex_mol = 0;
   ex_mol_group = NULL;
   ex_mol_bit = NULL;
 
   // pair lists
 
   maxatom = 0;
   nblist = 0;
   nglist = 0;
   nslist = 0;
 
   nlist = 0;
   lists = NULL;
   pair_build = NULL;
   stencil_create = NULL;
   blist = NULL;
   glist = NULL;
   slist = NULL;
   anyghostlist = 0;
 
   // current requests
 
   nrequest = 0;
   maxrequest = 0;
   requests = NULL;
 
   // old_* values start out equal to the defaults set above
 
   old_style = BIN;
   old_triclinic = 0;
   old_pgsize = pgsize;
   old_oneatom = oneatom;
   old_nrequest = 0;
   old_requests = NULL;
 
   // bond lists
 
   maxbond = 0;
   bondlist = NULL;
   maxangle = 0;
   anglelist = NULL;
   maxdihedral = 0;
   dihedrallist = NULL;
   maximproper = 0;
   improperlist = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 Neighbor::~Neighbor()
 {
   // cutoff tables and fix check list
 
   memory->destroy(cutneighsq);
   memory->destroy(cutneighghostsq);
   delete [] cuttype;
   delete [] cuttypesq;
   delete [] fixchecklist;
 
   // coords at last neighboring
 
   memory->destroy(xhold);
 
   // binning
 
   memory->destroy(binhead);
   memory->destroy(bins);
 
   // pair exclusion info, by type, by group, and by molecule
 
   memory->destroy(ex1_type);
   memory->destroy(ex2_type);
   memory->destroy(ex_type);
 
   memory->destroy(ex1_group);
   memory->destroy(ex2_group);
   delete [] ex1_bit;
   delete [] ex2_bit;
 
   memory->destroy(ex_mol_group);
   delete [] ex_mol_bit;
 
   // each neighbor list is deleted, then the arrays that index them
 
   for (int ilist = 0; ilist < nlist; ilist++) {
     delete lists[ilist];
   }
   delete [] lists;
   delete [] pair_build;
   delete [] stencil_create;
   delete [] blist;
   delete [] glist;
   delete [] slist;
 
   // current and old requests
 
   for (int irequest = 0; irequest < nrequest; irequest++) {
     delete requests[irequest];
   }
   memory->sfree(requests);
 
   for (int irequest = 0; irequest < old_nrequest; irequest++) {
     delete old_requests[irequest];
   }
   memory->sfree(old_requests);
 
   // bond lists
 
   memory->destroy(bondlist);
   memory->destroy(anglelist);
   memory->destroy(dihedrallist);
   memory->destroy(improperlist);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Neighbor::init()
 {
   // set up all neighbor-list state from the current settings and requests:
   //   error checks, per-type cutoffs, reneighbor-check flags,
   //   special-neighbor flags, rRESPA cutoffs,
   //   xhold/bin/exclusion storage, pairwise lists, topology lists
   // pairwise lists are only re-created if settings or requests changed
   //   since the previous call
   // on return, the current requests have been moved to old_requests
   //   and the request list is empty
 
   int i,j,m,n;
 
   ncalls = ndanger = 0;
   dimension = domain->dimension;
   triclinic = domain->triclinic;
   newton_pair = force->newton_pair;
 
   // error check
 
   if (delay > 0 && (delay % every) != 0)
     error->all(FLERR,"Neighbor delay must be 0 or multiple of every setting");
 
   if (pgsize < 10*oneatom)
     error->all(FLERR,"Neighbor page size must be >= 10x the one atom setting");
 
   // ------------------------------------------------------------------
   // settings
 
   // bbox lo/hi = bounding box of entire domain, stored by Domain
 
   if (triclinic == 0) {
     bboxlo = domain->boxlo;
     bboxhi = domain->boxhi;
   } else {
     bboxlo = domain->boxlo_bound;
     bboxhi = domain->boxhi_bound;
   }
 
   // set neighbor cutoffs (force cutoff + skin)
   // trigger determines when atoms migrate and neighbor lists are rebuilt
   //   needs to be non-zero for migration distance check
   //   even if pair = NULL and no neighbor lists are used
   // cutneigh = force cutoff + skin if cutforce > 0, else cutneigh = 0
   // cutneighghost = pair cutghost if it requests it, else same as cutneigh
 
   triggersq = 0.25*skin*skin;
   boxcheck = 0;
   if (domain->box_change && (domain->xperiodic || domain->yperiodic ||
                              (dimension == 3 && domain->zperiodic)))
       boxcheck = 1;
 
   // cutoff arrays are allocated only once, on the first call
 
   n = atom->ntypes;
   if (cutneighsq == NULL) {
     if (lmp->kokkos) init_cutneighsq_kokkos(n);
     else memory->create(cutneighsq,n+1,n+1,"neigh:cutneighsq");
     memory->create(cutneighghostsq,n+1,n+1,"neigh:cutneighghostsq");
     cuttype = new double[n+1];
     cuttypesq = new double[n+1];
   }
 
   double cutoff,delta,cut;
   cutneighmin = BIG;
   cutneighmax = 0.0;
 
   for (i = 1; i <= n; i++) {
     cuttype[i] = cuttypesq[i] = 0.0;
     for (j = 1; j <= n; j++) {
       if (force->pair) cutoff = sqrt(force->pair->cutsq[i][j]);
       else cutoff = 0.0;
       if (cutoff > 0.0) delta = skin;
       else delta = 0.0;
       cut = cutoff + delta;
 
       cutneighsq[i][j] = cut*cut;
       cuttype[i] = MAX(cuttype[i],cut);
       cuttypesq[i] = MAX(cuttypesq[i],cut*cut);
       cutneighmin = MIN(cutneighmin,cut);
       cutneighmax = MAX(cutneighmax,cut);
 
       if (force->pair && force->pair->ghostneigh) {
         cut = force->pair->cutghost[i][j] + skin;
         cutneighghostsq[i][j] = cut*cut;
       } else cutneighghostsq[i][j] = cut*cut;
     }
   }
   cutneighmaxsq = cutneighmax * cutneighmax;
 
   // check other classes that can induce reneighboring in decide()
   // don't check if build_once is set
 
   restart_check = 0;
   if (output->restart_flag) restart_check = 1;
 
   delete [] fixchecklist;
   fixchecklist = NULL;
   fixchecklist = new int[modify->nfix];
 
   // fixchecklist = indices of fixes with force_reneighbor set
 
   fix_check = 0;
   for (i = 0; i < modify->nfix; i++)
     if (modify->fix[i]->force_reneighbor)
       fixchecklist[fix_check++] = i;
 
   must_check = 0;
   if (restart_check || fix_check) must_check = 1;
   if (build_once) must_check = 0;
 
   // set special_flag for 1-2, 1-3, 1-4 neighbors
   // flag[0] is not used, flag[1] = 1-2, flag[2] = 1-3, flag[3] = 1-4
   // flag = 0 if both LJ/Coulomb special values are 0.0
   // flag = 1 if both LJ/Coulomb special values are 1.0
   // flag = 2 otherwise or if KSpace solver is enabled
   // pairwise portion of KSpace solver uses all 1-2,1-3,1-4 neighbors
 
   if (force->special_lj[1] == 0.0 && force->special_coul[1] == 0.0)
     special_flag[1] = 0;
   else if (force->special_lj[1] == 1.0 && force->special_coul[1] == 1.0)
     special_flag[1] = 1;
   else special_flag[1] = 2;
 
   if (force->special_lj[2] == 0.0 && force->special_coul[2] == 0.0)
     special_flag[2] = 0;
   else if (force->special_lj[2] == 1.0 && force->special_coul[2] == 1.0)
     special_flag[2] = 1;
   else special_flag[2] = 2;
 
   if (force->special_lj[3] == 0.0 && force->special_coul[3] == 0.0)
     special_flag[3] = 0;
   else if (force->special_lj[3] == 1.0 && force->special_coul[3] == 1.0)
     special_flag[3] = 1;
   else special_flag[3] = 2;
 
   // coul/wolf and coul/dsf pair styles are treated the same as KSpace here
 
   if (force->kspace || force->pair_match("coul/wolf",0)
       || force->pair_match("coul/dsf",0))
      special_flag[1] = special_flag[2] = special_flag[3] = 2;
 
   // maxwt = max multiplicative factor on atom indices stored in neigh list
 
   maxwt = 0;
   if (special_flag[1] == 2) maxwt = 2;
   if (special_flag[2] == 2) maxwt = 3;
   if (special_flag[3] == 2) maxwt = 4;
 
   // rRESPA cutoffs
   // respa = 1 if an inner level is defined, 2 if a middle level is defined
 
   int respa = 0;
   if (update->whichflag == 1 && strstr(update->integrate_style,"respa")) {
     if (((Respa *) update->integrate)->level_inner >= 0) respa = 1;
     if (((Respa *) update->integrate)->level_middle >= 0) respa = 2;
   }
 
   if (respa) {
     double *cut_respa = ((Respa *) update->integrate)->cutoff;
     cut_inner_sq = (cut_respa[1] + skin) * (cut_respa[1] + skin);
     cut_middle_sq = (cut_respa[3] + skin) * (cut_respa[3] + skin);
     cut_middle_inside_sq = (cut_respa[0] - skin) * (cut_respa[0] - skin);
     if (cut_respa[0]-skin < 0) cut_middle_inside_sq = 0.0;
   }
 
   // ------------------------------------------------------------------
   // xhold, bins, exclusion lists
 
   // free xhold and bins if not needed for this run
 
   if (dist_check == 0) {
     memory->destroy(xhold);
     maxhold = 0;
     xhold = NULL;
   }
 
   if (style == NSQ) {
     memory->destroy(bins);
     memory->destroy(binhead);
     maxbin = maxhead = 0;
     binhead = NULL;
     bins = NULL;
   }
 
   // 1st time allocation of xhold and bins
 
   if (dist_check) {
     if (maxhold == 0) {
       maxhold = atom->nmax;
       memory->create(xhold,maxhold,3,"neigh:xhold");
     }
   }
 
   if (style != NSQ) {
     if (maxbin == 0) {
       maxbin = atom->nmax;
       memory->create(bins,maxbin,"bins");
     }
   }
 
   // exclusion lists for type, group, molecule settings from neigh_modify
   // warn if exclusions used with KSpace solver
 
   n = atom->ntypes;
 
   if (nex_type == 0 && nex_group == 0 && nex_mol == 0) exclude = 0;
   else exclude = 1;
 
   // ex_type[i][j] = 1 if type pair i,j is excluded, set symmetrically
 
   if (nex_type) {
     memory->destroy(ex_type);
     memory->create(ex_type,n+1,n+1,"neigh:ex_type");
 
     for (i = 1; i <= n; i++)
       for (j = 1; j <= n; j++)
         ex_type[i][j] = 0;
 
     for (i = 0; i < nex_type; i++) {
       if (ex1_type[i] <= 0 || ex1_type[i] > n ||
           ex2_type[i] <= 0 || ex2_type[i] > n)
         error->all(FLERR,"Invalid atom type in neighbor exclusion list");
       ex_type[ex1_type[i]][ex2_type[i]] = 1;
       ex_type[ex2_type[i]][ex1_type[i]] = 1;
     }
   }
 
   // convert excluded group indices to group bitmasks
 
   if (nex_group) {
     delete [] ex1_bit;
     delete [] ex2_bit;
     ex1_bit = new int[nex_group];
     ex2_bit = new int[nex_group];
 
     for (i = 0; i < nex_group; i++) {
       ex1_bit[i] = group->bitmask[ex1_group[i]];
       ex2_bit[i] = group->bitmask[ex2_group[i]];
     }
   }
 
   if (nex_mol) {
     delete [] ex_mol_bit;
     ex_mol_bit = new int[nex_mol];
 
     for (i = 0; i < nex_mol; i++)
       ex_mol_bit[i] = group->bitmask[ex_mol_group[i]];
   }
 
   if (exclude && force->kspace && me == 0)
     error->warning(FLERR,"Neighbor exclusions used with KSpace solver "
                    "may give inconsistent Coulombic energies");
 
   // ------------------------------------------------------------------
   // pairwise lists
 
   // test if pairwise lists need to be re-created
   // no need to re-create if:
   //   neigh style, triclinic, pgsize, oneatom have not changed
   //   current requests = old requests
   // first archive request params for current requests
   //   before Neighbor possibly changes them below
 
   for (i = 0; i < nrequest; i++) requests[i]->archive();
 
   int same = 1;
   if (style != old_style) same = 0;
   if (triclinic != old_triclinic) same = 0;
   if (pgsize != old_pgsize) same = 0;
   if (oneatom != old_oneatom) same = 0;
   if (nrequest != old_nrequest) same = 0;
   else
     for (i = 0; i < nrequest; i++)
       if (requests[i]->identical(old_requests[i]) == 0) same = 0;
 
 #ifdef NEIGH_LIST_DEBUG
   if (comm->me == 0) printf("SAME flag %d\n",same);
 #endif
 
   // if old and new are not the same, create new pairwise lists
 
   if (!same) {
 
     // delete old lists and create new ones
 
     for (i = 0; i < nlist; i++) delete lists[i];
     delete [] lists;
     delete [] pair_build;
     delete [] stencil_create;
 
     if (lmp->kokkos) nlist = init_lists_kokkos();
     else nlist = nrequest;
 
     lists = new NeighList*[nrequest];
     pair_build = new PairPtr[nrequest];
     stencil_create = new StencilPtr[nrequest];
 
     // initialize to NULL since some may be Kokkos lists
 
     for (i = 0; i < nrequest; i++) {
       lists[i] = NULL;
       pair_build[i] = NULL;
       stencil_create[i] = NULL;
     }
 
     // create individual lists, one per request
     // pass list ptr back to requestor (except for Command class)
     // wait to allocate initial pages until copy lists are detected
     // Kokkos requests are skipped, so their lists[i] stays NULL
 
     for (i = 0; i < nrequest; i++) {
       if (requests[i]->kokkos_host || requests[i]->kokkos_device) continue;
       lists[i] = new NeighList(lmp);
       lists[i]->index = i;
 
       if (requests[i]->pair) {
         Pair *pair = (Pair *) requests[i]->requestor;
         pair->init_list(requests[i]->id,lists[i]);
       } else if (requests[i]->fix) {
         Fix *fix = (Fix *) requests[i]->requestor;
         fix->init_list(requests[i]->id,lists[i]);
       } else if (requests[i]->compute) {
         Compute *compute = (Compute *) requests[i]->requestor;
         compute->init_list(requests[i]->id,lists[i]);
       }
     }
 
     // detect lists that are connected to other lists
     // if-then-else sequence and processed flag is important
     //   since don't want to re-process skip or copy lists further down
 
     int processed;
 
     for (i = 0; i < nrequest; i++) {
       if (!lists[i]) continue;
       processed = 0;
 
       // copy: point this list at request->otherlist, could be a skip list
 
       if (requests[i]->copy) {
         lists[i]->listcopy = lists[requests[i]->otherlist];
         processed = 1;
 
       // skip: point this list at request->otherlist,
       //       copy skip info from request
       // skip list still needs to have granhistory or respa info added below
 
       } else if (requests[i]->skip) {
         lists[i]->listskip = lists[requests[i]->otherlist];
         lists[i]->copy_skip_info(requests[i]->iskip,requests[i]->ijskip);
         processed = 1;
 
       // half_from_full: point this list at full list that comes right before
       //   will only be case if pair style requested one after other
 
       } else if (requests[i]->half_from_full) {
         lists[i]->listfull = lists[i-1];
         processed = 1;
       }
 
       // granhistory: set preceding list's listgranhistory to this list
       //               also set preceding list's ptr to FixShearHistory
 
       if (requests[i]->granhistory) {
         lists[i-1]->listgranhistory = lists[i];
         for (int ifix = 0; ifix < modify->nfix; ifix++)
           if (strcmp(modify->fix[ifix]->style,"SHEAR_HISTORY") == 0)
             lists[i-1]->fix_history = (FixShearHistory *) modify->fix[ifix];
         processed = 1;
 
       // respaouter: point this list at preceding 1/2 inner/middle lists
 
       } else if (requests[i]->respaouter) {
         if (requests[i-1]->respainner) {
           lists[i]->respamiddle = 0;
           lists[i]->listinner = lists[i-1];
         } else {
           lists[i]->respamiddle = 1;
           lists[i]->listmiddle = lists[i-1];
           lists[i]->listinner = lists[i-2];
         }
         processed = 1;
       }
 
       if (processed) continue;
 
       // pair and half and newton != 2:
       //   if there is a full non-occasional non-skip list
       //   change this list to half_from_full and point at the full list
       //   parent could be copy list or pair or fix
       // could remove newton != 2 check if added half_from_full_no_newton_ghost
       //   option in neigh_derive.cpp and below in choose_build()
       //   this would require full list had ghost info
       //   would be useful when reax/c used in hybrid mode, e.g. with airebo
 
       if (requests[i]->pair && requests[i]->half && requests[i]->newton != 2) {
         for (j = 0; j < nrequest; j++) {
           if (!lists[j]) continue;
           if (requests[j]->full && requests[j]->occasional == 0 &&
               requests[j]->skip == 0) break;
         }
         if (j < nrequest) {
           requests[i]->half = 0;
           requests[i]->half_from_full = 1;
           lists[i]->listfull = lists[j];
         }
 
       // fix/compute requests:
       // whether request is occasional or not doesn't matter
       // if request = half and non-skip pair half/respaouter exists,
       //   become copy of that list if cudable flag matches
       // if request = full and non-skip pair full exists,
       //   become copy of that list if cudable flag matches
       // if request = half and non-skip pair full exists,
       //   become half_from_full of that list if cudable flag matches
       // if no matches, do nothing
       //   fix/compute list will be built independently as needed
       // ok if parent is itself a copy list
 
       } else if (requests[i]->fix || requests[i]->compute) {
         for (j = 0; j < nrequest; j++) {
           if (!lists[j]) continue;
           if (requests[i]->half && requests[j]->pair &&
               requests[j]->skip == 0 && requests[j]->half) break;
           if (requests[i]->full && requests[j]->pair &&
               requests[j]->skip == 0 && requests[j]->full) break;
           if (requests[i]->gran && requests[j]->pair &&
               requests[j]->skip == 0 && requests[j]->gran) break;
           if (requests[i]->half && requests[j]->pair &&
               requests[j]->skip == 0 && requests[j]->respaouter) break;
         }
         if (j < nrequest && requests[j]->cudable != requests[i]->cudable)
           j = nrequest;
         if (j < nrequest) {
           requests[i]->copy = 1;
           requests[i]->otherlist = j;
           lists[i]->listcopy = lists[j];
         } else {
           for (j = 0; j < nrequest; j++) {
             if (!lists[j]) continue;
             if (requests[i]->half && requests[j]->pair &&
                 requests[j]->skip == 0 && requests[j]->full) break;
           }
           if (j < nrequest && requests[j]->cudable != requests[i]->cudable)
             j = nrequest;
           if (j < nrequest) {
             requests[i]->half = 0;
             requests[i]->half_from_full = 1;
             lists[i]->listfull = lists[j];
           }
         }
       }
     }
 
     // allocate initial pages for each list, except if listcopy set
 
     for (i = 0; i < nrequest; i++) {
       if (!lists[i]) continue;
       if (!lists[i]->listcopy)
         lists[i]->setup_pages(pgsize,oneatom,requests[i]->dnum);
     }
 
     // set ptrs to pair_build and stencil_create functions for each list
     // ptrs set to NULL if not set explicitly
     // also set cudable to 0 if any neigh list request is not cudable
 
     for (i = 0; i < nrequest; i++) {
       choose_build(i,requests[i]);
       if (style != NSQ) choose_stencil(i,requests[i]);
       else stencil_create[i] = NULL;
       if (!requests[i]->cudable) cudable = 0;
     }
 
     // set each list's build/grow/stencil/ghost flags based on neigh request
     // buildflag = 1 if its pair_build() invoked every reneighbor
     // growflag = 1 if it stores atom-based arrays and pages
     // stencilflag = 1 if it stores stencil arrays
     // ghostflag = 1 if it stores neighbors of ghosts
     // anyghostlist = 1 if any non-occasional list stores neighbors of ghosts
 
     anyghostlist = 0;
     for (i = 0; i < nrequest; i++) {
       if (lists[i]) {
         lists[i]->buildflag = 1;
         if (pair_build[i] == NULL) lists[i]->buildflag = 0;
         if (requests[i]->occasional) lists[i]->buildflag = 0;
 
         lists[i]->growflag = 1;
         if (requests[i]->copy) lists[i]->growflag = 0;
 
         lists[i]->stencilflag = 1;
         if (style == NSQ) lists[i]->stencilflag = 0;
         if (stencil_create[i] == NULL) lists[i]->stencilflag = 0;
 
         lists[i]->ghostflag = 0;
         if (requests[i]->ghost) lists[i]->ghostflag = 1;
         if (requests[i]->ghost && !requests[i]->occasional) anyghostlist = 1;
       } else init_list_flags1_kokkos(i);
     }
 
 #ifdef NEIGH_LIST_DEBUG
     // lists[i] is NULL for Kokkos requests, so skip those entries
     for (i = 0; i < nrequest; i++)
       if (lists[i]) lists[i]->print_attributes();
 #endif
 
     // allocate atom arrays for neighbor lists that store them
 
     maxatom = atom->nmax;
     for (i = 0; i < nrequest; i++) {
       if (lists[i]) {
         if (lists[i]->growflag) lists[i]->grow(maxatom);
       } else init_list_grow_kokkos(i);
     }
 
     // setup 3 vectors of pairwise neighbor lists
     // blist = lists whose pair_build() is invoked every reneighbor
     // glist = lists who store atom arrays which are used every reneighbor
     // slist = lists who store stencil arrays which are used every reneighbor
     // blist and glist vectors are used by neighbor::build()
     // slist vector is used by neighbor::setup_bins()
 
     nblist = nglist = nslist = 0;
     delete [] blist;
     delete [] glist;
     delete [] slist;
     blist = new int[nrequest];
     glist = new int[nrequest];
     slist = new int[nrequest];
 
     for (i = 0; i < nrequest; i++) {
       if (lists[i]) {
         if (lists[i]->buildflag) blist[nblist++] = i;
         if (lists[i]->growflag && requests[i]->occasional == 0)
           glist[nglist++] = i;
         if (lists[i]->stencilflag && requests[i]->occasional == 0)
           slist[nslist++] = i;
       } else init_list_flags2_kokkos(i);
     }
 
 #ifdef NEIGH_LIST_DEBUG
     print_lists_of_lists();
 #endif
 
     // reorder build vector if necessary
     // relevant for lists that copy/skip/half-full from parent
     // the derived list must appear in blist after the parent list
     // no occasional lists are in build vector
     // swap two lists within blist when dependency is mis-ordered
     // done when entire pass thru blist results in no swaps
     // NOTE(review): swap assumes the parent list is itself in blist,
     //   otherwise j == nblist indexes past the filled entries - confirm
 
     int done = 0;
     while (!done) {
       done = 1;
       for (i = 0; i < nblist; i++) {
         if (!lists[blist[i]]) continue;
         NeighList *ptr = NULL;
         if (lists[blist[i]]->listfull) ptr = lists[blist[i]]->listfull;
         if (lists[blist[i]]->listcopy) ptr = lists[blist[i]]->listcopy;
         if (lists[blist[i]]->listskip) ptr = lists[blist[i]]->listskip;
         if (ptr == NULL) continue;
         for (m = 0; m < nrequest; m++)
           if (ptr == lists[m]) break;
         for (j = 0; j < nblist; j++)
           if (m == blist[j]) break;
         if (j < i) continue;
         int tmp = blist[i];
         blist[i] = blist[j];
         blist[j] = tmp;
         done = 0;
         break;
       }
     }
 
 #ifdef NEIGH_LIST_DEBUG
     print_lists_of_lists();
 #endif
   }
 
   // mark all current requests as processed
   // delete old requests
   // copy current requests and style to old for next run
   // NOTE(review): old_pgsize and old_oneatom are compared above
   //   but not updated here - presumably set elsewhere, confirm
 
   for (i = 0; i < nrequest; i++) requests[i]->unprocessed = 0;
   for (i = 0; i < old_nrequest; i++) delete old_requests[i];
   memory->sfree(old_requests);
   old_nrequest = nrequest;
   old_requests = requests;
   nrequest = maxrequest = 0;
   requests = NULL;
   old_style = style;
   old_triclinic = triclinic;
 
   // ------------------------------------------------------------------
   // topology lists
 
   // 1st time allocation of topology lists
   // initial size = global count on 1 proc,
   //   else LB_FACTOR times the per-proc average
 
   if (atom->molecular && atom->nbonds && maxbond == 0) {
     if (nprocs == 1) maxbond = atom->nbonds;
     else maxbond = static_cast<int> (LB_FACTOR * atom->nbonds / nprocs);
     memory->create(bondlist,maxbond,3,"neigh:bondlist");
   }
 
   if (atom->molecular && atom->nangles && maxangle == 0) {
     if (nprocs == 1) maxangle = atom->nangles;
     else maxangle = static_cast<int> (LB_FACTOR * atom->nangles / nprocs);
     memory->create(anglelist,maxangle,4,"neigh:anglelist");
   }
 
   if (atom->molecular && atom->ndihedrals && maxdihedral == 0) {
     if (nprocs == 1) maxdihedral = atom->ndihedrals;
     else maxdihedral = static_cast<int>
            (LB_FACTOR * atom->ndihedrals / nprocs);
     memory->create(dihedrallist,maxdihedral,5,"neigh:dihedrallist");
   }
 
   if (atom->molecular && atom->nimpropers && maximproper == 0) {
     if (nprocs == 1) maximproper = atom->nimpropers;
     else maximproper = static_cast<int>
            (LB_FACTOR * atom->nimpropers / nprocs);
     memory->create(improperlist,maximproper,5,"neigh:improperlist");
   }
 
   // set flags that determine which topology neighboring routines to use
   // bonds,etc can only be broken for atom->molecular = 1, not 2
   // SHAKE sets bonds and angles negative
   // bond_quartic sets bonds to 0
   // delete_bonds sets all interactions negative
 
   int bond_off = 0;
   int angle_off = 0;
   for (i = 0; i < modify->nfix; i++)
     if (strcmp(modify->fix[i]->style,"shake") == 0)
       bond_off = angle_off = 1;
   if (force->bond && force->bond_match("quartic")) bond_off = 1;
 
   // scan local atoms for any interaction with type <= 0
   // stop scanning as soon as one is found
 
   if (atom->avec->bonds_allow && atom->molecular == 1) {
     for (i = 0; i < atom->nlocal; i++) {
       if (bond_off) break;
       for (m = 0; m < atom->num_bond[i]; m++)
         if (atom->bond_type[i][m] <= 0) bond_off = 1;
     }
   }
 
   if (atom->avec->angles_allow && atom->molecular == 1) {
     for (i = 0; i < atom->nlocal; i++) {
       if (angle_off) break;
       for (m = 0; m < atom->num_angle[i]; m++)
         if (atom->angle_type[i][m] <= 0) angle_off = 1;
     }
   }
 
   int dihedral_off = 0;
   if (atom->avec->dihedrals_allow && atom->molecular == 1) {
     for (i = 0; i < atom->nlocal; i++) {
       if (dihedral_off) break;
       for (m = 0; m < atom->num_dihedral[i]; m++)
         if (atom->dihedral_type[i][m] <= 0) dihedral_off = 1;
     }
   }
 
   int improper_off = 0;
   if (atom->avec->impropers_allow && atom->molecular == 1) {
     for (i = 0; i < atom->nlocal; i++) {
       if (improper_off) break;
       for (m = 0; m < atom->num_improper[i]; m++)
         if (atom->improper_type[i][m] <= 0) improper_off = 1;
     }
   }
 
   // sync on/off settings across all procs
   // MPI_MAX = flag is set on all procs if it is set on any proc
 
   int on_or_off = bond_off;
   MPI_Allreduce(&on_or_off,&bond_off,1,MPI_INT,MPI_MAX,world);
   on_or_off = angle_off;
   MPI_Allreduce(&on_or_off,&angle_off,1,MPI_INT,MPI_MAX,world);
   on_or_off = dihedral_off;
   MPI_Allreduce(&on_or_off,&dihedral_off,1,MPI_INT,MPI_MAX,world);
   on_or_off = improper_off;
   MPI_Allreduce(&on_or_off,&improper_off,1,MPI_INT,MPI_MAX,world);
 
   // set ptrs to topology build functions
   // molecular = 2 uses template versions,
   //   else partial versions if any interaction is off, else all versions
 
   if (atom->molecular == 2) bond_build = &Neighbor::bond_template;
   else if (bond_off) bond_build = &Neighbor::bond_partial;
   else bond_build = &Neighbor::bond_all;
 
   if (atom->molecular == 2) angle_build = &Neighbor::angle_template;
   else if (angle_off) angle_build = &Neighbor::angle_partial;
   else angle_build = &Neighbor::angle_all;
 
   if (atom->molecular == 2) dihedral_build = &Neighbor::dihedral_template;
   else if (dihedral_off) dihedral_build = &Neighbor::dihedral_partial;
   else dihedral_build = &Neighbor::dihedral_all;
 
   if (atom->molecular == 2) improper_build = &Neighbor::improper_template;
   else if (improper_off) improper_build = &Neighbor::improper_partial;
   else improper_build = &Neighbor::improper_all;
 
   // set topology neighbor list counts to 0
   // in case all are turned off but potential is still defined
 
   nbondlist = nanglelist = ndihedrallist = nimproperlist = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int Neighbor::request(void *requestor)
 {
   if (nrequest == maxrequest) {
     maxrequest += RQDELTA;
     requests = (NeighRequest **)
       memory->srealloc(requests,maxrequest*sizeof(NeighRequest *),
                        "neighbor:requests");
   }
 
   requests[nrequest] = new NeighRequest(lmp);
   requests[nrequest]->requestor = requestor;
   nrequest++;
   return nrequest-1;
 }
 
 /* ----------------------------------------------------------------------
    determine which pair_build function each neigh list needs
    based on settings of neigh request
    copy -> copy_from function
    skip -> granular function if gran with granhistory,
            respa function if respaouter,
            skip_from function for everything else
    half_from_full, half, full, gran, respaouter ->
      choose by newton and rq->newton and tri settings
      style NSQ options = newton off, newton on
      style BIN options = newton off, newton on and not tri, newton on and tri
      style MULTI options = same options as BIN
    if none of these, ptr = NULL since pair_build is not invoked for this list
    use "else if" b/c skip,copy can be set in addition to half,full,etc
 ------------------------------------------------------------------------- */
 
 void Neighbor::choose_build(int index, NeighRequest *rq)
 {
   PairPtr pb = NULL;
 
-  if (rq->omp == 0) {
+  if (rq->omp == 0 && rq->intel == 0) {
 
     if (rq->copy) pb = &Neighbor::copy_from;
 
     else if (rq->skip) {
       if (rq->gran && lists[index]->listgranhistory)
         pb = &Neighbor::skip_from_granular;
       else if (rq->respaouter) pb = &Neighbor::skip_from_respa;
       else pb = &Neighbor::skip_from;
 
     } else if (rq->half_from_full) {
       if (rq->newton == 0) {
         if (newton_pair == 0) pb = &Neighbor::half_from_full_no_newton;
         else if (newton_pair == 1) pb = &Neighbor::half_from_full_newton;
       } else if (rq->newton == 1) {
         pb = &Neighbor::half_from_full_newton;
       } else if (rq->newton == 2) {
         pb = &Neighbor::half_from_full_no_newton;
       }
 
     } else if (rq->half) {
       if (style == NSQ) {
         if (rq->newton == 0) {
           if (newton_pair == 0) {
             if (rq->ghost == 0) pb = &Neighbor::half_nsq_no_newton;
             else if (includegroup)
               error->all(FLERR,"Neighbor include group not allowed "
                          "with ghost neighbors");
             else pb = &Neighbor::half_nsq_no_newton_ghost;
           } else if (newton_pair == 1) pb = &Neighbor::half_nsq_newton;
         } else if (rq->newton == 1) {
           pb = &Neighbor::half_nsq_newton;
         } else if (rq->newton == 2) {
           if (rq->ghost == 0) pb = &Neighbor::half_nsq_no_newton;
           else if (includegroup)
             error->all(FLERR,"Neighbor include group not allowed "
                        "with ghost neighbors");
           else pb = &Neighbor::half_nsq_no_newton_ghost;
         }
       } else if (style == BIN) {
         if (rq->newton == 0) {
           if (newton_pair == 0) {
             if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton;
             else if (includegroup)
               error->all(FLERR,"Neighbor include group not allowed "
                          "with ghost neighbors");
             else pb = &Neighbor::half_bin_no_newton_ghost;
           } else if (triclinic == 0) {
             pb = &Neighbor::half_bin_newton;
           } else if (triclinic == 1) 
             pb = &Neighbor::half_bin_newton_tri;
         } else if (rq->newton == 1) {
           if (triclinic == 0) pb = &Neighbor::half_bin_newton;
           else if (triclinic == 1) pb = &Neighbor::half_bin_newton_tri;
         } else if (rq->newton == 2) {
           if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton;
           else if (includegroup)
             error->all(FLERR,"Neighbor include group not allowed "
                        "with ghost neighbors");
           else pb = &Neighbor::half_bin_no_newton_ghost;
         }
       } else if (style == MULTI) {
         if (rq->ghost == 1)
           error->all(FLERR,
                      "Neighbor multi not yet enabled for ghost neighbors");
         if (rq->newton == 0) {
           if (newton_pair == 0) pb = &Neighbor::half_multi_no_newton;
           else if (triclinic == 0) pb = &Neighbor::half_multi_newton;
           else if (triclinic == 1) pb = &Neighbor::half_multi_newton_tri;
         } else if (rq->newton == 1) {
           if (triclinic == 0) pb = &Neighbor::half_multi_newton;
           else if (triclinic == 1) pb = &Neighbor::half_multi_newton_tri;
         } else if (rq->newton == 2) pb = &Neighbor::half_multi_no_newton;
       }
 
     } else if (rq->full) {
       if (style == NSQ) {
         if (rq->ghost == 0) pb = &Neighbor::full_nsq;
         else if (includegroup)
           error->all(FLERR,
                      "Neighbor include group not allowed with ghost neighbors");
         else pb = &Neighbor::full_nsq_ghost;
       } else if (style == BIN) {
         if (rq->ghost == 0) pb = &Neighbor::full_bin;
         else if (includegroup)
           error->all(FLERR,
                      "Neighbor include group not allowed with ghost neighbors");
         else pb = &Neighbor::full_bin_ghost;
       } else if (style == MULTI) {
         if (rq->ghost == 1)
           error->all(FLERR,
                      "Neighbor multi not yet enabled for ghost neighbors");
         pb = &Neighbor::full_multi;
       }
 
     } else if (rq->gran) {
       if (style == NSQ) {
         if (newton_pair == 0) pb = &Neighbor::granular_nsq_no_newton;
         else if (newton_pair == 1) pb = &Neighbor::granular_nsq_newton;
       } else if (style == BIN) {
         if (newton_pair == 0) pb = &Neighbor::granular_bin_no_newton;
         else if (triclinic == 0) pb = &Neighbor::granular_bin_newton;
         else if (triclinic == 1) pb = &Neighbor::granular_bin_newton_tri;
       } else if (style == MULTI)
         error->all(FLERR,"Neighbor multi not yet enabled for granular");
 
     } else if (rq->respaouter) {
       if (style == NSQ) {
         if (newton_pair == 0) pb = &Neighbor::respa_nsq_no_newton;
         else if (newton_pair == 1) pb = &Neighbor::respa_nsq_newton;
       } else if (style == BIN) {
         if (newton_pair == 0) pb = &Neighbor::respa_bin_no_newton;
         else if (triclinic == 0) pb = &Neighbor::respa_bin_newton;
         else if (triclinic == 1) pb = &Neighbor::respa_bin_newton_tri;
       } else if (style == MULTI)
         error->all(FLERR,"Neighbor multi not yet enabled for rRESPA");
     }
 
   // OMP versions of build methods
 
   } else {
 
     if (rq->copy) pb = &Neighbor::copy_from;
 
     else if (rq->skip) {
       if (rq->gran && lists[index]->listgranhistory)
         pb = &Neighbor::skip_from_granular;
       else if (rq->respaouter) pb = &Neighbor::skip_from_respa;
       else pb = &Neighbor::skip_from;
 
     } else if (rq->half_from_full) {
       if (newton_pair == 0) pb = &Neighbor::half_from_full_no_newton_omp;
       else if (newton_pair == 1) pb = &Neighbor::half_from_full_newton_omp;
 
     } else if (rq->half) {
       if (style == NSQ) {
         if (rq->newton == 0) {
           if (newton_pair == 0) {
             if (rq->ghost == 0) pb = &Neighbor::half_nsq_no_newton_omp;
             else if (includegroup)
               error->all(FLERR,"Neighbor include group not allowed "
                          "with ghost neighbors");
             else pb = &Neighbor::half_nsq_no_newton_ghost_omp;
           } else if (newton_pair == 1) pb = &Neighbor::half_nsq_newton_omp;
         } else if (rq->newton == 1) {
           pb = &Neighbor::half_nsq_newton_omp;
         } else if (rq->newton == 2) {
           if (rq->ghost == 0) pb = &Neighbor::half_nsq_no_newton_omp;
           else if (includegroup)
             error->all(FLERR,"Neighbor include group not allowed "
                        "with ghost neighbors");
           else pb = &Neighbor::half_nsq_no_newton_ghost_omp;
         }
       } else if (style == BIN) {
         if (rq->newton == 0) {
           if (newton_pair == 0) {
-            if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton_omp;
-            else if (includegroup)
+            if (rq->ghost == 0) {
+	      if (rq->intel) pb = &Neighbor::half_bin_no_newton_intel;
+	      else pb = &Neighbor::half_bin_no_newton_omp;
+            } else if (includegroup)
               error->all(FLERR,"Neighbor include group not allowed "
                          "with ghost neighbors");
             else pb = &Neighbor::half_bin_no_newton_ghost_omp;
           } else if (triclinic == 0) {
-            pb = &Neighbor::half_bin_newton_omp;
-          } else if (triclinic == 1)
-            pb = &Neighbor::half_bin_newton_tri_omp;
+            if (rq->intel) pb = &Neighbor::half_bin_newton_intel;
+            else pb = &Neighbor::half_bin_newton_omp;
+          } else if (triclinic == 1) {
+            if (rq->intel) pb = &Neighbor::half_bin_newton_tri_intel;
+            else pb = &Neighbor::half_bin_newton_tri_omp;
+	  }
         } else if (rq->newton == 1) {
-          if (triclinic == 0) pb = &Neighbor::half_bin_newton_omp;
-          else if (triclinic == 1) pb = &Neighbor::half_bin_newton_tri_omp;
+          if (triclinic == 0) {
+	    if (rq->intel) pb = &Neighbor::half_bin_newton_intel;
+	    else pb = &Neighbor::half_bin_newton_omp;
+          } else if (triclinic == 1) {
+            if (rq->intel) pb = &Neighbor::half_bin_newton_tri_intel;
+            else pb = &Neighbor::half_bin_newton_tri_omp;
+	  }
         } else if (rq->newton == 2) {
-          if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton_omp;
-          else if (includegroup)
+          if (rq->ghost == 0) {
+	    if (rq->intel) pb = &Neighbor::half_bin_no_newton_intel;
+	    else pb = &Neighbor::half_bin_no_newton_omp;
+          } else if (includegroup)
             error->all(FLERR,"Neighbor include group not allowed "
                        "with ghost neighbors");
           else pb = &Neighbor::half_bin_no_newton_ghost_omp;
         }
       } else if (style == MULTI) {
         if (rq->ghost == 1)
           error->all(FLERR,
                      "Neighbor multi not yet enabled for ghost neighbors");
         if (rq->newton == 0) {
           if (newton_pair == 0) pb = &Neighbor::half_multi_no_newton_omp;
           else if (triclinic == 0) pb = &Neighbor::half_multi_newton_omp;
           else if (triclinic == 1) pb = &Neighbor::half_multi_newton_tri_omp;
         } else if (rq->newton == 1) {
           if (triclinic == 0) pb = &Neighbor::half_multi_newton_omp;
           else if (triclinic == 1) pb = &Neighbor::half_multi_newton_tri_omp;
         } else if (rq->newton == 2) pb = &Neighbor::half_multi_no_newton_omp;
       }
 
     } else if (rq->full) {
       if (style == NSQ) {
         if (rq->ghost == 0) pb = &Neighbor::full_nsq_omp;
         else if (includegroup)
           error->all(FLERR,
                      "Neighbor include group not allowed with ghost neighbors");
         else pb = &Neighbor::full_nsq_ghost_omp;
       } else if (style == BIN) {
         if (rq->ghost == 0) pb = &Neighbor::full_bin_omp;
         else if (includegroup)
           error->all(FLERR,
                      "Neighbor include group not allowed with ghost neighbors");
         else pb = &Neighbor::full_bin_ghost_omp;
       } else if (style == MULTI) {
         if (rq->ghost == 1)
           error->all(FLERR,
                      "Neighbor multi not yet enabled for ghost neighbors");
         pb = &Neighbor::full_multi_omp;
       }
 
     } else if (rq->gran) {
       if (style == NSQ) {
         if (newton_pair == 0) pb = &Neighbor::granular_nsq_no_newton_omp;
         else if (newton_pair == 1) pb = &Neighbor::granular_nsq_newton_omp;
       } else if (style == BIN) {
         if (newton_pair == 0) pb = &Neighbor::granular_bin_no_newton_omp;
         else if (triclinic == 0) pb = &Neighbor::granular_bin_newton_omp;
         else if (triclinic == 1) pb = &Neighbor::granular_bin_newton_tri_omp;
       } else if (style == MULTI)
         error->all(FLERR,"Neighbor multi not yet enabled for granular");
 
     } else if (rq->respaouter) {
       if (style == NSQ) {
         if (newton_pair == 0) pb = &Neighbor::respa_nsq_no_newton_omp;
         else if (newton_pair == 1) pb = &Neighbor::respa_nsq_newton_omp;
       } else if (style == BIN) {
         if (newton_pair == 0) pb = &Neighbor::respa_bin_no_newton_omp;
         else if (triclinic == 0) pb = &Neighbor::respa_bin_newton_omp;
         else if (triclinic == 1) pb = &Neighbor::respa_bin_newton_tri_omp;
       } else if (style == MULTI)
         error->all(FLERR,"Neighbor multi not yet enabled for rRESPA");
     }
   }
 
   pair_build[index] = pb;
 }
 
 /* ----------------------------------------------------------------------
    determine which stencil_create function each neigh list needs
    based on settings of neigh request, only called if style != NSQ
    skip or copy or half_from_full -> no stencil
    half, gran, respaouter, full -> choose by newton and tri and dimension
    if none of these, ptr = NULL since this list needs no stencils
    use "else if" b/c skip,copy can be set in addition to half,full,etc
 ------------------------------------------------------------------------- */
 
 void Neighbor::choose_stencil(int index, NeighRequest *rq)
 {
   StencilPtr sc = NULL;
 
   if (rq->skip || rq->copy || rq->half_from_full) sc = NULL;
 
   else if (rq->half || rq->gran || rq->respaouter) {
     if (style == BIN) {
       if (rq->newton == 0) {
         if (newton_pair == 0) {
           if (dimension == 2) {
             if (rq->ghost) sc = &Neighbor::stencil_half_ghost_bin_2d_no_newton;
             else sc = &Neighbor::stencil_half_bin_2d_no_newton;
           } else if (dimension == 3) {
             if (rq->ghost) sc = &Neighbor::stencil_half_ghost_bin_3d_no_newton;
             else sc = &Neighbor::stencil_half_bin_3d_no_newton;
           }
         } else if (triclinic == 0) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_bin_2d_newton;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_bin_3d_newton;
         } else if (triclinic == 1) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_bin_2d_newton_tri;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_bin_3d_newton_tri;
         }
       } else if (rq->newton == 1) {
         if (triclinic == 0) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_bin_2d_newton;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_bin_3d_newton;
         } else if (triclinic == 1) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_bin_2d_newton_tri;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_bin_3d_newton_tri;
         }
       } else if (rq->newton == 2) {
         if (dimension == 2)
           if (rq->ghost) sc = &Neighbor::stencil_half_ghost_bin_2d_no_newton;
           else sc = &Neighbor::stencil_half_bin_2d_no_newton;
         else if (dimension == 3) {
           if (rq->ghost) sc = &Neighbor::stencil_half_ghost_bin_3d_no_newton;
           else sc = &Neighbor::stencil_half_bin_3d_no_newton;
         }
       }
 
     } else if (style == MULTI) {
       if (rq->newton == 0) {
         if (newton_pair == 0) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_multi_2d_no_newton;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_multi_3d_no_newton;
         } else if (triclinic == 0) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_multi_2d_newton;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_multi_3d_newton;
         } else if (triclinic == 1) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_multi_2d_newton_tri;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_multi_3d_newton_tri;
         }
       } else if (rq->newton == 1) {
         if (triclinic == 0) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_multi_2d_newton;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_multi_3d_newton;
         } else if (triclinic == 1) {
           if (dimension == 2)
             sc = &Neighbor::stencil_half_multi_2d_newton_tri;
           else if (dimension == 3)
             sc = &Neighbor::stencil_half_multi_3d_newton_tri;
         }
       } else if (rq->newton == 2) {
         if (dimension == 2)
           sc = &Neighbor::stencil_half_multi_2d_no_newton;
         else if (dimension == 3)
           sc = &Neighbor::stencil_half_multi_3d_no_newton;
       }
     }
 
   } else if (rq->full) {
     if (style == BIN) {
       if (dimension == 2) {
         if (rq->ghost) sc = &Neighbor::stencil_full_ghost_bin_2d;
         else sc = &Neighbor::stencil_full_bin_2d;
       }
       else if (dimension == 3) {
         if (rq->ghost) sc = &Neighbor::stencil_full_ghost_bin_3d;
         else sc = &Neighbor::stencil_full_bin_3d;
       }
     } else if (style == MULTI) {
       if (dimension == 2) sc = &Neighbor::stencil_full_multi_2d;
       else if (dimension == 3) sc = &Neighbor::stencil_full_multi_3d;
     }
   }
 
   stencil_create[index] = sc;
 }
 
 /* ----------------------------------------------------------------------
    print the contents of the build (blist), grow (glist),
      and stencil (slist) index lists to stdout
    only the proc with comm->me = 0 prints, all others return immediately
 ------------------------------------------------------------------------- */
 
 void Neighbor::print_lists_of_lists()
 {
   if (comm->me != 0) return;
 
   int m;
 
   printf("Build lists = %d: ",nblist);
   for (m = 0; m < nblist; m++) printf("%d ",blist[m]);
   printf("\n");
 
   printf("Grow lists = %d: ",nglist);
   for (m = 0; m < nglist; m++) printf("%d ",glist[m]);
   printf("\n");
 
   printf("Stencil lists = %d: ",nslist);
   for (m = 0; m < nslist; m++) printf("%d ",slist[m]);
   printf("\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
 int Neighbor::decide()
 {
   if (must_check) {
     bigint n = update->ntimestep;
     if (restart_check && n == output->next_restart) return 1;
     for (int i = 0; i < fix_check; i++)
       if (n == modify->fix[fixchecklist[i]]->next_reneighbor) return 1;
   }
 
   ago++;
   if (ago >= delay && ago % every == 0) {
     if (build_once) return 0;
     if (dist_check == 0) return 1;
     return check_distance();
   } else return 0;
 }
 
 /* ----------------------------------------------------------------------
    if any atom moved trigger distance (half of neighbor skin) return 1
    result is reduced across all procs with MPI_MAX, so all procs agree
    shrink trigger distance if box size has changed
    conservative shrink procedure:
      compute distance each of 8 corners of box has moved since last reneighbor
      reduce skin distance by sum of 2 largest of the 8 values
      new trigger = 1/2 of reduced skin distance
      if the reduced skin distance is <= 0.0, trigger = 0.0,
        so that any atom displacement at all triggers a re-neighboring
    for orthogonal box, only need 2 lo/hi corners
    for triclinic, need all 8 corners since deformations can displace all 8
    ndanger is incremented if a rebuild is triggered on the first step
      it could have been checked, i.e. ago = MAX(every,delay)
 ------------------------------------------------------------------------- */
 
 int Neighbor::check_distance()
 {
   double delx,dely,delz,rsq;
   double delta,deltasq,delta1,delta2;
 
   if (boxcheck) {
     if (triclinic == 0) {
       delx = bboxlo[0] - boxlo_hold[0];
       dely = bboxlo[1] - boxlo_hold[1];
       delz = bboxlo[2] - boxlo_hold[2];
       delta1 = sqrt(delx*delx + dely*dely + delz*delz);
       delx = bboxhi[0] - boxhi_hold[0];
       dely = bboxhi[1] - boxhi_hold[1];
       delz = bboxhi[2] - boxhi_hold[2];
       delta2 = sqrt(delx*delx + dely*dely + delz*delz);
     } else {
 
       // NOTE(review): relies on the corners ptr assigned in build()
       //   still referring to the data refreshed by box_corners() - confirm
 
       domain->box_corners();
       delta1 = delta2 = 0.0;
       for (int i = 0; i < 8; i++) {
         delx = corners[i][0] - corners_hold[i][0];
         dely = corners[i][1] - corners_hold[i][1];
         delz = corners[i][2] - corners_hold[i][2];
         delta = sqrt(delx*delx + dely*dely + delz*delz);
 
         // delta1 >= delta2 are the 2 largest displacements seen so far
         // a new maximum must demote the old maximum to 2nd place,
         //   else delta2 is under-estimated when values arrive ascending
 
         if (delta > delta1) {
           delta2 = delta1;
           delta1 = delta;
         } else if (delta > delta2) delta2 = delta;
       }
     }
 
     // box moved by more than the skin: squaring a negative delta
     //   would give a bogus positive trigger, so clamp to 0.0 instead
 
     delta = 0.5 * (skin - (delta1+delta2));
     if (delta > 0.0) deltasq = delta*delta;
     else deltasq = 0.0;
   } else deltasq = triggersq;
 
   double **x = atom->x;
   int nlocal = atom->nlocal;
   if (includegroup) nlocal = atom->nfirst;
 
   int flag = 0;
   for (int i = 0; i < nlocal; i++) {
     delx = x[i][0] - xhold[i][0];
     dely = x[i][1] - xhold[i][1];
     delz = x[i][2] - xhold[i][2];
     rsq = delx*delx + dely*dely + delz*delz;
     if (rsq > deltasq) flag = 1;
   }
 
   int flagall;
   MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world);
   if (flagall && ago == MAX(every,delay)) ndanger++;
   return flagall;
 }
 
 /* ----------------------------------------------------------------------
    build perpetuals neighbor lists
    called at setup and every few timesteps during run or minimization
    topology lists only built if topoflag = 1, USER-CUDA calls with topoflag = 0
 ------------------------------------------------------------------------- */
 
 void Neighbor::build(int topoflag)
 {
   int i;
 
   ago = 0;
   ncalls++;
   lastcall = update->ntimestep;
 
   // store current atom positions and box size if needed
 
   if (dist_check) {
     double **x = atom->x;
     int nlocal = atom->nlocal;
     if (includegroup) nlocal = atom->nfirst;
     if (nlocal > maxhold) {
       maxhold = atom->nmax;
       memory->destroy(xhold);
       memory->create(xhold,maxhold,3,"neigh:xhold");
     }
     for (i = 0; i < nlocal; i++) {
       xhold[i][0] = x[i][0];
       xhold[i][1] = x[i][1];
       xhold[i][2] = x[i][2];
     }
     if (boxcheck) {
       if (triclinic == 0) {
         boxlo_hold[0] = bboxlo[0];
         boxlo_hold[1] = bboxlo[1];
         boxlo_hold[2] = bboxlo[2];
         boxhi_hold[0] = bboxhi[0];
         boxhi_hold[1] = bboxhi[1];
         boxhi_hold[2] = bboxhi[2];
       } else {
         domain->box_corners();
         corners = domain->corners;
         for (i = 0; i < 8; i++) {
           corners_hold[i][0] = corners[i][0];
           corners_hold[i][1] = corners[i][1];
           corners_hold[i][2] = corners[i][2];
         }
       }
     }
   }
 
   // if any lists store neighbors of ghosts:
   //   invoke grow() if nlocal+nghost exceeds previous list size
   // else only invoke grow() if nlocal exceeds previous list size
   // only for lists with growflag set and which are perpetual (glist)
 
   if (anyghostlist && atom->nlocal+atom->nghost > maxatom) {
     maxatom = atom->nmax;
     for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
   } else if (atom->nlocal > maxatom) {
     maxatom = atom->nmax;
     for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
   }
 
   // extend atom bin list if necessary
 
   if (style != NSQ && atom->nmax > maxbin) {
     maxbin = atom->nmax;
     memory->destroy(bins);
     memory->create(bins,maxbin,"bins");
   }
 
   // check that using special bond flags will not overflow neigh lists
 
   if (atom->nlocal+atom->nghost > NEIGHMASK)
     error->one(FLERR,"Too many local+ghost atoms for neighbor list");
 
   // invoke building of pair and molecular topology neighbor lists
   // only for pairwise lists with buildflag set
   // blist is for standard neigh lists, otherwise is a Kokkos list
 
   for (i = 0; i < nblist; i++) {
     if (lists[blist[i]])
       (this->*pair_build[blist[i]])(lists[blist[i]]);
     else build_kokkos(i);
   }
 
   if (atom->molecular && topoflag) build_topology();
 }
 
 /* ----------------------------------------------------------------------
    build all topology neighbor lists every few timesteps
    normally built with pair lists, but USER-CUDA separates them
    each of the 4 lists is only built if the matching force style is set
 ------------------------------------------------------------------------- */
 
 void Neighbor::build_topology()
 {
   // order is bond, angle, dihedral, improper
 
   if (force->bond) {
     (this->*bond_build)();
   }
   if (force->angle) {
     (this->*angle_build)();
   }
   if (force->dihedral) {
     (this->*dihedral_build)();
   }
   if (force->improper) {
     (this->*improper_build)();
   }
 }
 
 /* ----------------------------------------------------------------------
    build a single occasional pairwise neighbor list indexed by I
    called by other classes
    returns without building if list I is already up to date
 ------------------------------------------------------------------------- */
 
 void Neighbor::build_one(int i, int preflag)
 {
   // no need to build if already built since last re-neighbor
   // preflag is set by fix bond/create and fix bond/swap
   //   b/c they invoke build_one() on same step neigh list is re-built,
   //   but before re-build, so need to use ">" instead of ">="
 
   if (preflag && lists[i]->last_build > lastcall) return;
   if (!preflag && lists[i]->last_build >= lastcall) return;
 
   lists[i]->last_build = update->ntimestep;
 
   // update stencils and grow atom arrays as needed
   // only for relevant settings of stencilflag and growflag
   // grow atom array for this list to current size of perpetual lists
 
   if (lists[i]->stencilflag) {
     lists[i]->stencil_allocate(smax,style);
     (this->*stencil_create[i])(lists[i],sx,sy,sz);
   }
 
   if (lists[i]->growflag) {
     lists[i]->grow(maxatom);
   }
 
   // build list I, turning off atom binning
   // binning results from last re-neighbor should be used instead
   // if re-bin now, atoms may have moved outside of proc domain & bin extent,
   //   leading to errors or even a crash
   // binatomflag is restored to 1 once the build is done
 
   binatomflag = 0;
   (this->*pair_build[i])(lists[i]);
   binatomflag = 1;
 }
 
 /* ----------------------------------------------------------------------
    setup neighbor binning parameters
    bin numbering in each dimension is global:
      0 = 0.0 to binsize, 1 = binsize to 2*binsize, etc
      nbin-1,nbin,etc = bbox-binsize to bbox, bbox to bbox+binsize, etc
      -1,-2,etc = -binsize to 0.0, -2*binsize to -binsize, etc
    code will work for any binsize
      since next(xyz) and stencil extend as far as necessary
      binsize = 1/2 of cutoff is roughly optimal
    for orthogonal boxes:
      a dim must be filled exactly by integer # of bins
      in periodic, procs on both sides of PBC must see same bin boundary
      in non-periodic, coord2bin() still assumes this by use of nbin xyz
    for triclinic boxes:
      tilted simulation box cannot contain integer # of bins
      stencil & neigh list built differently to account for this
    mbinlo = lowest global bin any of my ghost atoms could fall into
    mbinhi = highest global bin any of my ghost atoms could fall into
    mbin = number of bins I need in a dimension
 ------------------------------------------------------------------------- */
 
 void Neighbor::setup_bins()
 {
   // computes global bin counts (nbin xyz), actual bin sizes and inverses,
   //   this proc's bin range (mbin lo/xyz), the stencil extent (sx,sy,sz,smax),
   //   (re)allocates binhead, then creates the stencil of each list in slist
 
   // bbox = size of bbox of entire domain
   // bsubbox lo/hi = bounding box of my subdomain extended by comm->cutghost
   // for triclinic:
   //   bbox bounds all 8 corners of tilted box
   //   subdomain is in lamda coords
   //   include dimension-dependent extension via comm->cutghost
   //   domain->bbox() converts lamda extent to box coords and computes bbox
 
   double bbox[3],bsubboxlo[3],bsubboxhi[3];
   double *cutghost = comm->cutghost;
 
   if (triclinic == 0) {
     bsubboxlo[0] = domain->sublo[0] - cutghost[0];
     bsubboxlo[1] = domain->sublo[1] - cutghost[1];
     bsubboxlo[2] = domain->sublo[2] - cutghost[2];
     bsubboxhi[0] = domain->subhi[0] + cutghost[0];
     bsubboxhi[1] = domain->subhi[1] + cutghost[1];
     bsubboxhi[2] = domain->subhi[2] + cutghost[2];
   } else {
     double lo[3],hi[3];
     lo[0] = domain->sublo_lamda[0] - cutghost[0];
     lo[1] = domain->sublo_lamda[1] - cutghost[1];
     lo[2] = domain->sublo_lamda[2] - cutghost[2];
     hi[0] = domain->subhi_lamda[0] + cutghost[0];
     hi[1] = domain->subhi_lamda[1] + cutghost[1];
     hi[2] = domain->subhi_lamda[2] + cutghost[2];
     domain->bbox(lo,hi,bsubboxlo,bsubboxhi);
   }
 
   // extent of the global bounding box in each dim
 
   bbox[0] = bboxhi[0] - bboxlo[0];
   bbox[1] = bboxhi[1] - bboxlo[1];
   bbox[2] = bboxhi[2] - bboxlo[2];
 
   // optimal bin size is roughly 1/2 the cutoff
   // a user-specified binsize (binsizeflag set) overrides the choices below
   // for BIN style, binsize = 1/2 of max neighbor cutoff
   // for MULTI style, binsize = 1/2 of min neighbor cutoff
   // special case of all cutoffs = 0.0, binsize = box size
 
   double binsize_optimal;
   if (binsizeflag) binsize_optimal = binsize_user;
   else if (style == BIN) binsize_optimal = 0.5*cutneighmax;
   else binsize_optimal = 0.5*cutneighmin;
   if (binsize_optimal == 0.0) binsize_optimal = bbox[0];
   double binsizeinv = 1.0/binsize_optimal;
 
   // test for too many global bins in any dimension due to huge global domain
 
   if (bbox[0]*binsizeinv > MAXSMALLINT || bbox[1]*binsizeinv > MAXSMALLINT ||
       bbox[2]*binsizeinv > MAXSMALLINT)
     error->all(FLERR,"Domain too large for neighbor bins");
 
   // create actual bins
   // always have one bin even if cutoff > bbox
   // for 2d, nbinz = 1
 
   nbinx = static_cast<int> (bbox[0]*binsizeinv);
   nbiny = static_cast<int> (bbox[1]*binsizeinv);
   if (dimension == 3) nbinz = static_cast<int> (bbox[2]*binsizeinv);
   else nbinz = 1;
 
   if (nbinx == 0) nbinx = 1;
   if (nbiny == 0) nbiny = 1;
   if (nbinz == 0) nbinz = 1;
 
   // compute actual bin size for nbins to fit into box exactly
   // error if actual bin size << cutoff, since will create a zillion bins
   // this happens when nbin = 1 and box size << cutoff
   // typically due to non-periodic, flat system in a particular dim
   // in that extreme case, should use NSQ not BIN neighbor style
 
   binsizex = bbox[0]/nbinx;
   binsizey = bbox[1]/nbiny;
   binsizez = bbox[2]/nbinz;
 
   bininvx = 1.0 / binsizex;
   bininvy = 1.0 / binsizey;
   bininvz = 1.0 / binsizez;
 
   if (binsize_optimal*bininvx > CUT2BIN_RATIO ||
       binsize_optimal*bininvy > CUT2BIN_RATIO ||
       binsize_optimal*bininvz > CUT2BIN_RATIO)
     error->all(FLERR,"Cannot use neighbor bins - box size << cutoff");
 
   // mbinlo/hi = lowest and highest global bins my ghost atoms could be in
   // coord = lowest and highest values of coords for my ghost atoms
   // static_cast(-1.5) = -1, so subtract additional -1
   // add in SMALL for round-off safety
 
   int mbinxhi,mbinyhi,mbinzhi;
   double coord;
 
   coord = bsubboxlo[0] - SMALL*bbox[0];
   mbinxlo = static_cast<int> ((coord-bboxlo[0])*bininvx);
   if (coord < bboxlo[0]) mbinxlo = mbinxlo - 1;
   coord = bsubboxhi[0] + SMALL*bbox[0];
   mbinxhi = static_cast<int> ((coord-bboxlo[0])*bininvx);
 
   coord = bsubboxlo[1] - SMALL*bbox[1];
   mbinylo = static_cast<int> ((coord-bboxlo[1])*bininvy);
   if (coord < bboxlo[1]) mbinylo = mbinylo - 1;
   coord = bsubboxhi[1] + SMALL*bbox[1];
   mbinyhi = static_cast<int> ((coord-bboxlo[1])*bininvy);
 
   // z range is only computed here in 3d
   // in 2d, mbinzlo and mbinzhi are both assigned 0 further below
 
   if (dimension == 3) {
     coord = bsubboxlo[2] - SMALL*bbox[2];
     mbinzlo = static_cast<int> ((coord-bboxlo[2])*bininvz);
     if (coord < bboxlo[2]) mbinzlo = mbinzlo - 1;
     coord = bsubboxhi[2] + SMALL*bbox[2];
     mbinzhi = static_cast<int> ((coord-bboxlo[2])*bininvz);
   }
 
   // extend bins by 1 to insure stencil extent is included
   // if 2d, only 1 bin in z
 
   mbinxlo = mbinxlo - 1;
   mbinxhi = mbinxhi + 1;
   mbinx = mbinxhi - mbinxlo + 1;
 
   mbinylo = mbinylo - 1;
   mbinyhi = mbinyhi + 1;
   mbiny = mbinyhi - mbinylo + 1;
 
   if (dimension == 3) {
     mbinzlo = mbinzlo - 1;
     mbinzhi = mbinzhi + 1;
   } else mbinzlo = mbinzhi = 0;
   mbinz = mbinzhi - mbinzlo + 1;
 
   // memory for bin ptrs
   // product is formed in bigint so it can be tested against MAXSMALLINT
   //   before being stored in mbins
   // binhead is only re-allocated when mbins exceeds maxhead
 
   bigint bbin = ((bigint) mbinx) * ((bigint) mbiny) * ((bigint) mbinz);
   if (bbin > MAXSMALLINT) error->one(FLERR,"Too many neighbor bins");
   mbins = bbin;
   if (mbins > maxhead) {
     maxhead = mbins;
     memory->destroy(binhead);
     memory->create(binhead,maxhead,"neigh:binhead");
   }
 
   // create stencil of bins to search over in neighbor list construction
   // sx,sy,sz = max range of stencil in each dim
   // smax = max possible size of entire 3d stencil
   // stencil is empty if cutneighmax = 0.0
   // truncated bin count is bumped by 1 if it does not cover cutneighmax
 
   sx = static_cast<int> (cutneighmax*bininvx);
   if (sx*binsizex < cutneighmax) sx++;
   sy = static_cast<int> (cutneighmax*bininvy);
   if (sy*binsizey < cutneighmax) sy++;
   sz = static_cast<int> (cutneighmax*bininvz);
   if (sz*binsizez < cutneighmax) sz++;
   if (dimension == 2) sz = 0;
   smax = (2*sx+1) * (2*sy+1) * (2*sz+1);
 
   // create stencils for pairwise neighbor lists
   // only done for lists with stencilflag and buildflag set
   // a NULL entry in lists is handed to setup_bins_kokkos() instead
 
   for (int i = 0; i < nslist; i++) {
     if (lists[slist[i]]) {
       lists[slist[i]]->stencil_allocate(smax,style);
       (this->*stencil_create[slist[i]])(lists[slist[i]],sx,sy,sz);
     } else setup_bins_kokkos(i);
   }
 }
 
 /* ----------------------------------------------------------------------
    compute closest distance between central bin (0,0,0) and bin (i,j,k)
    value returned is the squared distance, no sqrt is taken
    in each dim the gap is (|offset|-1) bin widths, signed like the offset
 ------------------------------------------------------------------------- */
 
 double Neighbor::bin_distance(int i, int j, int k)
 {
   // offset of 0 in a dim contributes no separation
 
   double delx = 0.0;
   double dely = 0.0;
   double delz = 0.0;
 
   if (i > 0) delx = (i-1)*binsizex;
   else if (i < 0) delx = (i+1)*binsizex;
 
   if (j > 0) dely = (j-1)*binsizey;
   else if (j < 0) dely = (j+1)*binsizey;
 
   if (k > 0) delz = (k-1)*binsizez;
   else if (k < 0) delz = (k+1)*binsizez;
 
   return (delx*delx + dely*dely + delz*delz);
 }
 
 /* ----------------------------------------------------------------------
    set neighbor style and skin distance
 ------------------------------------------------------------------------- */
 
 void Neighbor::set(int narg, char **arg)
 {
   if (narg != 2) error->all(FLERR,"Illegal neighbor command");
 
   skin = force->numeric(FLERR,arg[0]);
   if (skin < 0.0) error->all(FLERR,"Illegal neighbor command");
 
   if (strcmp(arg[1],"nsq") == 0) style = NSQ;
   else if (strcmp(arg[1],"bin") == 0) style = BIN;
   else if (strcmp(arg[1],"multi") == 0) style = MULTI;
   else error->all(FLERR,"Illegal neighbor command");
 
   if (style == MULTI && lmp->citeme) lmp->citeme->add(cite_neigh_multi);
 }
 
 /* ----------------------------------------------------------------------
    modify parameters of the pair-wise neighbor build
 ------------------------------------------------------------------------- */
 
 void Neighbor::modify_params(int narg, char **arg)
 {
   int iarg = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"every") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       every = force->inumeric(FLERR,arg[iarg+1]);
       if (every <= 0) error->all(FLERR,"Illegal neigh_modify command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"delay") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       delay = force->inumeric(FLERR,arg[iarg+1]);
       if (delay < 0) error->all(FLERR,"Illegal neigh_modify command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"check") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       if (strcmp(arg[iarg+1],"yes") == 0) dist_check = 1;
       else if (strcmp(arg[iarg+1],"no") == 0) dist_check = 0;
       else error->all(FLERR,"Illegal neigh_modify command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"once") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       if (strcmp(arg[iarg+1],"yes") == 0) build_once = 1;
       else if (strcmp(arg[iarg+1],"no") == 0) build_once = 0;
       else error->all(FLERR,"Illegal neigh_modify command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"page") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       old_pgsize = pgsize;
       pgsize = force->inumeric(FLERR,arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"one") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       old_oneatom = oneatom;
       oneatom = force->inumeric(FLERR,arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"binsize") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       binsize_user = force->numeric(FLERR,arg[iarg+1]);
       if (binsize_user <= 0.0) binsizeflag = 0;
       else binsizeflag = 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"cluster") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       if (strcmp(arg[iarg+1],"yes") == 0) cluster_check = 1;
       else if (strcmp(arg[iarg+1],"no") == 0) cluster_check = 0;
       else error->all(FLERR,"Illegal neigh_modify command");
       iarg += 2;
 
     } else if (strcmp(arg[iarg],"include") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
       includegroup = group->find(arg[iarg+1]);
       if (includegroup < 0)
         error->all(FLERR,"Invalid group ID in neigh_modify command");
       if (includegroup && (atom->firstgroupname == NULL ||
                             strcmp(arg[iarg+1],atom->firstgroupname) != 0))
         error->all(FLERR,
                    "Neigh_modify include group != atom_modify first group");
       iarg += 2;
 
     } else if (strcmp(arg[iarg],"exclude") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal neigh_modify command");
 
       if (strcmp(arg[iarg+1],"type") == 0) {
         if (iarg+4 > narg) error->all(FLERR,"Illegal neigh_modify command");
         if (nex_type == maxex_type) {
           maxex_type += EXDELTA;
           memory->grow(ex1_type,maxex_type,"neigh:ex1_type");
           memory->grow(ex2_type,maxex_type,"neigh:ex2_type");
         }
         ex1_type[nex_type] = force->inumeric(FLERR,arg[iarg+2]);
         ex2_type[nex_type] = force->inumeric(FLERR,arg[iarg+3]);
         nex_type++;
         iarg += 4;
 
       } else if (strcmp(arg[iarg+1],"group") == 0) {
         if (iarg+4 > narg) error->all(FLERR,"Illegal neigh_modify command");
         if (nex_group == maxex_group) {
           maxex_group += EXDELTA;
           memory->grow(ex1_group,maxex_group,"neigh:ex1_group");
           memory->grow(ex2_group,maxex_group,"neigh:ex2_group");
         }
         ex1_group[nex_group] = group->find(arg[iarg+2]);
         ex2_group[nex_group] = group->find(arg[iarg+3]);
         if (ex1_group[nex_group] == -1 || ex2_group[nex_group] == -1)
           error->all(FLERR,"Invalid group ID in neigh_modify command");
         nex_group++;
         iarg += 4;
 
       } else if (strcmp(arg[iarg+1],"molecule") == 0) {
         if (iarg+3 > narg) error->all(FLERR,"Illegal neigh_modify command");
         if (atom->molecule_flag == 0)
           error->all(FLERR,"Neigh_modify exclude molecule "
                      "requires atom attribute molecule");
         if (nex_mol == maxex_mol) {
           maxex_mol += EXDELTA;
           memory->grow(ex_mol_group,maxex_mol,"neigh:ex_mol_group");
         }
         ex_mol_group[nex_mol] = group->find(arg[iarg+2]);
         if (ex_mol_group[nex_mol] == -1)
           error->all(FLERR,"Invalid group ID in neigh_modify command");
         nex_mol++;
         iarg += 3;
 
       } else if (strcmp(arg[iarg+1],"none") == 0) {
         nex_type = nex_group = nex_mol = 0;
         iarg += 2;
 
       } else error->all(FLERR,"Illegal neigh_modify command");
 
     } else error->all(FLERR,"Illegal neigh_modify command");
   }
 }
 
 /* ----------------------------------------------------------------------
    bin owned and ghost atoms
    binhead[ibin] = first atom in bin ibin, -1 if the bin is empty
    bins[i] = next atom in the same bin as atom i, -1 at end of the chain
    with an include group, only ghost atoms in the group
      and the first atom->nfirst owned atoms are binned
 ------------------------------------------------------------------------- */
 
 void Neighbor::bin_atoms()
 {
   int i,ibin;
 
   // mark every bin as empty
 
   for (i = 0; i < mbins; i++) binhead[i] = -1;
 
   // bin in reverse order so linked list will be in forward order
   // also puts ghost atoms at end of list, which is necessary
 
   double **x = atom->x;
   int nlocal = atom->nlocal;
   int nall = nlocal + atom->nghost;
 
   // no include group: every owned and ghost atom is binned
 
   if (!includegroup) {
     for (i = nall; i-- > 0; ) {
       ibin = coord2bin(x[i]);
       bins[i] = binhead[ibin];
       binhead[ibin] = i;
     }
     return;
   }
 
   // include group: ghosts first, filtered by the group bitmask
 
   int *mask = atom->mask;
   int bitmask = group->bitmask[includegroup];
 
   for (i = nall; i-- > nlocal; ) {
     if (!(mask[i] & bitmask)) continue;
     ibin = coord2bin(x[i]);
     bins[i] = binhead[ibin];
     binhead[ibin] = i;
   }
 
   // then the first atom->nfirst owned atoms, no mask test
 
   for (i = atom->nfirst; i-- > 0; ) {
     ibin = coord2bin(x[i]);
     bins[i] = binhead[ibin];
     binhead[ibin] = i;
   }
 }
 
 /* ----------------------------------------------------------------------
    map an atom coordinate to the index of its local bin
    per dimension, the global bin index is chosen from 3 cases:
      coord >= bboxhi          -> nbin,nbin+1,etc (hi ghost atoms)
      bboxlo <= coord < bboxhi -> 0 to nbin-1, clamped to nbin-1
      coord < bboxlo           -> -1,-2,etc (lo ghost atoms)
    for orthogonal boxes only ghost atoms fall outside [bboxlo,bboxhi)
      measuring hi/lo ghosts from the box edge they lie beyond keeps them in
      the correct bins despite roundoff, so that procs on either side of a
      PBC treat a pair of atoms straddling it in a consistent way
    for triclinic, doesn't matter since stencil & neigh list built differently
 ------------------------------------------------------------------------- */
 
 int Neighbor::coord2bin(double *x)
 {
   const double invsize[3] = {bininvx,bininvy,bininvz};
   const int nglobal[3] = {nbinx,nbiny,nbinz};
   int ibin[3];
 
   for (int idim = 0; idim < 3; idim++) {
     const double xd = x[idim];
 
     if (xd >= bboxhi[idim]) {
       ibin[idim] =
         static_cast<int> ((xd-bboxhi[idim])*invsize[idim]) + nglobal[idim];
     } else if (xd >= bboxlo[idim]) {
       int inside = static_cast<int> ((xd-bboxlo[idim])*invsize[idim]);
       if (inside > nglobal[idim]-1) inside = nglobal[idim]-1;
       ibin[idim] = inside;
     } else {
       ibin[idim] =
         static_cast<int> ((xd-bboxlo[idim])*invsize[idim]) - 1;
     }
   }
 
   // shift global indices by the local bin offsets and flatten, x fastest
 
   return (ibin[2]-mbinzlo)*mbiny*mbinx + (ibin[1]-mbinylo)*mbinx +
     (ibin[0]-mbinxlo);
 }
 
 /* ----------------------------------------------------------------------
    map an atom coordinate to the index of its local bin
    also return the per-dimension local bin offsets in ix,iy,iz
    bin selection per dimension:
      coord >= bboxhi          -> measured from bboxhi, shifted up by nbin
      bboxlo <= coord < bboxhi -> measured from bboxlo, clamped to nbin-1
      coord < bboxlo           -> measured from bboxlo, shifted down by 1
 ------------------------------------------------------------------------- */
 
 int Neighbor::coord2bin(double *x, int &ix, int &iy, int &iz)
 {
   // x dimension
 
   if (x[0] >= bboxhi[0]) {
     ix = nbinx + static_cast<int> ((x[0]-bboxhi[0])*bininvx);
   } else {
     ix = static_cast<int> ((x[0]-bboxlo[0])*bininvx);
     if (x[0] >= bboxlo[0]) {
       if (ix >= nbinx) ix = nbinx-1;
     } else ix--;
   }
 
   // y dimension
 
   if (x[1] >= bboxhi[1]) {
     iy = nbiny + static_cast<int> ((x[1]-bboxhi[1])*bininvy);
   } else {
     iy = static_cast<int> ((x[1]-bboxlo[1])*bininvy);
     if (x[1] >= bboxlo[1]) {
       if (iy >= nbiny) iy = nbiny-1;
     } else iy--;
   }
 
   // z dimension
 
   if (x[2] >= bboxhi[2]) {
     iz = nbinz + static_cast<int> ((x[2]-bboxhi[2])*bininvz);
   } else {
     iz = static_cast<int> ((x[2]-bboxlo[2])*bininvz);
     if (x[2] >= bboxlo[2]) {
       if (iz >= nbinz) iz = nbinz-1;
     } else iz--;
   }
 
   // convert global bin indices to local offsets, then flatten, x fastest
 
   ix = ix - mbinxlo;
   iy = iy - mbinylo;
   iz = iz - mbinzlo;
   return (iz*mbiny + iy)*mbinx + ix;
 }
 
 /* ----------------------------------------------------------------------
    test if atom pair i,j is excluded from neighbor list
    due to type, group, molecule settings from neigh_modify command
    return 1 if should be excluded, 0 if included
 ------------------------------------------------------------------------- */
 
 int Neighbor::exclusion(int i, int j, int itype, int jtype,
                         int *mask, tagint *molecule) const {
   int m;
 
   if (nex_type && ex_type[itype][jtype]) return 1;
 
   if (nex_group) {
     for (m = 0; m < nex_group; m++) {
       if (mask[i] & ex1_bit[m] && mask[j] & ex2_bit[m]) return 1;
       if (mask[i] & ex2_bit[m] && mask[j] & ex1_bit[m]) return 1;
     }
   }
 
   if (nex_mol) {
     for (m = 0; m < nex_mol; m++)
       if (mask[i] & ex_mol_bit[m] && mask[j] & ex_mol_bit[m] &&
           molecule[i] == molecule[j]) return 1;
   }
 
   return 0;
 }
 
 /* ----------------------------------------------------------------------
    tally the bytes of memory allocated by this class
    includes xhold, binning arrays, pairwise lists, and topology lists
 ------------------------------------------------------------------------- */
 
 bigint Neighbor::memory_usage()
 {
   bigint total = memory->usage(xhold,maxhold,3);
 
   // bins and binhead are not counted for style NSQ
 
   if (style != NSQ) {
     total += memory->usage(bins,maxbin);
     total += memory->usage(binhead,maxhead);
   }
 
   // pairwise lists, NULL entries are skipped
 
   for (int ilist = 0; ilist < nrequest; ilist++) {
     NeighList *one = lists[ilist];
     if (one) total += one->memory_usage();
   }
 
   // topology lists: bond, angle, dihedral, improper
 
   total += memory->usage(bondlist,maxbond,3);
   total += memory->usage(anglelist,maxangle,4);
   total += memory->usage(dihedrallist,maxdihedral,5);
   total += memory->usage(improperlist,maximproper,5);
 
   return total;
 }
 
 /* ----------------------------------------------------------------------
    return the value of exclude - used to check compatibility with GPU
    read-only accessor, no state is changed
 ------------------------------------------------------------------------- */
 
 int Neighbor::exclude_setting()
 {
   return exclude;
 }
diff --git a/src/neighbor.h b/src/neighbor.h
index 3c0c4af88..05a8622d0 100644
--- a/src/neighbor.h
+++ b/src/neighbor.h
@@ -1,424 +1,425 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_NEIGHBOR_H
 #define LMP_NEIGHBOR_H
 
 #include "pointers.h"
 
 namespace LAMMPS_NS {
 
 // Neighbor holds the settings, binning state, exclusion tables, and
 //   build/stencil function pointers for pairwise neighbor lists,
 //   plus the bond/angle/dihedral/improper topology lists
 
 class Neighbor : protected Pointers {
   friend class Cuda;
 
  public:
   int style;                       // 0,1,2 = nsq, bin, multi
   int every;                       // build every this many steps
   int delay;                       // delay build for this many steps
   int dist_check;                  // 0 = always build, 1 = only if 1/2 dist
   int ago;                         // how many steps ago neighboring occurred
   int pgsize;                      // size of neighbor page
   int oneatom;                     // max # of neighbors for one atom
   int includegroup;                // only build pairwise lists for this group
   int build_once;                  // 1 if only build lists once per run
   int cudable;                     // GPU <-> CPU communication flag for CUDA
 
   double skin;                     // skin distance
   double cutneighmin;              // min neighbor cutoff for all type pairs
   double cutneighmax;              // max neighbor cutoff for all type pairs
   double *cuttype;                 // for each type, max neigh cut w/ others
 
   bigint ncalls;                   // # of times build has been called
   bigint ndanger;                  // # of dangerous builds
   bigint lastcall;                 // timestep of last neighbor::build() call
 
   int nrequest;                    // requests for pairwise neighbor lists
   class NeighRequest **requests;   // from Pair, Fix, Compute, Command classes
   int maxrequest;
 
   int old_style;                   // previous run info to avoid
   int old_nrequest;                // re-creation of pairwise neighbor lists
   int old_triclinic;
   int old_pgsize;
   int old_oneatom;
   class NeighRequest **old_requests;
 
   int nlist;                       // pairwise neighbor lists
   class NeighList **lists;
 
   int nbondlist;                   // list of bonds to compute
   int **bondlist;
   int nanglelist;                  // list of angles to compute
   int **anglelist;
   int ndihedrallist;               // list of dihedrals to compute
   int **dihedrallist;
   int nimproperlist;               // list of impropers to compute
   int **improperlist;
 
   Neighbor(class LAMMPS *);
   virtual ~Neighbor();
   virtual void init();
   int request(void *);              // another class requests a neighbor list
   void print_lists_of_lists();      // debug print out
   int decide();                     // decide whether to build or not
   virtual int check_distance();     // check max distance moved since last build
   void setup_bins();                // setup bins based on box and cutoff
   virtual void build(int topoflag=1);  // create all neighbor lists (pair,bond)
   virtual void build_topology();    // create all topology neighbor lists
   void build_one(int, int preflag=0);  // create a single neighbor list
   void set(int, char **);           // set neighbor style and skin distance
   void modify_params(int, char**);  // modify parameters that control builds
   bigint memory_usage();            // return # of bytes of allocated memory
   int exclude_setting();            // return the value of exclude
 
  protected:
   int me,nprocs;
 
   int maxatom;                     // size of atom-based NeighList arrays
   int maxbond,maxangle,maxdihedral,maximproper;   // size of bond lists
   int maxwt;                       // max weighting factor applied + 1
 
   int must_check;                  // 1 if must check other classes to reneigh
   int restart_check;               // 1 if restart enabled, 0 if no
   int fix_check;                   // # of fixes that induce reneigh
   int *fixchecklist;               // which fixes to check
 
   double **cutneighsq;             // neighbor cutneigh sq for each type pair
   double **cutneighghostsq;        // neighbor cutnsq for each ghost type pair
   double cutneighmaxsq;            // cutneighmax squared
   double *cuttypesq;               // cuttype squared
 
   double triggersq;                // trigger = build when atom moves this dist
   int cluster_check;               // 1 if check bond/angle/etc satisfies minimg
 
   double **xhold;                      // atom coords at last neighbor build
   int maxhold;                         // size of xhold array
   int boxcheck;                        // 1 if need to store box size
   double boxlo_hold[3],boxhi_hold[3];  // box size at last neighbor build
   double corners_hold[8][3];           // box corners at last neighbor build
 
   int binatomflag;                 // bin atoms or not when build neigh list
                                    // turned off by build_one()
 
   int nbinx,nbiny,nbinz;           // # of global bins
   int *bins;                       // ptr to next atom in each bin
   int maxbin;                      // size of bins array
 
   int *binhead;                    // ptr to 1st atom in each bin
   int maxhead;                     // size of binhead array
 
   int mbins;                       // # of local bins and offset
   int mbinx,mbiny,mbinz;
   int mbinxlo,mbinylo,mbinzlo;
 
   int binsizeflag;                 // user-chosen bin size
   double binsize_user;
 
   double binsizex,binsizey,binsizez;  // actual bin sizes and inverse sizes
   double bininvx,bininvy,bininvz;
 
   int sx,sy,sz,smax;               // bin stencil extents
 
   int dimension;                   // 2/3 for 2d/3d
   int triclinic;                   // 0 if domain is orthog, 1 if triclinic
   int newton_pair;                 // 0 if newton off, 1 if on for pairwise
 
   double *bboxlo,*bboxhi;          // ptrs to full domain bounding box
   double (*corners)[3];            // ptr to 8 corners of triclinic box
 
   double inner[2],middle[2];       // rRESPA cutoffs for extra lists
   double cut_inner_sq;                   // outer cutoff for inner neighbor list
   double cut_middle_sq;            // outer cutoff for middle neighbor list
   double cut_middle_inside_sq;     // inner cutoff for middle neighbor list
 
   int special_flag[4];             // flags for 1-2, 1-3, 1-4 neighbors
 
   int anyghostlist;                // 1 if any non-occasional list
                                    // stores neighbors of ghosts
 
   int exclude;                     // 0 if no type/group exclusions, 1 if yes
 
   int nex_type;                    // # of entries in type exclusion list
   int maxex_type;                  // max # in type list
   int *ex1_type,*ex2_type;         // pairs of types to exclude
   int **ex_type;                   // 2d array of excluded type pairs
 
   int nex_group;                   // # of entries in group exclusion list
   int maxex_group;                 // max # in group list
   int *ex1_group,*ex2_group;       // pairs of group #'s to exclude
   int *ex1_bit,*ex2_bit;           // pairs of group bits to exclude
 
   int nex_mol;                     // # of entries in molecule exclusion list
   int maxex_mol;                   // max # in molecule list
   int *ex_mol_group;               // molecule group #'s to exclude
   int *ex_mol_bit;                 // molecule group bits to exclude
 
   int nblist,nglist,nslist;    // # of pairwise neigh lists of various kinds
   int *blist;                  // lists to build every reneighboring
   int *glist;                  // lists to grow atom arrays every reneigh
   int *slist;                  // lists to grow stencil arrays every reneigh
 
   void bin_atoms();                     // bin all atoms
   double bin_distance(int, int, int);   // distance between binx
   int coord2bin(double *);              // mapping atom coord to a bin
   int coord2bin(double *, int &, int &, int&); // same, also returns ix,iy,iz
 
   int exclusion(int, int, int,
                 int, int *, tagint *) const;    // test for pair exclusion
 
   virtual void choose_build(int, class NeighRequest *);
   void choose_stencil(int, class NeighRequest *);
 
   // dummy functions provided by NeighborKokkos
   // the versions declared here have empty bodies (or return 0)
 
   virtual void init_cutneighsq_kokkos(int) {}
   virtual int init_lists_kokkos() {return 0;}
   virtual void init_list_flags1_kokkos(int) {}
   virtual void init_list_flags2_kokkos(int) {}
   virtual void init_list_grow_kokkos(int) {}
   virtual void build_kokkos(int) {}
   virtual void setup_bins_kokkos(int) {}
 
   // pairwise build functions
   // pair_build = array of member-function ptrs to these build methods
 
   typedef void (Neighbor::*PairPtr)(class NeighList *);
   PairPtr *pair_build;
 
   void half_nsq_no_newton(class NeighList *);
   void half_nsq_no_newton_ghost(class NeighList *);
   void half_nsq_newton(class NeighList *);
 
   void half_bin_no_newton(class NeighList *);
   void half_bin_no_newton_ghost(class NeighList *);
   void half_bin_newton(class NeighList *);
   void half_bin_newton_tri(class NeighList *);
 
   void half_multi_no_newton(class NeighList *);
   void half_multi_newton(class NeighList *);
   void half_multi_newton_tri(class NeighList *);
 
   void full_nsq(class NeighList *);
   void full_nsq_ghost(class NeighList *);
   void full_bin(class NeighList *);
   void full_bin_ghost(class NeighList *);
   void full_multi(class NeighList *);
 
   void half_from_full_no_newton(class NeighList *);
   void half_from_full_newton(class NeighList *);
   void skip_from(class NeighList *);
   void skip_from_granular(class NeighList *);
   void skip_from_respa(class NeighList *);
   void copy_from(class NeighList *);
 
   void granular_nsq_no_newton(class NeighList *);
   void granular_nsq_newton(class NeighList *);
   void granular_bin_no_newton(class NeighList *);
   void granular_bin_newton(class NeighList *);
   void granular_bin_newton_tri(class NeighList *);
 
   void respa_nsq_no_newton(class NeighList *);
   void respa_nsq_newton(class NeighList *);
   void respa_bin_no_newton(class NeighList *);
   void respa_bin_newton(class NeighList *);
   void respa_bin_newton_tri(class NeighList *);
 
   // include prototypes for multi-threaded neighbor lists
   // builds or their corresponding dummy versions
   // LMP_INSIDE_NEIGHBOR_H is defined only around these includes
 
 #define LMP_INSIDE_NEIGHBOR_H
 #include "accelerator_omp.h"
+#include "accelerator_intel.h"
 #undef LMP_INSIDE_NEIGHBOR_H
 
   // pairwise stencil creation functions
   // stencil_create = array of member-function ptrs to these stencil methods
 
   typedef void (Neighbor::*StencilPtr)(class NeighList *, int, int, int);
   StencilPtr *stencil_create;
 
   void stencil_half_bin_2d_no_newton(class NeighList *, int, int, int);
   void stencil_half_ghost_bin_2d_no_newton(class NeighList *, int, int, int);
   void stencil_half_bin_3d_no_newton(class NeighList *, int, int, int);
   void stencil_half_ghost_bin_3d_no_newton(class NeighList *, int, int, int);
   void stencil_half_bin_2d_newton(class NeighList *, int, int, int);
   void stencil_half_bin_3d_newton(class NeighList *, int, int, int);
   void stencil_half_bin_2d_newton_tri(class NeighList *, int, int, int);
   void stencil_half_bin_3d_newton_tri(class NeighList *, int, int, int);
 
   void stencil_half_multi_2d_no_newton(class NeighList *, int, int, int);
   void stencil_half_multi_3d_no_newton(class NeighList *, int, int, int);
   void stencil_half_multi_2d_newton(class NeighList *, int, int, int);
   void stencil_half_multi_3d_newton(class NeighList *, int, int, int);
   void stencil_half_multi_2d_newton_tri(class NeighList *, int, int, int);
   void stencil_half_multi_3d_newton_tri(class NeighList *, int, int, int);
 
   void stencil_full_bin_2d(class NeighList *, int, int, int);
   void stencil_full_ghost_bin_2d(class NeighList *, int, int, int);
   void stencil_full_bin_3d(class NeighList *, int, int, int);
   void stencil_full_ghost_bin_3d(class NeighList *, int, int, int);
   void stencil_full_multi_2d(class NeighList *, int, int, int);
   void stencil_full_multi_3d(class NeighList *, int, int, int);
 
   // topology build functions
 
   typedef void (Neighbor::*BondPtr)();   // ptrs to topology build functions
 
   BondPtr bond_build;                 // ptr to bond list functions
   void bond_all();                    // bond list with all bonds
   void bond_template();               // bond list with templated bonds
   void bond_partial();                // exclude certain bonds
   void bond_check();
 
   BondPtr angle_build;                // ptr to angle list functions
   void angle_all();                   // angle list with all angles
   void angle_template();              // angle list with templated bonds
   void angle_partial();               // exclude certain angles
   void angle_check();
 
   BondPtr dihedral_build;             // ptr to dihedral list functions
   void dihedral_all();                // dihedral list with all dihedrals
   void dihedral_template();           // dihedral list with templated bonds
   void dihedral_partial();            // exclude certain dihedrals
   void dihedral_check(int, int **);
 
   BondPtr improper_build;             // ptr to improper list functions
   void improper_all();                // improper list with all impropers
   void improper_template();           // improper list with templated bonds
   void improper_partial();            // exclude certain impropers
 
   // find_special: determine if atom j is in special list of atom i
   // if it is not, return 0
   // if it is and special flag is 0 (both coeffs are 0.0), return -1
   // if it is and special flag is 1 (both coeffs are 1.0), return 0
   // if it is and special flag is 2 (otherwise), return 1,2,3
   //   for which level of neighbor it is (and which coeff it maps to)
   // list = special list that is searched for tag
   // nspecial[0],[1],[2] are used as end offsets into list:
   //   entries before nspecial[0] use special_flag[1],
   //   entries before nspecial[1] use special_flag[2],
   //   remaining entries before nspecial[2] use special_flag[3]
 
   inline int find_special(const tagint *list, const int *nspecial,
                           const tagint tag) const {
     const int n1 = nspecial[0];
     const int n2 = nspecial[1];
     const int n3 = nspecial[2];
 
     // linear search, the first matching entry decides the result
 
     for (int i = 0; i < n3; i++) {
       if (list[i] == tag) {
         if (i < n1) {
           if (special_flag[1] == 0) return -1;
           else if (special_flag[1] == 1) return 0;
           else return 1;
         } else if (i < n2) {
           if (special_flag[2] == 0) return -1;
           else if (special_flag[2] == 1) return 0;
           else return 2;
         } else {
           if (special_flag[3] == 0) return -1;
           else if (special_flag[3] == 1) return 0;
           else return 3;
         }
       }
     }
     return 0;
   };
 };
 
 }
 
 #endif
 
 /* ERROR/WARNING messages:
 
 E: Neighbor delay must be 0 or multiple of every setting
 
 The delay and every parameters set via the neigh_modify command are
 inconsistent.  If the delay setting is non-zero, then it must be a
 multiple of the every setting.
 
 E: Neighbor page size must be >= 10x the one atom setting
 
 This is required to prevent wasting too much memory.
 
 E: Invalid atom type in neighbor exclusion list
 
 Atom types must range from 1 to Ntypes inclusive.
 
 W: Neighbor exclusions used with KSpace solver may give inconsistent Coulombic energies
 
 This is because excluding specific pair interactions also excludes
 them from long-range interactions which may not be the desired effect.
 The special_bonds command handles this consistently by ensuring
 excluded (or weighted) 1-2, 1-3, 1-4 interactions are treated
 consistently by both the short-range pair style and the long-range
 solver.  This is not done for exclusions of charged atom pairs via the
 neigh_modify exclude command.
 
 E: Neighbor include group not allowed with ghost neighbors
 
 This is a current restriction within LAMMPS.
 
 E: Neighbor multi not yet enabled for ghost neighbors
 
 This is a current restriction within LAMMPS.
 
 E: Neighbor multi not yet enabled for granular
 
 Self-explanatory.
 
 E: Neighbor multi not yet enabled for rRESPA
 
 Self-explanatory.
 
 E: Too many local+ghost atoms for neighbor list
 
 The number of nlocal + nghost atoms on a processor
 is limited by the size of a 32-bit integer with 2 bits
 removed for masking 1-2, 1-3, 1-4 neighbors.
 
 W: Building an occasional neighobr list when atoms may have moved too far
 
 This can cause LAMMPS to crash when the neighbor list is built.
 The solution is to check for building the regular neighbor lists
 more frequently.
 
 E: Domain too large for neighbor bins
 
 The domain has become extremely large so that neighbor bins cannot be
 used.  Most likely, one or more atoms have been blown out of the
 simulation box to a great distance.
 
 E: Cannot use neighbor bins - box size << cutoff
 
 Too many neighbor bins will be created.  This typically happens when
 the simulation box is very small in some dimension, compared to the
 neighbor cutoff.  Use the "nsq" style instead of "bin" style.
 
 E: Too many neighbor bins
 
 This is likely due to an immense simulation box that has blown up
 to a large size.
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Invalid group ID in neigh_modify command
 
 A group ID used in the neigh_modify command does not exist.
 
 E: Neigh_modify include group != atom_modify first group
 
 Self-explanatory.
 
 E: Neigh_modify exclude molecule requires atom attribute molecule
 
 Self-explanatory.
 
 */
diff --git a/src/output.cpp b/src/output.cpp
index 0383dfe84..6a9223604 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -1,807 +1,807 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "output.h"
 #include "style_dump.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "input.h"
 #include "variable.h"
 #include "comm.h"
 #include "update.h"
 #include "group.h"
 #include "domain.h"
 #include "thermo.h"
 #include "modify.h"
 #include "compute.h"
 #include "force.h"
 #include "dump.h"
 #include "write_restart.h"
 #include "accelerator_cuda.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define DELTA 1
 
 /* ----------------------------------------------------------------------
    initialize all output
    creates 3 default computes and the default Thermo instance,
      then sets all dump and restart bookkeeping to empty/zero
 ------------------------------------------------------------------------- */
 
 Output::Output(LAMMPS *lmp) : Pointers(lmp)
 {
   // create default computes for temp,pressure,pe
   // newarg is a scratch arg list reused for all 3 add_compute() calls
   // the casts drop const from string literals to fit the char* slots
   // NOTE(review): last add_compute() arg changes from lmp->suffix to 1
   //   in this patch; its meaning is defined in Modify - confirm there
 
   char **newarg = new char*[4];
   newarg[0] = (char *) "thermo_temp";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "temp";
-  modify->add_compute(3,newarg,lmp->suffix);
+  modify->add_compute(3,newarg,1);
 
   // pressure compute takes a 4th arg naming the temperature compute above
 
   newarg[0] = (char *) "thermo_press";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pressure";
   newarg[3] = (char *) "thermo_temp";
-  modify->add_compute(4,newarg,lmp->suffix);
+  modify->add_compute(4,newarg,1);
 
   newarg[0] = (char *) "thermo_pe";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pe";
-  modify->add_compute(3,newarg,lmp->suffix);
+  modify->add_compute(3,newarg,1);
 
   delete [] newarg;
 
   // create default Thermo class
 
   newarg = new char*[1];
   newarg[0] = (char *) "one";
   thermo = new Thermo(lmp,1,newarg);
   delete [] newarg;
 
   // thermo output frequency: no fixed interval, no variable
 
   thermo_every = 0;
   var_thermo = NULL;
 
   // no dumps defined yet
 
   ndump = 0;
   max_dump = 0;
   every_dump = NULL;
   next_dump = NULL;
   last_dump = NULL;
   var_dump = NULL;
   ivar_dump = NULL;
   dump = NULL;
 
   // no restart output defined yet
 
   restart_flag = restart_flag_single = restart_flag_double = 0;
   restart_every_single = restart_every_double = 0;
   last_restart = -1;
   restart1 = restart2a = restart2b = NULL;
   var_restart_single = var_restart_double = NULL;
   restart = NULL;
 }
 
 /* ----------------------------------------------------------------------
    release everything owned by this class:
      thermo, per-dump arrays and Dump instances, restart names and writer
 ------------------------------------------------------------------------- */
 
 Output::~Output()
 {
   // delete on a NULL pointer is a no-op, so no guard is needed
 
   delete thermo;
   delete [] var_thermo;
 
   // per-dump bookkeeping arrays
 
   memory->destroy(every_dump);
   memory->destroy(next_dump);
   memory->destroy(last_dump);
 
   int idump;
 
   // variable names are freed one by one before the array that holds them
 
   for (idump = 0; idump < ndump; ++idump) delete [] var_dump[idump];
   memory->sfree(var_dump);
   memory->destroy(ivar_dump);
 
   // same for the Dump instances
 
   for (idump = 0; idump < ndump; ++idump) delete dump[idump];
   memory->sfree(dump);
 
   // restart file names, restart variable names, restart writer
 
   delete [] restart1;
   delete [] restart2a;
   delete [] restart2b;
   delete [] var_restart_single;
   delete [] var_restart_double;
   delete restart;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Output::init()
 {
   thermo->init();
   if (var_thermo) {
     ivar_thermo = input->variable->find(var_thermo);
     if (ivar_thermo < 0)
       error->all(FLERR,"Variable name for thermo every does not exist");
     if (!input->variable->equalstyle(ivar_thermo))
       error->all(FLERR,"Variable for thermo every is invalid style");
   }
 
   for (int i = 0; i < ndump; i++) dump[i]->init();
   for (int i = 0; i < ndump; i++)
     if (every_dump[i] == 0) {
       ivar_dump[i] = input->variable->find(var_dump[i]);
       if (ivar_dump[i] < 0)
         error->all(FLERR,"Variable name for dump every does not exist");
       if (!input->variable->equalstyle(ivar_dump[i]))
         error->all(FLERR,"Variable for dump every is invalid style");
     }
 
   if (restart_flag_single && restart_every_single == 0) {
     ivar_restart_single = input->variable->find(var_restart_single);
     if (ivar_restart_single < 0)
       error->all(FLERR,"Variable name for restart does not exist");
     if (!input->variable->equalstyle(ivar_restart_single))
       error->all(FLERR,"Variable for restart is invalid style");
   }
   if (restart_flag_double && restart_every_double == 0) {
     ivar_restart_double = input->variable->find(var_restart_double);
     if (ivar_restart_double < 0)
       error->all(FLERR,"Variable name for restart does not exist");
     if (!input->variable->equalstyle(ivar_restart_double))
       error->all(FLERR,"Variable for restart is invalid style");
   }
 }
 
 /* ----------------------------------------------------------------------
    perform output for setup of run/min
    do dump first, so memory_usage will include dump allocation
    do thermo last, so will print after memory_usage
    memflag = 0/1 for printing out memory usage
    on return, next_dump[], next_dump_any, next_restart*, next_thermo,
      and next hold the timesteps of the upcoming outputs
 ------------------------------------------------------------------------- */
 
 void Output::setup(int memflag)
 {
   bigint ntimestep = update->ntimestep;
 
   // perform dump at start of run only if:
   //   current timestep is multiple of every and last dump not >= this step
   //   this is first run after dump created and firstflag is set
   //   note that variable freq will not write unless triggered by firstflag
   // set next_dump to multiple of every or variable value
   // set next_dump_any to smallest next_dump
   // wrap dumps that invoke computes and variable eval with clear/add
   // if dump not written now, use addstep_compute_all() since don't know
   //   what computes the dump write would invoke
   // if no dumps, set next_dump_any to last+1 so will not influence next
 
   int writeflag;
 
   if (ndump && update->restrict_output == 0) {
     for (int idump = 0; idump < ndump; idump++) {
       if (dump[idump]->clearstep || every_dump[idump] == 0)
         modify->clearstep_compute();
       writeflag = 0;
       if (every_dump[idump] && ntimestep % every_dump[idump] == 0 &&
           last_dump[idump] != ntimestep) writeflag = 1;
       if (last_dump[idump] < 0 && dump[idump]->first_flag == 1) writeflag = 1;
 
       if (writeflag) {
         dump[idump]->write();
         last_dump[idump] = ntimestep;
       }
 
       // fixed interval: round ntimestep down to a multiple of every,
       //   then add one interval
       // variable interval: the variable supplies the next step directly
 
       if (every_dump[idump])
         next_dump[idump] =
           (ntimestep/every_dump[idump])*every_dump[idump] + every_dump[idump];
       else {
         bigint nextdump = static_cast<bigint>
           (input->variable->compute_equal(ivar_dump[idump]));
         if (nextdump <= ntimestep)
           error->all(FLERR,"Dump every variable returned a bad timestep");
         next_dump[idump] = nextdump;
       }
       if (dump[idump]->clearstep || every_dump[idump] == 0) {
         if (writeflag) modify->addstep_compute(next_dump[idump]);
         else modify->addstep_compute_all(next_dump[idump]);
       }
 
       // running minimum over all dumps, seeded by dump 0
 
       if (idump) next_dump_any = MIN(next_dump_any,next_dump[idump]);
       else next_dump_any = next_dump[0];
     }
   } else next_dump_any = update->laststep + 1;
 
   // do not write restart files at start of run
   // set next_restart values to multiple of every or variable value
   // wrap variable eval with clear/add
   // if no restarts, set next_restart to last+1 so will not influence next
 
   if (restart_flag && update->restrict_output == 0) {
     if (restart_flag_single) {
       if (restart_every_single)
         next_restart_single =
           (ntimestep/restart_every_single)*restart_every_single +
           restart_every_single;
       else {
         bigint nextrestart = static_cast<bigint>
           (input->variable->compute_equal(ivar_restart_single));
         if (nextrestart <= ntimestep)
           error->all(FLERR,"Restart variable returned a bad timestep");
         next_restart_single = nextrestart;
       }
     } else next_restart_single = update->laststep + 1;
     if (restart_flag_double) {
       if (restart_every_double)
         next_restart_double =
           (ntimestep/restart_every_double)*restart_every_double +
           restart_every_double;
       else {
         bigint nextrestart = static_cast<bigint>
           (input->variable->compute_equal(ivar_restart_double));
         if (nextrestart <= ntimestep)
           error->all(FLERR,"Restart variable returned a bad timestep");
         next_restart_double = nextrestart;
       }
     } else next_restart_double = update->laststep + 1;
 
     // earliest of the single-file and double-file restart steps
 
     next_restart = MIN(next_restart_single,next_restart_double);
   } else next_restart = update->laststep + 1;
 
   // print memory usage unless being called between multiple runs
 
   if (memflag) memory_usage();
 
   // set next_thermo to multiple of every or variable eval if var defined
   // insure thermo output on last step of run
   // thermo may invoke computes so wrap with clear/add
 
   modify->clearstep_compute();
 
   thermo->header();
   thermo->compute(0);
   last_thermo = ntimestep;
 
   // variable value is not capped at laststep, fixed interval is
   // with neither set, thermo output next occurs on the last step
 
   if (var_thermo) {
     next_thermo = static_cast<bigint>
       (input->variable->compute_equal(ivar_thermo));
     if (next_thermo <= ntimestep)
       error->all(FLERR,"Thermo every variable returned a bad timestep");
   } else if (thermo_every) {
     next_thermo = (ntimestep/thermo_every)*thermo_every + thermo_every;
     next_thermo = MIN(next_thermo,update->laststep);
   } else next_thermo = update->laststep;
 
   modify->addstep_compute(next_thermo);
 
   // next = next timestep any output will be done
 
   next = MIN(next_dump_any,next_restart);
   next = MIN(next,next_thermo);
 }
 
 /* ----------------------------------------------------------------------
    perform all output for this timestep
    only perform output if next matches current step and last output doesn't
    do dump/restart before thermo so thermo CPU time will include them
 ------------------------------------------------------------------------- */
 
 void Output::write(bigint ntimestep)
 {
   // each section below runs only if its next_* counter equals ntimestep
   // the last_* counters suppress a second write on a step that was
   //   already output, while the next_* counters are still advanced
 
   // next_dump does not force output on last step of run
   // wrap dumps that invoke computes or eval of variable with clear/add
   // download data from GPU if necessary
 
   if (next_dump_any == ntimestep) {
     if (lmp->cuda && !lmp->cuda->oncpu) lmp->cuda->downloadAll();
 
     for (int idump = 0; idump < ndump; idump++) {
       if (next_dump[idump] == ntimestep) {
         if (dump[idump]->clearstep || every_dump[idump] == 0)
           modify->clearstep_compute();
         if (last_dump[idump] != ntimestep) {
           dump[idump]->write();
           last_dump[idump] = ntimestep;
         }
 
         // every_dump = 0 means the next dump step comes from a variable
 
         if (every_dump[idump]) next_dump[idump] += every_dump[idump];
         else {
           bigint nextdump = static_cast<bigint>
             (input->variable->compute_equal(ivar_dump[idump]));
           if (nextdump <= ntimestep)
             error->all(FLERR,"Dump every variable returned a bad timestep");
           next_dump[idump] = nextdump;
         }
         if (dump[idump]->clearstep || every_dump[idump] == 0)
           modify->addstep_compute(next_dump[idump]);
       }
 
       // rebuild next_dump_any as min over all dumps, seeded by dump 0
 
       if (idump) next_dump_any = MIN(next_dump_any,next_dump[idump]);
       else next_dump_any = next_dump[0];
     }
   }
 
   // next_restart does not force output on last step of run
   // for toggle = 0, replace "*" with current timestep in restart filename
   // download data from GPU if necessary
   // eval of variable may invoke computes so wrap with clear/add
 
   if (next_restart == ntimestep) {
     if (lmp->cuda && !lmp->cuda->oncpu) lmp->cuda->downloadAll();
 
     if (next_restart_single == ntimestep) {
 
       // the "*" in restart1 is replaced by the printed timestep
       // a 64-bit value can print as up to 20 chars (19 digits + sign),
       //   so reserve 32 extra bytes to hold it plus the terminating null
       // create_restart() guarantees that restart1 contains a "*"
 
       char *file = new char[strlen(restart1) + 32];
       char *ptr = strchr(restart1,'*');
       *ptr = '\0';
       sprintf(file,"%s" BIGINT_FORMAT "%s",restart1,ntimestep,ptr+1);
       *ptr = '*';
       if (last_restart != ntimestep) restart->write(file);
       delete [] file;
       if (restart_every_single) next_restart_single += restart_every_single;
       else {
         modify->clearstep_compute();
         bigint nextrestart = static_cast<bigint>
           (input->variable->compute_equal(ivar_restart_single));
         if (nextrestart <= ntimestep)
           error->all(FLERR,"Restart variable returned a bad timestep");
         next_restart_single = nextrestart;
         modify->addstep_compute(next_restart_single);
       }
     }
     if (next_restart_double == ntimestep) {
 
       // alternate between the two fixed filenames on successive writes
 
       if (last_restart != ntimestep) {
         if (restart_toggle == 0) {
           restart->write(restart2a);
           restart_toggle = 1;
         } else {
           restart->write(restart2b);
           restart_toggle = 0;
         }
       }
       if (restart_every_double) next_restart_double += restart_every_double;
       else {
         modify->clearstep_compute();
         bigint nextrestart = static_cast<bigint>
           (input->variable->compute_equal(ivar_restart_double));
         if (nextrestart <= ntimestep)
           error->all(FLERR,"Restart variable returned a bad timestep");
         next_restart_double = nextrestart;
         modify->addstep_compute(next_restart_double);
       }
     }
     last_restart = ntimestep;
     next_restart = MIN(next_restart_single,next_restart_double);
   }
 
   // insure next_thermo forces output on last step of run
   // thermo may invoke computes so wrap with clear/add
 
   if (next_thermo == ntimestep) {
     modify->clearstep_compute();
     if (last_thermo != ntimestep) thermo->compute(1);
     last_thermo = ntimestep;
     if (var_thermo) {
       next_thermo = static_cast<bigint>
         (input->variable->compute_equal(ivar_thermo));
       if (next_thermo <= ntimestep)
         error->all(FLERR,"Thermo every variable returned a bad timestep");
     } else if (thermo_every) next_thermo += thermo_every;
     else next_thermo = update->laststep;
 
     // clamp so that thermo output always happens on the last step
 
     next_thermo = MIN(next_thermo,update->laststep);
     modify->addstep_compute(next_thermo);
   }
 
   // next = next timestep any output will be done
 
   next = MIN(next_dump_any,next_restart);
   next = MIN(next,next_thermo);
 }
 
 /* ----------------------------------------------------------------------
    force a snapshot to be written for all dumps
    called from PRD and TAD
 ------------------------------------------------------------------------- */
 
 void Output::write_dump(bigint ntimestep)
 {
   // write one snapshot per dump regardless of its schedule
   // recording ntimestep in last_dump lets write() skip a duplicate
   //   snapshot if it is later called for the same step
 
   int idump = 0;
   while (idump < ndump) {
     dump[idump]->write();
     last_dump[idump] = ntimestep;
     idump++;
   }
 }
 
 /* ----------------------------------------------------------------------
    force restart file(s) to be written
    called from PRD and TAD
 ------------------------------------------------------------------------- */
 
 void Output::write_restart(bigint ntimestep)
 {
   // single-file mode: build the filename by replacing the "*" in restart1
   //   with the printed timestep
   // a 64-bit value can print as up to 20 chars (19 digits + sign),
   //   so reserve 32 extra bytes to hold it plus the terminating null
   // create_restart() guarantees that restart1 contains a "*"
 
   if (restart_flag_single) {
     char *file = new char[strlen(restart1) + 32];
     char *ptr = strchr(restart1,'*');
     *ptr = '\0';
     sprintf(file,"%s" BIGINT_FORMAT "%s",restart1,ntimestep,ptr+1);
     *ptr = '*';
     restart->write(file);
     delete [] file;
   }
 
   // double-file mode: alternate between the two fixed filenames
 
   if (restart_flag_double) {
     if (restart_toggle == 0) {
       restart->write(restart2a);
       restart_toggle = 1;
     } else {
       restart->write(restart2b);
       restart_toggle = 0;
     }
   }
 
   // remember the step so write() does not repeat the restart output
 
   last_restart = ntimestep;
 }
 
 /* ----------------------------------------------------------------------
    timestep is being changed, called by update->reset_timestep()
    reset next timestep values for dumps, restart, thermo output
    reset to smallest value >= new timestep
    if next timestep set by variable evaluation,
      eval for ntimestep-1, so current ntimestep can be returned if needed
      no guarantee that variable can be evaluated for ntimestep-1
        if it depends on computes, but live with that rare case for now
 ------------------------------------------------------------------------- */
 
 void Output::reset_timestep(bigint ntimestep)
 {
   // dumps: recompute next_dump[] for each dump and their minimum
   // fixed interval: round ntimestep down to a multiple of every_dump,
   //   then bump by one interval if that landed before ntimestep
   // variable interval (every_dump = 0): evaluate the variable with
   //   update->ntimestep temporarily decremented by one, then restore it
 
   next_dump_any = MAXBIGINT;
   for (int idump = 0; idump < ndump; idump++) {
     if (every_dump[idump]) {
       next_dump[idump] = (ntimestep/every_dump[idump])*every_dump[idump];
       if (next_dump[idump] < ntimestep) next_dump[idump] += every_dump[idump];
     } else {
       modify->clearstep_compute();
       update->ntimestep--;
       bigint nextdump = static_cast<bigint>
         (input->variable->compute_equal(ivar_dump[idump]));
       if (nextdump < ntimestep)
         error->all(FLERR,"Dump every variable returned a bad timestep");
       update->ntimestep++;
       next_dump[idump] = nextdump;
       modify->addstep_compute(next_dump[idump]);
     }
     next_dump_any = MIN(next_dump_any,next_dump[idump]);
   }
 
   // single-file restart: same two cases as for dumps
   // if not enabled, park the counter one step past the end of the run
 
   if (restart_flag_single) {
     if (restart_every_single) {
       next_restart_single =
         (ntimestep/restart_every_single)*restart_every_single;
       if (next_restart_single < ntimestep)
         next_restart_single += restart_every_single;
     } else {
       modify->clearstep_compute();
       update->ntimestep--;
       bigint nextrestart = static_cast<bigint>
         (input->variable->compute_equal(ivar_restart_single));
       if (nextrestart < ntimestep)
         error->all(FLERR,"Restart variable returned a bad timestep");
       update->ntimestep++;
       next_restart_single = nextrestart;
       modify->addstep_compute(next_restart_single);
     }
   } else next_restart_single = update->laststep + 1;
 
   // double-file restart: identical logic with the *_double variables
 
   if (restart_flag_double) {
     if (restart_every_double) {
       next_restart_double =
         (ntimestep/restart_every_double)*restart_every_double;
       if (next_restart_double < ntimestep)
         next_restart_double += restart_every_double;
     } else {
       modify->clearstep_compute();
       update->ntimestep--;
       bigint nextrestart = static_cast<bigint>
         (input->variable->compute_equal(ivar_restart_double));
       if (nextrestart < ntimestep)
         error->all(FLERR,"Restart variable returned a bad timestep");
       update->ntimestep++;
       next_restart_double = nextrestart;
       modify->addstep_compute(next_restart_double);
     }
   } else next_restart_double = update->laststep + 1;
 
   next_restart = MIN(next_restart_single,next_restart_double);
 
   // thermo: variable takes precedence over thermo_every
   // result is always clamped to update->laststep
   // with neither a variable nor an interval, only the last step is used
 
   if (var_thermo) {
     modify->clearstep_compute();
     update->ntimestep--;
     next_thermo = static_cast<bigint>
       (input->variable->compute_equal(ivar_thermo));
     if (next_thermo < ntimestep)
       error->all(FLERR,"Thermo_modify every variable returned a bad timestep");
     update->ntimestep++;
     next_thermo = MIN(next_thermo,update->laststep);
     modify->addstep_compute(next_thermo);
   } else if (thermo_every) {
     next_thermo = (ntimestep/thermo_every)*thermo_every;
     if (next_thermo < ntimestep) next_thermo += thermo_every;
     next_thermo = MIN(next_thermo,update->laststep);
   } else next_thermo = update->laststep;
 
   // next = earliest step on which any output is due
 
   next = MIN(next_dump_any,next_restart);
   next = MIN(next,next_thermo);
 }
 
 /* ----------------------------------------------------------------------
    add a Dump to list of Dumps
 ------------------------------------------------------------------------- */
 
 void Output::add_dump(int narg, char **arg)
 {
   // args as used below:
   //   arg[0] = dump ID, arg[1] = group ID,
   //   arg[2] = dump style, arg[3] = output interval
   // all of narg/arg is also handed to the constructor of the dump style
 
   if (narg < 5) error->all(FLERR,"Illegal dump command");
 
   // error checks
   // ID must be unused, group must exist, interval must be > 0
 
   for (int idump = 0; idump < ndump; idump++)
     if (strcmp(arg[0],dump[idump]->id) == 0)
       error->all(FLERR,"Reuse of dump ID");
   int igroup = group->find(arg[1]);
   if (igroup == -1) error->all(FLERR,"Could not find dump group ID");
   if (force->inumeric(FLERR,arg[3]) <= 0) 
     error->all(FLERR,"Invalid dump frequency");
 
   // extend Dump list if necessary
   // all per-dump arrays grow together by DELTA slots
 
   if (ndump == max_dump) {
     max_dump += DELTA;
     dump = (Dump **)
       memory->srealloc(dump,max_dump*sizeof(Dump *),"output:dump");
     memory->grow(every_dump,max_dump,"output:every_dump");
     memory->grow(next_dump,max_dump,"output:next_dump");
     memory->grow(last_dump,max_dump,"output:last_dump");
     var_dump = (char **)
       memory->srealloc(var_dump,max_dump*sizeof(char *),"output:var_dump");
     memory->grow(ivar_dump,max_dump,"output:ivar_dump");
   }
 
   // create the Dump
   // style_dump.h is included with DumpStyle() defined so that each of its
   //   entries expands to one "else if" branch comparing arg[2] to the key
 
   if (0) return;         // dummy line to enable else-if macro expansion
 
 #define DUMP_CLASS
 #define DumpStyle(key,Class) \
   else if (strcmp(arg[2],#key) == 0) dump[ndump] = new Class(lmp,narg,arg);
 #include "style_dump.h"
 #undef DUMP_CLASS
 
   else error->all(FLERR,"Invalid dump style");
 
   // initialize the per-dump bookkeeping for the new slot
   // NOTE(review): the <= 0 test repeats the interval check made above
   //   and looks unreachable - confirm
   // NOTE(review): ivar_dump[ndump] is not set here - presumably assigned
   //   elsewhere before use, verify
 
   every_dump[ndump] = force->inumeric(FLERR,arg[3]);
   if (every_dump[ndump] <= 0) error->all(FLERR,"Illegal dump command");
   last_dump[ndump] = -1;
   var_dump[ndump] = NULL;
   ndump++;
 }
 
 /* ----------------------------------------------------------------------
    modify parameters of a Dump
 ------------------------------------------------------------------------- */
 
 void Output::modify_dump(int narg, char **arg)
 {
   if (narg < 1) error->all(FLERR,"Illegal dump_modify command");
 
   // find which dump it is
 
   int idump;
   for (idump = 0; idump < ndump; idump++)
     if (strcmp(arg[0],dump[idump]->id) == 0) break;
   if (idump == ndump) error->all(FLERR,"Cound not find dump_modify ID");
 
   dump[idump]->modify_params(narg-1,&arg[1]);
 }
 
 /* ----------------------------------------------------------------------
    delete a Dump from list of Dumps
 ------------------------------------------------------------------------- */
 
 void Output::delete_dump(char *id)
 {
   // search the list for a matching ID, stopping at ndump if none matches
 
   int idump = 0;
   while (idump < ndump && strcmp(id,dump[idump]->id) != 0) idump++;
   if (idump == ndump) error->all(FLERR,"Could not find undump ID");
 
   // release the Dump object and its variable name string
 
   delete dump[idump];
   delete [] var_dump[idump];
 
   // close the gap: every later entry of each per-dump array
   //   moves down by one slot
 
   for (int dst = idump; dst < ndump-1; dst++) {
     const int src = dst + 1;
     dump[dst] = dump[src];
     every_dump[dst] = every_dump[src];
     next_dump[dst] = next_dump[src];
     last_dump[dst] = last_dump[src];
     var_dump[dst] = var_dump[src];
     ivar_dump[dst] = ivar_dump[src];
   }
   ndump--;
 }
 
 /* ----------------------------------------------------------------------
    set thermo output frequency from input script
 ------------------------------------------------------------------------- */
 
 void Output::set_thermo(int narg, char **arg)
 {
   // arg[0] is either "v_name" (variable chooses the next thermo step)
   //   or an integer >= 0 (fixed thermo interval)
 
   if (narg != 1) error->all(FLERR,"Illegal thermo command");
 
   // always discard a previously stored variable name
   // setup() and write() test var_thermo before thermo_every, so a stale
   //   name would make a later numeric thermo command have no effect
 
   delete [] var_thermo;
   var_thermo = NULL;
 
   if (strstr(arg[0],"v_") == arg[0]) {
 
     // store the variable name without its "v_" prefix
 
     int n = strlen(&arg[0][2]) + 1;
     var_thermo = new char[n];
     strcpy(var_thermo,&arg[0][2]);
   } else {
     thermo_every = force->inumeric(FLERR,arg[0]);
     if (thermo_every < 0) error->all(FLERR,"Illegal thermo command");
   }
 }
 
 /* ----------------------------------------------------------------------
    new Thermo style
 ------------------------------------------------------------------------- */
 
 void Output::create_thermo(int narg, char **arg)
 {
   // replace the current Thermo instance by a new one built from narg/arg
 
   if (narg < 1) error->all(FLERR,"Illegal thermo_style command");
 
   // don't allow this so that dipole style can safely allocate inertia vector
 
   if (domain->box_exist == 0)
     error->all(FLERR,"Thermo_style command before simulation box is defined");
 
   // warn if previous thermo had been modified via thermo_modify command
   // thermo can be NULL here if an earlier call failed inside new Thermo
   //   (see below), so test the pointer before dereferencing it
 
   if (thermo && thermo->modified && comm->me == 0)
     error->warning(FLERR,"New thermo_style command, "
                    "previous thermo_modify settings will be lost");
 
   // set thermo = NULL in case new Thermo throws an error
 
   delete thermo;
   thermo = NULL;
   thermo = new Thermo(lmp,narg,arg);
 }
 
 /* ----------------------------------------------------------------------
    setup restart capability for single or double output files
    if only one filename and it contains no "*", then append ".*"
 ------------------------------------------------------------------------- */
 
 void Output::create_restart(int narg, char **arg)
 {
   // arg[0] = output interval: integer, or "v_name" for a variable
   // arg[1] (and arg[2] in two-file mode) = restart filename(s)
   // any further args are passed to WriteRestart::multiproc_options()
 
   if (narg < 1) error->all(FLERR,"Illegal restart command");
 
   int every = 0;
   int varflag = 0;
 
   if (strstr(arg[0],"v_") == arg[0]) varflag = 1;
   else every = force->inumeric(FLERR,arg[0]);
 
   // interval of 0 turns restart output off and frees all related state
 
   if (!varflag && every == 0) {
     if (narg != 1) error->all(FLERR,"Illegal restart command");
 
     restart_flag = restart_flag_single = restart_flag_double = 0;
     last_restart = -1;
 
     delete restart;
     restart = NULL;
     delete [] restart1;
     delete [] restart2a;
     delete [] restart2b;
     restart1 = restart2a = restart2b = NULL;
     delete [] var_restart_single;
     delete [] var_restart_double;
     var_restart_single = var_restart_double = NULL;
 
     return;
   }
 
   if (narg < 2) error->all(FLERR,"Illegal restart command");
 
   // even arg count = one filename, odd arg count = two filenames
 
   int nfile = 0;
   if (narg % 2 == 0) nfile = 1;
   else nfile = 2;
 
   if (nfile == 1) {
     restart_flag = restart_flag_single = 1;
 
     if (varflag) {
       delete [] var_restart_single;
       int n = strlen(&arg[0][2]) + 1;
       var_restart_single = new char[n];
       strcpy(var_restart_single,&arg[0][2]);
       restart_every_single = 0;
     } else restart_every_single = every;
 
     // free the filename from an earlier restart command before replacing it
     // +3 leaves room for the appended ".*" plus the terminating null
 
     int n = strlen(arg[1]) + 3;
     delete [] restart1;
     restart1 = new char[n];
     strcpy(restart1,arg[1]);
     if (strchr(restart1,'*') == NULL) strcat(restart1,".*");
   }
 
   if (nfile == 2) {
     restart_flag = restart_flag_double = 1;
 
     if (varflag) {
       delete [] var_restart_double;
       int n = strlen(&arg[0][2]) + 1;
       var_restart_double = new char[n];
       strcpy(var_restart_double,&arg[0][2]);
       restart_every_double = 0;
     } else restart_every_double = every;
 
     // free the filenames from an earlier restart command before replacing
 
     restart_toggle = 0;
     int n = strlen(arg[1]) + 3;
     delete [] restart2a;
     restart2a = new char[n];
     strcpy(restart2a,arg[1]);
     n = strlen(arg[2]) + 1;
     delete [] restart2b;
     restart2b = new char[n];
     strcpy(restart2b,arg[2]);
   }
 
   // check for multiproc output and an MPI-IO filename
   // if 2 filenames, must be consistent
 
   int multiproc;
   if (strchr(arg[1],'%')) multiproc = comm->nprocs;
   else multiproc = 0;
   if (nfile == 2) {
     if (multiproc && !strchr(arg[2],'%'))
       error->all(FLERR,"Both restart files must use % or neither");
     if (!multiproc && strchr(arg[2],'%'))
       error->all(FLERR,"Both restart files must use % or neither");
   }
 
   int mpiioflag;
   if (strstr(arg[1],".mpi")) mpiioflag = 1;
   else mpiioflag = 0;
   if (nfile == 2) {
     if (mpiioflag && !strstr(arg[2],".mpi"))
       error->all(FLERR,"Both restart files must use MPI-IO or neither");
     if (!mpiioflag && strstr(arg[2],".mpi"))
       error->all(FLERR,"Both restart files must use MPI-IO or neither");
   }
 
   // setup output style and process optional args
   // optional args start after the interval and the nfile filenames
 
   delete restart;
   restart = new WriteRestart(lmp);
   int iarg = nfile+1;
   restart->multiproc_options(multiproc,mpiioflag,narg-iarg,&arg[iarg]);
 }
 
 /* ----------------------------------------------------------------------
    sum and print memory usage
    result is only memory on proc 0, not averaged across procs
 ------------------------------------------------------------------------- */
 
 void Output::memory_usage()
 {
   bigint bytes = 0;
   bytes += atom->memory_usage();
   bytes += neighbor->memory_usage();
   bytes += comm->memory_usage();
   bytes += update->memory_usage();
   bytes += force->memory_usage();
   bytes += modify->memory_usage();
   for (int i = 0; i < ndump; i++) bytes += dump[i]->memory_usage();
 
   double mbytes = bytes/1024.0/1024.0;
 
   if (comm->me == 0) {
     if (screen)
       fprintf(screen,"Memory usage per processor = %g Mbytes\n",mbytes);
     if (logfile)
       fprintf(logfile,"Memory usage per processor = %g Mbytes\n",mbytes);
   }
 }
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index e15d4b00d..15de14db1 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -1,787 +1,787 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdlib.h"
 #include "string.h"
 #include "ctype.h"
 #include "pair_hybrid.h"
 #include "atom.h"
 #include "force.h"
 #include "pair.h"
 #include "neighbor.h"
 #include "neigh_request.h"
 #include "update.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp)
 {
   // start with an empty sub-style list, settings() allocates it
 
   styles = NULL;
   keywords = NULL;
   multiple = NULL;
   nstyles = 0;
 
   // set to 1 only for the duration of compute_outer()
 
   outerflag = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairHybrid::~PairHybrid()
 {
   // free every sub-style with its keyword string, then the lists
 
   for (int istyle = 0; istyle < nstyles; istyle++) {
     delete styles[istyle];
     delete [] keywords[istyle];
   }
   delete [] styles;
   delete [] keywords;
   delete [] multiple;
 
   delete [] svector;
 
   // per-type arrays exist only if allocate() was called
 
   if (!allocated) return;
 
   memory->destroy(setflag);
   memory->destroy(cutsq);
   memory->destroy(cutghost);
   memory->destroy(nmap);
   memory->destroy(map);
 }
 
 /* ----------------------------------------------------------------------
   call each sub-style's compute() or compute_outer() function
   accumulate sub-style global/peratom energy/virial in hybrid
   for global vflag = 1:
     each sub-style computes own virial[6]
     sum sub-style virial[6] to hybrid's virial[6]
   for global vflag = 2:
     call sub-style with adjusted vflag to prevent it calling
       virial_fdotr_compute()
     hybrid calls virial_fdotr_compute() on final accumulated f
 ------------------------------------------------------------------------- */
 
 void PairHybrid::compute(int eflag, int vflag)
 {
   int i,j,m,n;
 
   // if no_virial_fdotr_compute is set and global component of
   //   incoming vflag = 2, then
   // reset vflag as if global component were 1
   // necessary since one or more sub-styles cannot compute virial as F dot r
 
   if (no_virial_fdotr_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4;
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = vflag_global =
          eflag_atom = vflag_atom = 0;
 
   // check if global component of incoming vflag = 2
   // if so, reset vflag passed to substyle as if it were 0
   // necessary so substyle will not invoke virial_fdotr_compute()
 
   int vflag_substyle;
   if (vflag % 4 == 2) vflag_substyle = vflag/4 * 4;
   else vflag_substyle = vflag;
 
   for (m = 0; m < nstyles; m++) {
 
     // invoke compute() unless compute flag is turned off or
     // outerflag is set and sub-style has a compute_outer() method
 
     if (styles[m]->compute_flag == 0) continue;
     if (outerflag && styles[m]->respa_enable) 
       styles[m]->compute_outer(eflag,vflag_substyle);
     else styles[m]->compute(eflag,vflag_substyle);
 
     if (eflag_global) {
       eng_vdwl += styles[m]->eng_vdwl;
       eng_coul += styles[m]->eng_coul;
     }
     if (vflag_global) {
       for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
     }
     if (eflag_atom) {
       n = atom->nlocal;
       if (force->newton_pair) n += atom->nghost;
       double *eatom_substyle = styles[m]->eatom;
       for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
     }
     if (vflag_atom) {
       n = atom->nlocal;
       if (force->newton_pair) n += atom->nghost;
       double **vatom_substyle = styles[m]->vatom;
       for (i = 0; i < n; i++)
         for (j = 0; j < 6; j++)
           vatom[i][j] += vatom_substyle[i][j];
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_inner()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]->respa_enable) styles[m]->compute_inner();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_middle()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]->respa_enable) styles[m]->compute_middle();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_outer(int eflag, int vflag)
 {
   outerflag = 1;
   compute(eflag,vflag);
   outerflag = 0;
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairHybrid::allocate()
 {
   allocated = 1;
   int n = atom->ntypes;
 
   memory->create(setflag,n+1,n+1,"pair:setflag");
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       setflag[i][j] = 0;
 
   memory->create(cutsq,n+1,n+1,"pair:cutsq");
   memory->create(cutghost,n+1,n+1,"pair:cutghost");
 
   memory->create(nmap,n+1,n+1,"pair:nmap");
   memory->create(map,n+1,n+1,nstyles,"pair:map");
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       nmap[i][j] = 0;
 }
 
 /* ----------------------------------------------------------------------
    create one pair style for each arg in list
 ------------------------------------------------------------------------- */
 
 void PairHybrid::settings(int narg, char **arg)
 {
   // arg = list of sub-style names, each followed by the settings args
   //   that belong to that sub-style
 
   if (narg < 1) error->all(FLERR,"Illegal pair_style command");
 
   // delete old lists, since cannot just change settings
   // NOTE(review): multiple is not freed here although it is allocated
   //   again with new below - looks like a leak if settings() runs more
   //   than once, confirm
 
   if (nstyles) {
     for (int m = 0; m < nstyles; m++) delete styles[m];
     delete [] styles;
     for (int m = 0; m < nstyles; m++) delete [] keywords[m];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
     memory->destroy(cutghost);
     memory->destroy(nmap);
     memory->destroy(map);
   }
   allocated = 0;
 
   // allocate list of sub-styles as big as possibly needed if no extra args
 
   styles = new Pair*[narg];
   keywords = new char*[narg];
   multiple = new int[narg];
 
   // allocate each sub-style
   // call settings() with set of args that are not pair style names
   // use force->pair_map to determine which args these are
 
-  int iarg,jarg,dummy;
+  int iarg,jarg,sflag;
 
   // iarg = index of current sub-style name
   // jarg = index of next arg that is a known pair style name, or narg
 
   iarg = 0;
   nstyles = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"hybrid") == 0)
       error->all(FLERR,"Pair style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[iarg],"none") == 0)
       error->all(FLERR,"Pair style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_pair(arg[iarg],lmp->suffix,dummy);
-    int n = strlen(arg[iarg]) + 1;
-    keywords[nstyles] = new char[n];
-    strcpy(keywords[nstyles],arg[iarg]);
+
+    styles[nstyles] = force->new_pair(arg[iarg],1,sflag);
+    force->store_style(keywords[nstyles],arg[iarg],sflag);
+
     jarg = iarg + 1;
     while (jarg < narg && !force->pair_map->count(arg[jarg])) jarg++;
     styles[nstyles]->settings(jarg-iarg-1,&arg[iarg+1]);
     iarg = jarg;
     nstyles++;
   }
 
   // multiple[i] = 1 to M if sub-style used multiple times, else 0
   // count = number of sub-styles with the same keyword as sub-style i
   // value stored at j == i is the running count, i.e. the 1-based
   //   position of sub-style i among those with the same keyword
 
   for (int i = 0; i < nstyles; i++) {
     int count = 0;
     for (int j = 0; j < nstyles; j++) {
       if (strcmp(keywords[j],keywords[i]) == 0) count++;
       if (j == i) multiple[i] = count;
     }
     if (count == 1) multiple[i] = 0;
   }
 
   // set pair flags from sub-style flags
 
   flags();
 }
 
 /* ----------------------------------------------------------------------
    set top-level pair flags from sub-style flags
 ------------------------------------------------------------------------- */
 
 void PairHybrid::flags()
 {
   int m;
 
   // set comm_forward, comm_reverse, comm_reverse_off to max of any sub-style
 
   for (m = 0; m < nstyles; m++) {
     if (styles[m]) comm_forward = MAX(comm_forward,styles[m]->comm_forward);
     if (styles[m]) comm_reverse = MAX(comm_reverse,styles[m]->comm_reverse);
     if (styles[m]) comm_reverse_off = MAX(comm_reverse_off,
                                           styles[m]->comm_reverse_off);
   }
 
   // single_enable = 1 if any sub-style is set
   // respa_enable = 1 if any sub-style is set
   // manybody_flag = 1 if any sub-style is set
   // no_virial_fdotr_compute = 1 if any sub-style is set
   // ghostneigh = 1 if any sub-style is set
   // ewaldflag, pppmflag, msmflag, dispersionflag, tip4pflag = 1
   //   if any sub-style is set
   // compute_flag = 1 if any sub-style is set
 
   single_enable = 0;
   compute_flag = 0;
   for (m = 0; m < nstyles; m++) {
     if (styles[m]->single_enable) single_enable = 1;
     if (styles[m]->respa_enable) respa_enable = 1;
     if (styles[m]->manybody_flag) manybody_flag = 1;
     if (styles[m]->no_virial_fdotr_compute) no_virial_fdotr_compute = 1;
     if (styles[m]->ghostneigh) ghostneigh = 1;
     if (styles[m]->ewaldflag) ewaldflag = 1;
     if (styles[m]->pppmflag) pppmflag = 1;
     if (styles[m]->msmflag) msmflag = 1;
     if (styles[m]->dispersionflag) dispersionflag = 1;
     if (styles[m]->tip4pflag) tip4pflag = 1;
     if (styles[m]->compute_flag) compute_flag = 1;
   }
 
   // single_extra = min of all sub-style single_extra
   // allocate svector
 
   single_extra = styles[0]->single_extra;
   for (m = 1; m < nstyles; m++)
     single_extra = MIN(single_extra,styles[m]->single_extra);
 
   if (single_extra) {
     delete [] svector;
     svector = new double[single_extra];
   }
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairHybrid::coeff(int narg, char **arg)
 {
   // arg[0], arg[1] = type ranges I and J
   // arg[2] = sub-style name or "none"
   // arg[3] = sub-style index, only if that name is used multiple times
   // remaining args = coefficients passed on to the sub-style
 
   if (narg < 3) error->all(FLERR,"Incorrect args for pair coefficients");
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   // 3rd arg = pair sub-style name
   // 4th arg = pair sub-style index if name used multiple times
   // allow for "none" as valid sub-style name
   // loop exits via break with m = matching sub-style,
   //   or runs to m = nstyles if nothing matched
 
   int multflag;
   int m;
 
   for (m = 0; m < nstyles; m++) {
     multflag = 0;
     if (strcmp(arg[2],keywords[m]) == 0) {
       if (multiple[m]) {
         multflag = 1;
         if (narg < 4) error->all(FLERR,"Incorrect args for pair coefficients");
         if (!isdigit(arg[3][0]))
           error->all(FLERR,"Incorrect args for pair coefficients");
         int index = force->inumeric(FLERR,arg[3]);
         if (index == multiple[m]) break;
         else continue;
       } else break;
     }
   }
 
   int none = 0;
   if (m == nstyles) {
     if (strcmp(arg[2],"none") == 0) none = 1;
     else error->all(FLERR,"Pair coeff for hybrid has invalid style");
   }
 
   // move 1st/2nd args to 2nd/3rd args
   // if multflag: move 1st/2nd args to 3rd/4th args
   // just copy ptrs, since arg[] points into original input line
   // this overwrites the sub-style name (and index) so that the sub-style
   //   sees the two type args directly followed by its coefficients
 
   arg[2+multflag] = arg[1];
   arg[1+multflag] = arg[0];
 
   // invoke sub-style coeff() starting with 1st remaining arg
 
   if (!none) styles[m]->coeff(narg-1-multflag,&arg[1+multflag]);
 
   // if sub-style only allows one pair coeff call (with * * and type mapping)
   // then unset setflag/map assigned to that style before setting it below
   // in case pair coeff for this sub-style is being called for 2nd time
 
   if (!none && styles[m]->one_coeff)
     for (int i = 1; i <= atom->ntypes; i++)
       for (int j = i; j <= atom->ntypes; j++)
         if (nmap[i][j] && map[i][j][0] == m) {
           setflag[i][j] = 0;
           nmap[i][j] = 0;
         }
 
   // set setflag and which type pairs map to which sub-style
   // if sub-style is none: set hybrid setflag, wipe out map
   // else: set hybrid setflag & map only if substyle setflag is set
   //       previous mappings are wiped out
   // only pairs with i <= j are visited, count = number of pairs set
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       if (none) {
         setflag[i][j] = 1;
         nmap[i][j] = 0;
         count++;
       } else if (styles[m]->setflag[i][j]) {
         setflag[i][j] = 1;
         nmap[i][j] = 1;
         map[i][j][0] = m;
         count++;
       }
     }
   }
 
   // no type pair was set by this command
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairHybrid::init_style()
 {
   int i,m,itype,jtype,used,istyle,skip;
 
   // error if a sub-style is not used
 
   int ntypes = atom->ntypes;
 
   for (istyle = 0; istyle < nstyles; istyle++) {
     used = 0;
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = itype; jtype <= ntypes; jtype++)
         for (m = 0; m < nmap[itype][jtype]; m++)
           if (map[itype][jtype][m] == istyle) used = 1;
     if (used == 0) error->all(FLERR,"Pair hybrid sub-style is not used");
   }
 
   // each sub-style makes its neighbor list request(s)
 
   for (istyle = 0; istyle < nstyles; istyle++) styles[istyle]->init_style();
 
   // create skip lists for each pair neigh request
   // any kind of list can have its skip flag set at this stage
 
   for (i = 0; i < neighbor->nrequest; i++) {
     if (!neighbor->requests[i]->pair) continue;
 
     // istyle = associated sub-style
 
     for (istyle = 0; istyle < nstyles; istyle++)
       if (styles[istyle] == neighbor->requests[i]->requestor) break;
 
     // allocate iskip and ijskip
     // initialize so as to skip all pair types
     // set ijskip = 0 if type pair matches any entry in sub-style map
     // set ijskip = 0 if mixing will assign type pair to this sub-style
     //   will occur if type pair is currently unassigned
     //   and both I,I and J,J are assigned to single sub-style
     //   and sub-style for both I,I and J,J match istyle
     // set iskip = 1 only if all ijskip for itype are 1
 
     int *iskip = new int[ntypes+1];
     int **ijskip;
     memory->create(ijskip,ntypes+1,ntypes+1,"pair_hybrid:ijskip");
 
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = 1; jtype <= ntypes; jtype++)
         ijskip[itype][jtype] = 1;
 
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = itype; jtype <= ntypes; jtype++) {
         for (m = 0; m < nmap[itype][jtype]; m++)
           if (map[itype][jtype][m] == istyle)
             ijskip[itype][jtype] = ijskip[jtype][itype] = 0;
         if (nmap[itype][jtype] == 0 &&
             nmap[itype][itype] == 1 && map[itype][itype][0] == istyle &&
             nmap[jtype][jtype] == 1 && map[jtype][jtype][0] == istyle)
           ijskip[itype][jtype] = ijskip[jtype][itype] = 0;
       }
 
     for (itype = 1; itype <= ntypes; itype++) {
       iskip[itype] = 1;
       for (jtype = 1; jtype <= ntypes; jtype++)
         if (ijskip[itype][jtype] == 0) iskip[itype] = 0;
     }
 
     // if any skipping occurs
     // set request->skip and copy iskip and ijskip into request
     // else delete iskip and ijskip
 
     skip = 0;
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = 1; jtype <= ntypes; jtype++)
         if (ijskip[itype][jtype] == 1) skip = 1;
 
     if (skip) {
       neighbor->requests[i]->skip = 1;
       neighbor->requests[i]->iskip = iskip;
       neighbor->requests[i]->ijskip = ijskip;
     } else {
       delete [] iskip;
       memory->destroy(ijskip);
     }
   }
 
   // combine sub-style neigh list requests and create new ones if needed
 
   modify_requests();
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairHybrid::init_one(int i, int j)
 {
   // if I,J is not set explicitly:
   // perform mixing only if I,I sub-style = J,J sub-style
   // also require I,I and J,J are both assigned to single sub-style
 
   if (setflag[i][j] == 0) {
     if (nmap[i][i] != 1 || nmap[j][j] != 1 || map[i][i][0] != map[j][j][0])
       error->one(FLERR,"All pair coeffs are not set");
     nmap[i][j] = 1;
     map[i][j][0] = map[i][i][0];
   }
 
   // call init/mixing for all sub-styles of I,J
   // set cutsq in sub-style just as Pair::init() does via call to init_one()
   // set cutghost for I,J and J,I just as sub-style does
   // sum tail corrections for I,J
   // return max cutoff of all sub-styles assigned to I,J
   // if no sub-styles assigned to I,J (pair_coeff none), cutmax = 0.0 returned
 
   double cutmax = 0.0;
   cutghost[i][j] = cutghost[j][i] = 0.0;
   if (tail_flag) etail_ij = ptail_ij = 0.0;
 
   nmap[j][i] = nmap[i][j];
 
   for (int k = 0; k < nmap[i][j]; k++) {
     map[j][i][k] = map[i][j][k];
     double cut = styles[map[i][j][k]]->init_one(i,j);
     styles[map[i][j][k]]->cutsq[i][j] =
       styles[map[i][j][k]]->cutsq[j][i] = cut*cut;
     if (styles[map[i][j][k]]->ghostneigh)
       cutghost[i][j] = cutghost[j][i] =
         MAX(cutghost[i][j],styles[map[i][j][k]]->cutghost[i][j]);
     if (tail_flag) {
       etail_ij += styles[map[i][j][k]]->etail_ij;
       ptail_ij += styles[map[i][j][k]]->ptail_ij;
     }
     cutmax = MAX(cutmax,cut);
   }
 
   return cutmax;
 }
 
 /* ----------------------------------------------------------------------
    combine sub-style neigh list requests and create new ones if needed
    each skip list of a pair request is pointed (via otherlist)
      at a non-skip list of the same kind
 ------------------------------------------------------------------------- */
 
 void PairHybrid::modify_requests()
 {
   // only pair requests that are skip lists and not copy lists are processed
   // copy list check is for pair style = hybrid/overlay
   //   which invokes this routine
 
   for (int i = 0; i < neighbor->nrequest; i++) {
     NeighRequest *irq = neighbor->requests[i];
     if (!irq->pair) continue;
     if (irq->skip == 0 || irq->copy) continue;
 
     // half_from_full is the only exception:
     //   turn off skip, since it will derive from its skip parent
 
     if (irq->half_from_full) {
       irq->skip = 0;
       continue;
     }
 
     // look for an existing non-skip pair request of the same kind
 
     int match = -1;
     for (int j = 0; j < neighbor->nrequest; j++) {
       NeighRequest *jrq = neighbor->requests[j];
       if (!jrq->pair) continue;
       if (irq->same_kind(jrq) && jrq->skip == 0) {
         match = j;
         break;
       }
     }
 
     // point at it if found, else make a new request of the same kind
     // don't bother to set ID for new request, since pair hybrid ignores list
 
     if (match >= 0) irq->otherlist = match;
     else {
       int newrequest = neighbor->request(this);
       neighbor->requests[newrequest]->copy_request(irq);
       irq->otherlist = newrequest;
     }
 
     // derived lists (granhistory, rRESPA inner/middle):
     //   unset skip flag and otherlist after possible new request creation
     //   this prevents neighbor from treating them as skip lists
 
     if (irq->granhistory || irq->respainner || irq->respamiddle) {
       irq->skip = 0;
       irq->otherlist = -1;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairHybrid::write_restart(FILE *fp)
 {
   fwrite(&nstyles,sizeof(int),1,fp);
 
   // each sub-style writes its settings, but no coeff info
 
   int n;
   for (int m = 0; m < nstyles; m++) {
     n = strlen(keywords[m]) + 1;
     fwrite(&n,sizeof(int),1,fp);
     fwrite(keywords[m],sizeof(char),n,fp);
     styles[m]->write_restart_settings(fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
    rebuilds the sub-style list: number of sub-styles, then for each
      sub-style its keyword, followed by the sub-style reading its settings
 ------------------------------------------------------------------------- */
 
 void PairHybrid::read_restart(FILE *fp)
 {
   // only proc 0 reads from fp, every value read is then broadcast
   // NOTE(review): fread() return values are not checked here
 
   int me = comm->me;
   if (me == 0) fread(&nstyles,sizeof(int),1,fp);
   MPI_Bcast(&nstyles,1,MPI_INT,0,world);
 
   // allocate list of sub-styles
 
   styles = new Pair*[nstyles];
   keywords = new char*[nstyles];
   multiple = new int[nstyles];
 
   // each sub-style is created via new_pair()
   // each reads its settings, but no coeff info
   // n = keyword length as stored in the file, used as allocation size
 
   int n,dummy;
   for (int m = 0; m < nstyles; m++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_pair(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_pair(keywords[m],0,dummy);
     styles[m]->read_restart_settings(fp);
   }
 
   // multiple[i] = 1 to M if sub-style used multiple times, else 0
   // count = number of sub-styles with the same keyword as sub-style i
   // multiple[i] is the running count at the point j reaches i
 
   for (int i = 0; i < nstyles; i++) {
     int count = 0;
     for (int j = 0; j < nstyles; j++) {
       if (strcmp(keywords[j],keywords[i]) == 0) count++;
       if (j == i) multiple[i] = count;
     }
     if (count == 1) multiple[i] = 0;
   }
 
   // set pair flags from sub-style flags
 
   flags();
 }
 
 /* ----------------------------------------------------------------------
    call sub-style to compute single interaction
    error if sub-style does not support single() call
    since overlay could have multiple sub-styles, sum results explicitly
 ------------------------------------------------------------------------- */
 
 double PairHybrid::single(int i, int j, int itype, int jtype,
                           double rsq, double factor_coul, double factor_lj,
                           double &fforce)
 {
   if (nmap[itype][jtype] == 0)
     error->one(FLERR,"Invoked pair single on pair style none");
 
   double fone;
   fforce = 0.0;
   double esum = 0.0;
 
   for (int m = 0; m < nmap[itype][jtype]; m++) {
     if (rsq < styles[map[itype][jtype][m]]->cutsq[itype][jtype]) {
       if (styles[map[itype][jtype][m]]->single_enable == 0)
         error->one(FLERR,"Pair hybrid sub-style does not support single call");
 
       esum += styles[map[itype][jtype][m]]->
         single(i,j,itype,jtype,rsq,factor_coul,factor_lj,fone);
       fforce += fone;
 
       // copy substyle extra values into hybrid's svector
 
       if (single_extra && styles[map[itype][jtype][m]]->single_extra)
         for (m = 0; m < single_extra; m++)
           svector[m] = styles[map[itype][jtype][m]]->svector[m];
     }
   }
 
   return esum;
 }
 
 /* ----------------------------------------------------------------------
    modify parameters of the pair style
    if 1st keyword is pair, then applies to one sub-style
    else pass command args to every sub-style of hybrid
 ------------------------------------------------------------------------- */
 
 void PairHybrid::modify_params(int narg, char **arg)
 {
   if (narg == 0) error->all(FLERR,"Illegal pair_modify command");
 
   if (strcmp(arg[0],"pair") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal pair_modify command");
     int m;
     for (m = 0; m < nstyles; m++)
       if (strcmp(arg[1],keywords[m]) == 0) break;
     if (m == nstyles) error->all(FLERR,"Unknown pair_modify hybrid sub-style");
     if (multiple[m] == 0)
       styles[m]->modify_params(narg-2,&arg[2]);
     else {
       if (narg < 3) error->all(FLERR,"Illegal pair_modify command");
       int multiflag = force->inumeric(FLERR,arg[2]);
       for (m = 0; m < nstyles; m++)
         if (strcmp(arg[1],keywords[m]) == 0 && multiflag == multiple[m]) break;
       if (m == nstyles) 
         error->all(FLERR,"Unknown pair_modify hybrid sub-style");
       styles[m]->modify_params(narg-3,&arg[3]);
     }
     
   } else
     for (int m = 0; m < nstyles; m++) styles[m]->modify_params(narg,arg);
 }
 
 /* ----------------------------------------------------------------------
    extract a ptr to a particular quantity stored by pair
    pass request thru to sub-styles
    return first non-NULL result except for cut_coul request
    for cut_coul, insure all non-NULL results are equal since required by Kspace
      and return the ptr from the last sub-style that provided one
    return NULL if no sub-style provides the quantity
 ------------------------------------------------------------------------- */
 
 void *PairHybrid::extract(const char *str, int &dim)
 {
   const int coulflag = (strcmp(str,"cut_coul") == 0);
 
   void *cutptr = NULL;
   double cutvalue = 0.0;
 
   for (int m = 0; m < nstyles; m++) {
     void *ptr = styles[m]->extract(str,dim);
     if (ptr == NULL) continue;
 
     // any other quantity: first sub-style that has it wins
 
     if (!coulflag) return ptr;
 
     // cut_coul: compare against the value seen from earlier sub-styles
 
     const double newvalue = *((double *) ptr);
     if (cutptr && newvalue != cutvalue)
       error->all(FLERR,
                  "Coulomb cutoffs of pair hybrid sub-styles do not match");
     cutptr = ptr;
     cutvalue = newvalue;
   }
 
   if (coulflag) return cutptr;
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    forward the reset_dt() call to every sub-style
 ------------------------------------------------------------------------- */
 
 void PairHybrid::reset_dt()
 {
   int m = 0;
   while (m < nstyles) {
     styles[m]->reset_dt();
     m++;
   }
 }
 
 /* ----------------------------------------------------------------------
    check if itype,jtype maps to sub-style
 ------------------------------------------------------------------------- */
 
 int PairHybrid::check_ijtype(int itype, int jtype, char *substyle)
 {
   for (int m = 0; m < nmap[itype][jtype]; m++)
     if (strcmp(keywords[map[itype][jtype][m]],substyle) == 0) return 1;
   return 0;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of each sub-style
    plus maxeatom doubles and 6*maxvatom doubles counted for this style
 ------------------------------------------------------------------------- */
 
 double PairHybrid::memory_usage()
 {
   double total = maxeatom * sizeof(double);
   total += maxvatom*6 * sizeof(double);
 
   int m = 0;
   while (m < nstyles) {
     total += styles[m]->memory_usage();
     m++;
   }
 
   return total;
 }
diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index 749bff7f3..441ecbfcd 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -1,1149 +1,1149 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "mpi.h"
 #include "string.h"
 #include "stdlib.h"
 #include "dirent.h"
 #include "read_restart.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "domain.h"
 #include "comm.h"
 #include "irregular.h"
 #include "update.h"
 #include "modify.h"
 #include "fix.h"
 #include "fix_read_restart.h"
 #include "group.h"
 #include "force.h"
 #include "pair.h"
 #include "bond.h"
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
 #include "special.h"
 #include "universe.h"
 #include "mpiio.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 // same as write_restart.cpp
 // NOTE(review): write_restart.cpp is not visible from here,
 //   keep these values in sync with it when changing either file
 
 #define MAGIC_STRING "LammpS RestartT"
 #define ENDIAN 0x0001
 #define ENDIANSWAP 0x1000
 #define VERSION_NUMERIC 0
 
 // integer flags that tag the fields and sections read from a restart file
 // header() dispatches on flags such as VERSION, SMALLINT, UNITS, ...
 // command() expects PROCSPERFILE and PERPROC in the per-atom section
 
 enum{VERSION,SMALLINT,TAGINT,BIGINT,
      UNITS,NTIMESTEP,DIMENSION,NPROCS,PROCGRID,
      NEWTON_PAIR,NEWTON_BOND,
      XPERIODIC,YPERIODIC,ZPERIODIC,BOUNDARY,
      ATOM_STYLE,NATOMS,NTYPES,
      NBONDS,NBONDTYPES,BOND_PER_ATOM,
      NANGLES,NANGLETYPES,ANGLE_PER_ATOM,
      NDIHEDRALS,NDIHEDRALTYPES,DIHEDRAL_PER_ATOM,
      NIMPROPERS,NIMPROPERTYPES,IMPROPER_PER_ATOM,
      TRICLINIC,BOXLO,BOXHI,XY,XZ,YZ,
      SPECIAL_LJ,SPECIAL_COUL,
      MASS,PAIR,BOND,ANGLE,DIHEDRAL,IMPROPER,
      MULTIPROC,MPIIO,PROCSPERFILE,PERPROC,
      IMAGEINT};
 
 // factor applied to natoms/nprocs in command() when sizing the
 //   initial per-proc atom arrays on more than one proc
 
 #define LB_FACTOR 1.1
 
 /* ---------------------------------------------------------------------- */
 
 // constructor only forwards the LAMMPS instance to the Pointers base class
 
 ReadRestart::ReadRestart(LAMMPS *lmp) :
   Pointers(lmp)
 {
 }
 
 /* ----------------------------------------------------------------------
    read_restart command
    arg[0] = restart filename
      "*" in the name = search dir for the latest matching file
      "%" in the name = multiproc files, ".mpiio" in the name = MPI-IO file
    optional arg[1] = "remap" = remap atoms back into the box
    reads header, type arrays, force fields, fix info, then per-atom data
    error messages with a filename are formatted with a size bound
      so an over-long filename cannot overrun the message buffer
 ------------------------------------------------------------------------- */
 
 void ReadRestart::command(int narg, char **arg)
 {
   if (narg != 1 && narg != 2) error->all(FLERR,"Illegal read_restart command");
 
   if (domain->box_exist)
     error->all(FLERR,"Cannot read_restart after simulation box is defined");
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
   // check for remap option
 
   int remapflag = 0;
   if (narg == 2) {
     if (strcmp(arg[1],"remap") == 0) remapflag = 1;
     else error->all(FLERR,"Illegal read_restart command");
   }
 
   // if filename contains "*", search dir for latest restart file
   // proc 0 does the search, resulting name is bcast to all procs
 
   char *file = new char[strlen(arg[0]) + 16];
   if (strchr(arg[0],'*')) {
     int n;
     if (me == 0) {
       file_search(arg[0],file);
       n = strlen(file) + 1;
     }
     MPI_Bcast(&n,1,MPI_INT,0,world);
     MPI_Bcast(file,n,MPI_CHAR,0,world);
   } else strcpy(file,arg[0]);
 
   // check for multiproc files and an MPI-IO filename
 
   if (strchr(arg[0],'%')) multiproc = 1;
   else multiproc = 0;
   if (strstr(arg[0],".mpiio")) mpiioflag = 1;
   else mpiioflag = 0;
 
   if (multiproc && mpiioflag) 
     error->all(FLERR,
                "Read restart MPI-IO input not allowed with % in filename");
 
   if (mpiioflag) {
     mpiio = new RestartMPIIO(lmp);
     if (!mpiio->mpiio_exists) 
       error->all(FLERR,"Reading from MPI-IO filename when "
                  "MPIIO package is not installed");
   }
 
   // open single restart file or base file for multiproc case
   // base file name = filename with "%" replaced by "base"
 
   if (me == 0) {
     if (screen) fprintf(screen,"Reading restart file ...\n");
     char *hfile;
     if (multiproc) {
       hfile = new char[strlen(file) + 16];
       char *ptr = strchr(file,'%');
       *ptr = '\0';
       sprintf(hfile,"%s%s%s",file,"base",ptr+1);
       *ptr = '%';
     } else hfile = file;
     fp = fopen(hfile,"rb");
     if (fp == NULL) {
       char str[128];
       snprintf(str,sizeof(str),"Cannot open restart file %s",hfile);
       error->one(FLERR,str);
     }
     if (multiproc) delete [] hfile;
   }
 
   // read magic string, endian flag, numeric version
 
   magic_string();
   endian();
   int incompatible = version_numeric();
 
   // read header info which creates simulation box
 
   header(incompatible);
   domain->box_exist = 1;
 
   // problem setup using info from header
   // n = initial per-proc atom count used to size the atom arrays
 
   int n;
   if (nprocs == 1) n = static_cast<int> (atom->natoms);
   else n = static_cast<int> (LB_FACTOR * atom->natoms / nprocs);
 
   atom->allocate_type_arrays();
   atom->deallocate_topology();
   atom->avec->grow(n);
   n = atom->nmax;
 
   domain->print_box("  ");
   domain->set_initial_box(0);
   domain->set_global_box();
   comm->set_proc_grid();
   domain->set_local_box();
 
   // read groups, ntype-length arrays, force field, fix info from file
   // nextra = max # of extra quantities stored with each atom
 
   group->read_restart(fp);
   type_arrays();
   force_fields();
 
   int nextra = modify->read_restart(fp);
   atom->nextra_store = nextra;
   memory->create(atom->extra,n,nextra,"atom:extra");
 
   // read file layout info
 
   file_layout();
 
   // close header file if in multiproc mode
 
   if (multiproc && me == 0) fclose(fp);
 
   // read per-proc info
 
   AtomVec *avec = atom->avec;
 
   int maxbuf = 0;
   double *buf = NULL;
   int m,flag;
 
   // MPI-IO input from single file
 
   if (mpiioflag) {
     mpiio->openForRead(file);
     memory->create(buf,assignedChunkSize,"read_restart:buf");
     mpiio->read((headerOffset+assignedChunkOffset),assignedChunkSize,buf);
     mpiio->close();
 
     m = 0;
     while (m < assignedChunkSize) m += avec->unpack_restart(&buf[m]);
   }
 
   // input of single native file
   // nprocs_file = # of chunks in file
   // proc 0 reads a chunk and bcasts it to other procs
   // each proc unpacks the atoms, saving ones in it's sub-domain
   // if remapflag set, remap the atom to box before checking sub-domain
   // check for atom in sub-domain differs for orthogonal vs triclinic box
 
   else if (multiproc == 0) {
 
     int triclinic = domain->triclinic;
     imageint *iptr;
     double *x,lamda[3];
     double *coord,*sublo,*subhi;
     if (triclinic == 0) {
       sublo = domain->sublo;
       subhi = domain->subhi;
     } else {
       sublo = domain->sublo_lamda;
       subhi = domain->subhi_lamda;
     }
 
     for (int iproc = 0; iproc < nprocs_file; iproc++) {
       if (read_int() != PERPROC) 
         error->all(FLERR,"Invalid flag in peratom section of restart file");
 
       n = read_int();
       if (n > maxbuf) {
         maxbuf = n;
         memory->destroy(buf);
         memory->create(buf,maxbuf,"read_restart:buf");
       }
       read_double_vec(n,buf);
 
       // buf[m] = # of values for this atom, coords start at buf[m+1]
       // image flags are read from buf[m+7] when remapping
 
       m = 0;
       while (m < n) {
         x = &buf[m+1];
         if (remapflag) {
           iptr = (imageint *) &buf[m+7];
           domain->remap(x,*iptr);
         }
 
         if (triclinic) {
           domain->x2lamda(x,lamda);
           coord = lamda;
         } else coord = x;
 
         if (coord[0] >= sublo[0] && coord[0] < subhi[0] &&
             coord[1] >= sublo[1] && coord[1] < subhi[1] &&
             coord[2] >= sublo[2] && coord[2] < subhi[2]) {
           m += avec->unpack_restart(&buf[m]);
         } else m += static_cast<int> (buf[m]);
       }
     }
 
     if (me == 0) fclose(fp);
   }
 
   // input of multiple native files with procs <= files
   // # of files = multiproc_file
   // each proc reads a subset of files, striding by nprocs
   // each proc keeps all atoms in all perproc chunks in its files
 
   else if (nprocs <= multiproc_file) {
 
     char *procfile = new char[strlen(file) + 16];
     char *ptr = strchr(file,'%');
 
     for (int iproc = me; iproc < multiproc_file; iproc += nprocs) {
       *ptr = '\0';
       sprintf(procfile,"%s%d%s",file,iproc,ptr+1);
       *ptr = '%';
       fp = fopen(procfile,"rb");
       if (fp == NULL) {
         char str[128];
         snprintf(str,sizeof(str),"Cannot open restart file %s",procfile);
         error->one(FLERR,str);
       }
 
       fread(&flag,sizeof(int),1,fp);
       if (flag != PROCSPERFILE) 
         error->one(FLERR,"Invalid flag in peratom section of restart file");
       int procsperfile;
       fread(&procsperfile,sizeof(int),1,fp);
 
       for (int i = 0; i < procsperfile; i++) {
         fread(&flag,sizeof(int),1,fp);
         if (flag != PERPROC) 
           error->one(FLERR,"Invalid flag in peratom section of restart file");
         
         fread(&n,sizeof(int),1,fp);
         if (n > maxbuf) {
           maxbuf = n;
           memory->destroy(buf);
           memory->create(buf,maxbuf,"read_restart:buf");
         }
         fread(buf,sizeof(double),n,fp);
 
         m = 0;
         while (m < n) m += avec->unpack_restart(&buf[m]);
       }
 
       fclose(fp);
     }
 
     delete [] procfile;
   }
 
   // input of multiple native files with procs > files
   // # of files = multiproc_file
   // cluster procs based on # of files
   // 1st proc in each cluster reads per-proc chunks from file
   // sends chunks round-robin to other procs in its cluster
   // each proc keeps all atoms in its perproc chunks in file
 
   else {
 
     // nclusterprocs = # of procs in my cluster that read from one file
     // filereader = 1 if this proc reads file, else 0
     // fileproc = ID of proc in my cluster who reads from file
     // clustercomm = MPI communicator within my cluster of procs
 
     int nfile = multiproc_file;
     int icluster = static_cast<int> ((bigint) me * nfile/nprocs);
     int fileproc = static_cast<int> ((bigint) icluster * nprocs/nfile);
     int fcluster = static_cast<int> ((bigint) fileproc * nfile/nprocs);
     if (fcluster < icluster) fileproc++;
     int fileprocnext = 
       static_cast<int> ((bigint) (icluster+1) * nprocs/nfile);
     fcluster = static_cast<int> ((bigint) fileprocnext * nfile/nprocs);
     if (fcluster < icluster+1) fileprocnext++;
     int nclusterprocs = fileprocnext - fileproc;
     int filereader = 0;
     if (me == fileproc) filereader = 1;
     MPI_Comm clustercomm;
     MPI_Comm_split(world,icluster,0,&clustercomm);
 
     if (filereader) {
       char *procfile = new char[strlen(file) + 16];
       char *ptr = strchr(file,'%');
       *ptr = '\0';
       sprintf(procfile,"%s%d%s",file,icluster,ptr+1);
       *ptr = '%';
       fp = fopen(procfile,"rb");
       if (fp == NULL) {
         char str[128];
         snprintf(str,sizeof(str),"Cannot open restart file %s",procfile);
         error->one(FLERR,str);
       }
       delete [] procfile;
     }
 
     int flag,procsperfile;
 
     if (filereader) {
       fread(&flag,sizeof(int),1,fp);
       if (flag != PROCSPERFILE) 
         error->one(FLERR,"Invalid flag in peratom section of restart file");
       fread(&procsperfile,sizeof(int),1,fp);
     }
     MPI_Bcast(&procsperfile,1,MPI_INT,0,clustercomm);
 
     int tmp,iproc;
     MPI_Status status;
     MPI_Request request;
 
     // chunk i goes to the proc at offset (i % nclusterprocs) in the cluster
     // offset 0 = the reader itself, which keeps the chunk
     // receiver posts its Irecv before the zero-length handshake message,
     //   so the reader's ready-mode send finds a matching receive
 
     for (int i = 0; i < procsperfile; i++) {
       if (filereader) {
         fread(&flag,sizeof(int),1,fp);
         if (flag != PERPROC) 
           error->one(FLERR,"Invalid flag in peratom section of restart file");
 
         fread(&n,sizeof(int),1,fp);
         if (n > maxbuf) {
           maxbuf = n;
           memory->destroy(buf);
           memory->create(buf,maxbuf,"read_restart:buf");
         }
         fread(buf,sizeof(double),n,fp);
 
         if (i % nclusterprocs) {
           iproc = me + (i % nclusterprocs);
           MPI_Send(&n,1,MPI_INT,iproc,0,world);
           MPI_Recv(&tmp,0,MPI_INT,iproc,0,world,&status);
           MPI_Rsend(buf,n,MPI_DOUBLE,iproc,0,world);
         }
 
       } else if (i % nclusterprocs == me - fileproc) {
         MPI_Recv(&n,1,MPI_INT,fileproc,0,world,&status);
         if (n > maxbuf) {
           maxbuf = n;
           memory->destroy(buf);
           memory->create(buf,maxbuf,"read_restart:buf");
         }
         MPI_Irecv(buf,n,MPI_DOUBLE,fileproc,0,world,&request);
         MPI_Send(&tmp,0,MPI_INT,fileproc,0,world);
         MPI_Wait(&request,&status);
       }
 
       if (i % nclusterprocs == me - fileproc) {
         m = 0;
         while (m < n) m += avec->unpack_restart(&buf[m]);
       }
     }
 
     if (filereader) fclose(fp);
     MPI_Comm_free(&clustercomm);
   }
 
   // clean-up memory
 
   delete [] file;
   memory->destroy(buf);
 
   // for multiproc or MPI-IO files:
   // perform irregular comm to migrate atoms to correct procs
 
   if (multiproc || mpiioflag) {
 
     // if remapflag set, remap all atoms I read back to box before migrating
 
     if (remapflag) {
       double **x = atom->x;
       imageint *image = atom->image;
       int nlocal = atom->nlocal;
 
       for (int i = 0; i < nlocal; i++)
         domain->remap(x[i],image[i]);
     }
 
     // create a temporary fix to hold and migrate extra atom info
     // necessary b/c irregular will migrate atoms
     // arg buffers are large enough for any int value
 
     if (nextra) {
       char cextra[16],fixextra[16];
       snprintf(cextra,sizeof(cextra),"%d",nextra);
       snprintf(fixextra,sizeof(fixextra),"%d",modify->nfix_restart_peratom);
       char **newarg = new char*[5];
       newarg[0] = (char *) "_read_restart";
       newarg[1] = (char *) "all";
       newarg[2] = (char *) "READ_RESTART";
       newarg[3] = cextra;
       newarg[4] = fixextra;
       modify->add_fix(5,newarg);
       delete [] newarg;
     }
 
     // move atoms to new processors via irregular()
     // turn sorting on in migrate_atoms() to avoid non-reproducible restarts
     // in case read by different proc than wrote restart file
     // first do map_init() since irregular->migrate_atoms() will do map_clear()
 
     if (atom->map_style) atom->map_init();
     if (domain->triclinic) domain->x2lamda(atom->nlocal);
     Irregular *irregular = new Irregular(lmp);
     irregular->migrate_atoms(1);
     delete irregular;
     if (domain->triclinic) domain->lamda2x(atom->nlocal);
 
     // put extra atom info held by fix back into atom->extra
     // destroy temporary fix
 
     if (nextra) {
       memory->destroy(atom->extra);
       memory->create(atom->extra,atom->nmax,nextra,"atom:extra");
       int ifix = modify->find_fix("_read_restart");
       FixReadRestart *fix = (FixReadRestart *) modify->fix[ifix];
       int *count = fix->count;
       double **extra = fix->extra;
       double **atom_extra = atom->extra;
       int nlocal = atom->nlocal;
       for (int i = 0; i < nlocal; i++)
         for (int j = 0; j < count[i]; j++)
           atom_extra[i][j] = extra[i][j];
       modify->delete_fix("_read_restart");
     }
   }
 
   // check that all atoms were assigned to procs
 
   bigint natoms;
   bigint nblocal = atom->nlocal;
   MPI_Allreduce(&nblocal,&natoms,1,MPI_LMP_BIGINT,MPI_SUM,world);
 
   if (me == 0) {
     if (screen) fprintf(screen,"  " BIGINT_FORMAT " atoms\n",natoms);
     if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " atoms\n",natoms);
   }
 
   if (natoms != atom->natoms)
     error->all(FLERR,"Did not assign all restart atoms correctly");
 
   if (me == 0) {
     if (atom->nbonds) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " bonds\n",atom->nbonds);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " bonds\n",atom->nbonds);
     }
     if (atom->nangles) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " angles\n",
                           atom->nangles);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " angles\n",
                            atom->nangles);
     }
     if (atom->ndihedrals) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " dihedrals\n",
                           atom->ndihedrals);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " dihedrals\n",
                            atom->ndihedrals);
     }
     if (atom->nimpropers) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " impropers\n",
                           atom->nimpropers);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " impropers\n",
                            atom->nimpropers);
     }
   }
 
   // check that atom IDs are valid
 
   atom->tag_check();
 
   // create global mapping of atoms
 
   if (atom->map_style) {
     atom->map_init();
     atom->map_set();
   }
 
   // create special bond lists for molecular systems
 
   if (atom->molecular == 1) {
     Special special(lmp);
     special.build();
   }
 }
 
 /* ----------------------------------------------------------------------
    infile contains a "*"
    search for all files which match the infile pattern
    replace "*" with latest timestep value to create outfile name
    search dir referenced by initial pathname of file
    if infile also contains "%", use "base" when searching directory
    only called by proc 0
 ------------------------------------------------------------------------- */
 
 void ReadRestart::file_search(char *infile, char *outfile)
 {
   char *ptr;
 
   // separate infile into dir + filename
 
   char *dirname = new char[strlen(infile) + 1];
   char *filename = new char[strlen(infile) + 1];
 
   if (strchr(infile,'/')) {
     ptr = strrchr(infile,'/');
     *ptr = '\0';
     strcpy(dirname,infile);
     strcpy(filename,ptr+1);
     *ptr = '/';
   } else {
     strcpy(dirname,"./");
     strcpy(filename,infile);
   }
 
   // if filename contains "%" replace "%" with "base"
 
   char *pattern = new char[strlen(filename) + 16];
 
   if ((ptr = strchr(filename,'%'))) {
     *ptr = '\0';
     sprintf(pattern,"%s%s%s",filename,"base",ptr+1);
     *ptr = '%';
   } else strcpy(pattern,filename);
 
   // scan all files in directory, searching for files that match pattern
   // maxnum = largest int that matches "*"
 
   int n = strlen(pattern) + 16;
   char *begin = new char[n];
   char *middle = new char[n];
   char *end = new char[n];
 
   ptr = strchr(pattern,'*');
   *ptr = '\0';
   strcpy(begin,pattern);
   strcpy(end,ptr+1);
   int nbegin = strlen(begin);
   bigint maxnum = -1;
 
   struct dirent *ep;
   DIR *dp = opendir(dirname);
   if (dp == NULL)
     error->one(FLERR,"Cannot open dir to search for restart file");
   while ((ep = readdir(dp))) {
     if (strstr(ep->d_name,begin) != ep->d_name) continue;
     if ((ptr = strstr(&ep->d_name[nbegin],end)) == NULL) continue;
     if (strlen(end) == 0) ptr = ep->d_name + strlen(ep->d_name);
     *ptr = '\0';
     if (strlen(&ep->d_name[nbegin]) < n) {
       strcpy(middle,&ep->d_name[nbegin]);
       if (ATOBIGINT(middle) > maxnum) maxnum = ATOBIGINT(middle);
     }
   }
   closedir(dp);
   if (maxnum < 0) error->one(FLERR,"Found no restart file matching pattern");
 
   // create outfile with maxint substituted for "*"
   // use original infile, not pattern, since need to retain "%" in filename
 
   ptr = strchr(infile,'*');
   *ptr = '\0';
   sprintf(outfile,"%s" BIGINT_FORMAT "%s",infile,maxnum,ptr+1);
   *ptr = '*';
 
   // clean up
 
   delete [] dirname;
   delete [] filename;
   delete [] pattern;
   delete [] begin;
   delete [] middle;
   delete [] end;
 }
 
 /* ----------------------------------------------------------------------
    read header of restart file
    processes flagged records until a negative flag is read
    each recognized flag either stores its value in update, domain, comm,
      force or atom, or is only compared against the current setting
    incompatible = if non-zero, error out once the VERSION record
      has been read and printed
    an unrecognized flag is a fatal error
 ------------------------------------------------------------------------- */
 
 void ReadRestart::header(int incompatible)
 {
   // periodicity flags are held here when the X/Y/ZPERIODIC records are read
   // and are only stored into domain when the BOUNDARY record is processed
   // NOTE(review): they stay unset if BOUNDARY arrives before them -
   //   presumably the writer always emits them first, confirm
 
   int xperiodic,yperiodic,zperiodic;
 
   // read flags and fields until flag = -1
 
   int flag = read_int();
   while (flag >= 0) {
 
     // check restart file version, warn if different
     // the version string itself is only printed,
     //   the incompatible argument alone decides whether this is fatal
 
     if (flag == VERSION) {
       char *version = read_string();
       if (me == 0) {
         if (screen) fprintf(screen,"  restart file = %s, LAMMPS = %s\n",
                             version,universe->version);
       }
       if (incompatible) 
         error->all(FLERR,"Restart file incompatible with current version");
       delete [] version;
 
     // check lmptype.h sizes, error if different
 
     } else if (flag == SMALLINT) {
       int size = read_int();
       if (size != sizeof(smallint))
         error->all(FLERR,"Smallint setting in lmptype.h is not compatible");
     } else if (flag == IMAGEINT) {
       int size = read_int();
       if (size != sizeof(imageint))
         error->all(FLERR,"Imageint setting in lmptype.h is not compatible");
     } else if (flag == TAGINT) {
       int size = read_int();
       if (size != sizeof(tagint))
         error->all(FLERR,"Tagint setting in lmptype.h is not compatible");
     } else if (flag == BIGINT) {
       int size = read_int();
       if (size != sizeof(bigint))
         error->all(FLERR,"Bigint setting in lmptype.h is not compatible");
 
     // reset unit_style only if different
     // so that timestep,neighbor-skin are not changed
 
     } else if (flag == UNITS) {
       char *style = read_string();
       if (strcmp(style,update->unit_style) != 0) update->set_units(style);
       delete [] style;
 
     } else if (flag == NTIMESTEP) {
       update->ntimestep = read_bigint();
 
     // set dimension from restart file
     // the 2d check uses the zperiodic value domain holds at this point
 
     } else if (flag == DIMENSION) {
       int dimension = read_int();
       domain->dimension = dimension;
       if (domain->dimension == 2 && domain->zperiodic == 0)
         error->all(FLERR,
                    "Cannot run 2d simulation with nonperiodic Z dimension");
 
     // read nprocs from restart file, warn if different
 
     } else if (flag == NPROCS) {
       nprocs_file = read_int();
       if (nprocs_file != comm->nprocs && me == 0)
         error->warning(FLERR,"Restart file used different # of processors");
 
     // don't set procgrid, warn if different
     // the first int of the record is read and discarded
     //   presumably the vector length stored by the writer - confirm
     //   the other vector records below follow the same pattern
 
     } else if (flag == PROCGRID) {
       int procgrid[3];
       read_int();
       read_int_vec(3,procgrid);
       if (comm->user_procgrid[0] != 0 &&
           (procgrid[0] != comm->user_procgrid[0] || 
            procgrid[1] != comm->user_procgrid[1] ||
            procgrid[2] != comm->user_procgrid[2]) && me == 0)
         error->warning(FLERR,"Restart file used different 3d processor grid");
 
     // don't set newton_pair, leave input script value unchanged
     // set newton_bond from restart file
     // warn if different and input script settings are not default
 
     } else if (flag == NEWTON_PAIR) {
       int newton_pair_file = read_int();
       if (force->newton_pair != 1) {
         if (newton_pair_file != force->newton_pair && me == 0)
           error->warning(FLERR,
                          "Restart file used different newton pair setting, "
                          "using input script value");
       }
     } else if (flag == NEWTON_BOND) {
       int newton_bond_file = read_int();
       if (force->newton_bond != 1) {
         if (newton_bond_file != force->newton_bond && me == 0)
           error->warning(FLERR,
                          "Restart file used different newton bond setting, "
                          "using restart file value");
       }
       force->newton_bond = newton_bond_file;
       if (force->newton_pair || force->newton_bond) force->newton = 1;
       else force->newton = 0;
 
       // set boundary settings from restart file
       // warn if different and input script settings are not default
 
     } else if (flag == XPERIODIC) {
       xperiodic = read_int();
     } else if (flag == YPERIODIC) {
       yperiodic = read_int();
     } else if (flag == ZPERIODIC) {
       zperiodic = read_int();
     } else if (flag == BOUNDARY) {
       int boundary[3][2];
       read_int();
       read_int_vec(6,&boundary[0][0]);
 
       // only compare if at least one current boundary value is non-zero
 
       if (domain->boundary[0][0] || domain->boundary[0][1] ||
           domain->boundary[1][0] || domain->boundary[1][1] ||
           domain->boundary[2][0] || domain->boundary[2][1]) {
         if (boundary[0][0] != domain->boundary[0][0] ||
             boundary[0][1] != domain->boundary[0][1] ||
             boundary[1][0] != domain->boundary[1][0] ||
             boundary[1][1] != domain->boundary[1][1] ||
             boundary[2][0] != domain->boundary[2][0] ||
             boundary[2][1] != domain->boundary[2][1]) {
           if (me == 0)
             error->warning(FLERR,
                            "Restart file used different boundary settings, "
                            "using restart file values");
         }
       }
 
       domain->boundary[0][0] = boundary[0][0];
       domain->boundary[0][1] = boundary[0][1];
       domain->boundary[1][0] = boundary[1][0];
       domain->boundary[1][1] = boundary[1][1];
       domain->boundary[2][0] = boundary[2][0];
       domain->boundary[2][1] = boundary[2][1];
 
       domain->periodicity[0] = domain->xperiodic = xperiodic;
       domain->periodicity[1] = domain->yperiodic = yperiodic;
       domain->periodicity[2] = domain->zperiodic = zperiodic;
 
       // nonperiodic = 0 if all dims are periodic, 1 if any dim is not,
       //   2 if in addition any boundary value is >= 2
 
       domain->nonperiodic = 0;
       if (xperiodic == 0 || yperiodic == 0 || zperiodic == 0) {
         domain->nonperiodic = 1;
         if (boundary[0][0] >= 2 || boundary[0][1] >= 2 ||
             boundary[1][0] >= 2 || boundary[1][1] >= 2 ||
             boundary[2][0] >= 2 || boundary[2][1] >= 2)
           domain->nonperiodic = 2;
       }
 
     // create new AtomVec class using any stored args
     // style name and arg strings are freed once the AtomVec exists
 
     } else if (flag == ATOM_STYLE) {
       char *style = read_string();
       int nargcopy = read_int();
       char **argcopy = new char*[nargcopy];
       for (int i = 0; i < nargcopy; i++)
         argcopy[i] = read_string();
-      atom->create_avec(style,nargcopy,argcopy);
+      atom->create_avec(style,nargcopy,argcopy,0);
       for (int i = 0; i < nargcopy; i++) delete [] argcopy[i];
       delete [] argcopy;
       delete [] style;
 
     // global counts: totals are bigints, type and per-atom counts are ints
 
     } else if (flag == NATOMS) {
       atom->natoms = read_bigint();
     } else if (flag == NTYPES) {
       atom->ntypes = read_int();
     } else if (flag == NBONDS) {
       atom->nbonds = read_bigint();
     } else if (flag == NBONDTYPES) {
       atom->nbondtypes = read_int();
     } else if (flag == BOND_PER_ATOM) {
       atom->bond_per_atom = read_int();
     } else if (flag == NANGLES) {
       atom->nangles = read_bigint();
     } else if (flag == NANGLETYPES) {
       atom->nangletypes = read_int();
     } else if (flag == ANGLE_PER_ATOM) {
       atom->angle_per_atom = read_int();
     } else if (flag == NDIHEDRALS) {
       atom->ndihedrals = read_bigint();
     } else if (flag == NDIHEDRALTYPES) {
       atom->ndihedraltypes = read_int();
     } else if (flag == DIHEDRAL_PER_ATOM) {
       atom->dihedral_per_atom = read_int();
     } else if (flag == NIMPROPERS) {
       atom->nimpropers = read_bigint();
     } else if (flag == NIMPROPERTYPES) {
       atom->nimpropertypes = read_int();
     } else if (flag == IMPROPER_PER_ATOM) {
       atom->improper_per_atom = read_int();
 
     // simulation box geometry
 
     } else if (flag == TRICLINIC) {
       domain->triclinic = read_int();
     } else if (flag == BOXLO) {
       read_int();
       read_double_vec(3,domain->boxlo);
     } else if (flag == BOXHI) {
       read_int();
       read_double_vec(3,domain->boxhi);
     } else if (flag == XY) {
       domain->xy = read_double();
     } else if (flag == XZ) {
       domain->xz = read_double();
     } else if (flag == YZ) {
       domain->yz = read_double();
 
     // the 3 stored values fill elements 1,2,3, element 0 is not touched
 
     } else if (flag == SPECIAL_LJ) {
       read_int();
       read_double_vec(3,&force->special_lj[1]);
     } else if (flag == SPECIAL_COUL) {
       read_int();
       read_double_vec(3,&force->special_coul[1]);
 
     } else error->all(FLERR,"Invalid flag in header section of restart file");
 
     flag = read_int();
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ReadRestart::type_arrays()
 {
   int flag = read_int();
   while (flag >= 0) {
 
     if (flag == MASS) {
       read_int();
       double *mass = new double[atom->ntypes+1];
       read_double_vec(atom->ntypes,&mass[1]);
       atom->set_mass(mass);
       delete [] mass;
 
     } else error->all(FLERR,
                       "Invalid flag in type arrays section of restart file");
 
     flag = read_int();
   }
 }
 
 /* ----------------------------------------------------------------------
    read force field section of restart file
    processes flagged records until a negative flag is read
    PAIR,BOND,ANGLE,DIHEDRAL,IMPROPER = read the style name,
      create that style via the matching force->create_*() call,
      then let the new style read its own data from fp via read_restart()
    any other flag is a fatal error
 ------------------------------------------------------------------------- */
 
 void ReadRestart::force_fields()
 {
   // style name of the current record
   // only needed to create the style, so freed before read_restart()
 
   char *style;
 
   int flag = read_int();
   while (flag >= 0) {
 
     if (flag == PAIR) {
       style = read_string();
-      force->create_pair(style);
+      force->create_pair(style,0);
       delete [] style;
       force->pair->read_restart(fp);
 
     } else if (flag == BOND) {
       style = read_string();
-      force->create_bond(style);
+      force->create_bond(style,0);
       delete [] style;
       force->bond->read_restart(fp);
 
     } else if (flag == ANGLE) {
       style = read_string();
-      force->create_angle(style);
+      force->create_angle(style,0);
       delete [] style;
       force->angle->read_restart(fp);
 
     } else if (flag == DIHEDRAL) {
       style = read_string();
-      force->create_dihedral(style);
+      force->create_dihedral(style,0);
       delete [] style;
       force->dihedral->read_restart(fp);
 
     } else if (flag == IMPROPER) {
       style = read_string();
-      force->create_improper(style);
+      force->create_improper(style,0);
       delete [] style;
       force->improper->read_restart(fp);
 
     } else error->all(FLERR,
                       "Invalid flag in force field section of restart file");
 
     flag = read_int();
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ReadRestart::file_layout()
 {
   int flag = read_int();
   while (flag >= 0) {
 
     if (flag == MULTIPROC) {
       multiproc_file = read_int();
       if (multiproc == 0 && multiproc_file)
         error->all(FLERR,"Restart file is not a multi-proc file");
       if (multiproc && multiproc_file == 0)
         error->all(FLERR,"Restart file is a multi-proc file");
 
     } else if (flag == MPIIO) {
       int mpiioflag_file = read_int();
       if (mpiioflag == 0 && mpiioflag_file)
         error->all(FLERR,"Restart file is a MPI-IO file");
       if (mpiioflag && mpiioflag_file == 0)
         error->all(FLERR,"Restart file is not a MPI-IO file");
 
       if (mpiioflag) { 
         bigint *nproc_chunk_offsets;
         memory->create(nproc_chunk_offsets,nprocs,
                        "write_restart:nproc_chunk_offsets");
         bigint *nproc_chunk_sizes;
         memory->create(nproc_chunk_sizes,nprocs,
                        "write_restart:nproc_chunk_sizes");
 
         // on rank 0 read in the chunk sizes that were written out
         // then consolidate them and compute offsets relative to the
         // end of the header info to fit the current partition size
         // if the number of ranks that did the writing is different
 
         if (me == 0) {
           int *all_written_send_sizes;
           memory->create(all_written_send_sizes,nprocs_file,
                          "write_restart:all_written_send_sizes");
           int *nproc_chunk_number;
           memory->create(nproc_chunk_number,nprocs,
                          "write_restart:nproc_chunk_number");
           
           fread(all_written_send_sizes,sizeof(int),nprocs_file,fp);
           
           int init_chunk_number = nprocs_file/nprocs;
           int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
           
           for (int i = 0; i < nprocs; i++) {
             if (i < num_extra_chunks)
               nproc_chunk_number[i] = init_chunk_number+1;
             else
               nproc_chunk_number[i] = init_chunk_number;
           }
           
           int all_written_send_sizes_index = 0;
           bigint current_offset = 0;
           for (int i=0;i<nprocs;i++) {
             nproc_chunk_offsets[i] = current_offset;
             nproc_chunk_sizes[i] = 0;
             for (int j=0;j<nproc_chunk_number[i];j++) {
               nproc_chunk_sizes[i] += 
                 all_written_send_sizes[all_written_send_sizes_index];
               current_offset += 
                 (all_written_send_sizes[all_written_send_sizes_index] * 
                  sizeof(double));
               all_written_send_sizes_index++;
             }
           
           }
           memory->destroy(all_written_send_sizes);
           memory->destroy(nproc_chunk_number);
         }
 
         // scatter chunk sizes and offsets to all procs
         
         MPI_Scatter(nproc_chunk_sizes, 1, MPI_LMP_BIGINT,
                     &assignedChunkSize , 1, MPI_LMP_BIGINT, 0,world);
         MPI_Scatter(nproc_chunk_offsets, 1, MPI_LMP_BIGINT,
                     &assignedChunkOffset , 1, MPI_LMP_BIGINT, 0,world);
         
         memory->destroy(nproc_chunk_sizes);
         memory->destroy(nproc_chunk_offsets);
       }
     }
 
     flag = read_int();
   }
 
   // if MPI-IO file, broadcast the end of the header offste
   // this allows all ranks to compute offset to their data
 
   if (mpiioflag) {
     if (me == 0) headerOffset = ftell(fp);
     MPI_Bcast(&headerOffset,1,MPI_LMP_BIGINT,0,world);
   }
 }
 
 // ----------------------------------------------------------------------
 // ----------------------------------------------------------------------
 // low-level fread methods
 // ----------------------------------------------------------------------
 // ----------------------------------------------------------------------
 
 /* ----------------------------------------------------------------------
    read the leading magic string of the restart file and bcast it
    error if fewer chars than expected could be read
      or if the chars differ from MAGIC_STRING
 ------------------------------------------------------------------------- */
 
 void ReadRestart::magic_string()
 {
   // expected length includes the terminating char
 
   const int nchars = strlen(MAGIC_STRING) + 1;
   char *magic = new char[nchars];
 
   // proc 0 reads, then every proc learns how many chars were obtained
 
   int nread;
   if (me == 0) nread = fread(magic,sizeof(char),nchars,fp);
   MPI_Bcast(&nread,1,MPI_INT,0,world);
   if (nread < nchars) {
     error->all(FLERR,"Invalid LAMMPS restart file");
   }
 
   MPI_Bcast(magic,nchars,MPI_CHAR,0,world);
   const bool matches = (strcmp(magic,MAGIC_STRING) == 0);
   if (!matches) {
     error->all(FLERR,"Invalid LAMMPS restart file");
   }
   delete [] magic;
 }
 
 /* ----------------------------------------------------------------------
 ------------------------------------------------------------------------- */
 
 void ReadRestart::endian()
 {
   int endian;
   if (me == 0) fread(&endian,sizeof(int),1,fp);
   MPI_Bcast(&endian,1,MPI_INT,0,world);
   if (endian == ENDIAN) return;
   if (endian == ENDIANSWAP)
     error->all(FLERR,"Restart file byte ordering is swapped");
   else error->all(FLERR,"Restart file byte ordering is not recognized");
 }
 
 /* ----------------------------------------------------------------------
 ------------------------------------------------------------------------- */
 
 int ReadRestart::version_numeric()
 {
   int vn;
   if (me == 0) fread(&vn,sizeof(int),1,fp);
   MPI_Bcast(&vn,1,MPI_INT,0,world);
   if (vn != VERSION_NUMERIC) return 1;
   return 0;
 }
 
 /* ----------------------------------------------------------------------
    read an int from restart file and bcast it
 ------------------------------------------------------------------------- */
 
 int ReadRestart::read_int()
 {
   int value;
   if (me == 0) fread(&value,sizeof(int),1,fp);
   MPI_Bcast(&value,1,MPI_INT,0,world);
   return value;
 }
 
 /* ----------------------------------------------------------------------
    read a bigint from restart file and bcast it
    only proc 0 reads from fp, all procs return the same value
    a short read on proc 0 is a fatal error,
      so that an unset value is never broadcast
 ------------------------------------------------------------------------- */
 
 bigint ReadRestart::read_bigint()
 {
   bigint value = 0;
   if (me == 0 && fread(&value,sizeof(bigint),1,fp) != 1)
     error->one(FLERR,"Unexpected end of restart file or read error");
   MPI_Bcast(&value,1,MPI_LMP_BIGINT,0,world);
   return value;
 }
 
 /* ----------------------------------------------------------------------
    read a double from restart file and bcast it
 ------------------------------------------------------------------------- */
 
 double ReadRestart::read_double()
 {
   double value;
   if (me == 0) fread(&value,sizeof(double),1,fp);
   MPI_Bcast(&value,1,MPI_DOUBLE,0,world);
   return value;
 }
 
 /* ----------------------------------------------------------------------
    read a char string (including NULL) and bcast it
    str is allocated here, ptr is returned, caller must deallocate
 ------------------------------------------------------------------------- */
 
 char *ReadRestart::read_string()
 {
   int n;
   if (me == 0) fread(&n,sizeof(int),1,fp);
   MPI_Bcast(&n,1,MPI_INT,0,world);
   char *value = new char[n];
   if (me == 0) fread(value,sizeof(char),n,fp);
   MPI_Bcast(value,n,MPI_CHAR,0,world);
   return value;
 }
 
 /* ----------------------------------------------------------------------
    read vector of N ints from restart file and bcast them
    do not bcast them, caller does that if required
 ------------------------------------------------------------------------- */
 
 void ReadRestart::read_int_vec(int n, int *vec)
 {
   if (me == 0) fread(vec,sizeof(int),n,fp);
   MPI_Bcast(vec,n,MPI_INT,0,world);
 }
 
 /* ----------------------------------------------------------------------
    read vector of N doubles from restart file and bcast them
    do not bcast them, caller does that if required
 ------------------------------------------------------------------------- */
 
 void ReadRestart::read_double_vec(int n, double *vec)
 {
   if (me == 0) fread(vec,sizeof(double),n,fp);
   MPI_Bcast(vec,n,MPI_DOUBLE,0,world);
 }
diff --git a/src/replicate.cpp b/src/replicate.cpp
index 26f3fca7e..7300da096 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -1,422 +1,422 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "stdlib.h"
 #include "string.h"
 #include "replicate.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "atom_vec_hybrid.h"
 #include "force.h"
 #include "domain.h"
 #include "comm.h"
 #include "special.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define LB_FACTOR 1.1
 #define EPSILON   1.0e-6
 
 enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ----------------------------------------------------------------------
    constructor: only forwards lmp to the Pointers base class
 ------------------------------------------------------------------------- */
 
 Replicate::Replicate(LAMMPS *lmp)
   : Pointers(lmp)
 {
 }
 
 /* ----------------------------------------------------------------------
    replicate command: arg = nx ny nz (exactly 3 args)
    builds a new Atom class holding nx*ny*nz copies of the current atoms
      and enlarges the simulation box by the same factors
    each proc in turn broadcasts its packed atoms, every proc unpacks
      the replicated copies that fall inside its own sub-domain
    atom IDs, molecule IDs and topology atom IDs of the copies are offset
    errors if no box exists, if args are illegal, if fixes store
      per-atom quantities, if the new system is too big,
      or if not every replicated atom was assigned to a proc
 ------------------------------------------------------------------------- */
 
 void Replicate::command(int narg, char **arg)
 {
   int i,j,m,n;
 
   if (domain->box_exist == 0)
     error->all(FLERR,"Replicate command before simulation box is defined");
   if (narg != 3) error->all(FLERR,"Illegal replicate command");
 
   int me = comm->me;
   int nprocs = comm->nprocs;
 
   if (me == 0 && screen) fprintf(screen,"Replicating atoms ...\n");
 
   // nrep = total # of replications
   // NOTE(review): the product is formed in int before nx,ny,nz are
   //   checked below, large factors could overflow - confirm acceptable
 
   int nx = force->inumeric(FLERR,arg[0]);
   int ny = force->inumeric(FLERR,arg[1]);
   int nz = force->inumeric(FLERR,arg[2]);
   int nrep = nx*ny*nz;
 
   // error and warning checks
 
   if (nx <= 0 || ny <= 0 || nz <= 0)
     error->all(FLERR,"Illegal replicate command");
   if (domain->dimension == 2 && nz != 1)
     error->all(FLERR,"Cannot replicate 2d simulation in z dimension");
   if ((nx > 1 && domain->xperiodic == 0) ||
       (ny > 1 && domain->yperiodic == 0) ||
       (nz > 1 && domain->zperiodic == 0)) {
     if (comm->me == 0)
       error->warning(FLERR,"Replicating in a non-periodic dimension");
   }
 
   if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store)
     error->all(FLERR,"Cannot replicate with fixes that store atom quantities");
 
   // maxtag = largest atom tag across all existing atoms
 
   tagint maxtag = 0;
   if (atom->tag_enable) {
     for (i = 0; i < atom->nlocal; i++) maxtag = MAX(atom->tag[i],maxtag);
     tagint maxtag_all;
     MPI_Allreduce(&maxtag,&maxtag_all,1,MPI_LMP_TAGINT,MPI_MAX,world);
     maxtag = maxtag_all;
   }
 
   // maxmol = largest molecule tag across all existing atoms
 
   tagint maxmol = 0;
   if (atom->molecule_flag) {
     for (i = 0; i < atom->nlocal; i++) maxmol = MAX(atom->molecule[i],maxmol);
     tagint maxmol_all;
     MPI_Allreduce(&maxmol,&maxmol_all,1,MPI_LMP_TAGINT,MPI_MAX,world);
     maxmol = maxmol_all;
   }
 
   // unmap existing atoms via image flags
 
   for (i = 0; i < atom->nlocal; i++)
     domain->unmap(atom->x[i],atom->image[i]);
 
   // communication buffer for all my atom's info
   // max_size = largest buffer needed by any proc
   // must do before new Atom class created,
   //   since size_restart() uses atom->nlocal
 
   int max_size;
   int send_size = atom->avec->size_restart();
   MPI_Allreduce(&send_size,&max_size,1,MPI_INT,MPI_MAX,world);
 
   double *buf;
   memory->create(buf,max_size,"replicate:buf");
 
   // old = original atom class
   // atom = new replicated atom class
 
   Atom *old = atom;
   atom = new Atom(lmp);
   atom->settings(old);
-  atom->create_avec(old->atom_style,old->avec->nargcopy,old->avec->argcopy);
+  atom->create_avec(old->atom_style,old->avec->nargcopy,old->avec->argcopy,0);
 
   // check that new system will not be too large
   // new tags cannot exceed MAXTAGINT
   // new system sizes cannot exceed MAXBIGINT
 
   if (atom->tag_enable) {
     bigint maxnewtag = maxtag + (nrep-1)*old->natoms;
     if (maxnewtag < 0 || maxnewtag >= MAXTAGINT)
       error->all(FLERR,"Replicated system atom IDs are too big");
   }
 
   if (nrep*old->natoms < 0 || nrep*old->natoms >= MAXBIGINT ||
       nrep*old->nbonds < 0 || nrep*old->nbonds >= MAXBIGINT ||
       nrep*old->nangles < 0 || nrep*old->nangles >= MAXBIGINT ||
       nrep*old->ndihedrals < 0 || nrep*old->ndihedrals >= MAXBIGINT ||
       nrep*old->nimpropers < 0 || nrep*old->nimpropers >= MAXBIGINT)
     error->all(FLERR,"Replicated system is too big");
 
   // assign atom and topology counts in new class from old one
 
   atom->natoms = old->natoms * nrep;
   atom->nbonds = old->nbonds * nrep;
   atom->nangles = old->nangles * nrep;
   atom->ndihedrals = old->ndihedrals * nrep;
   atom->nimpropers = old->nimpropers * nrep;
 
   atom->ntypes = old->ntypes;
   atom->nbondtypes = old->nbondtypes;
   atom->nangletypes = old->nangletypes;
   atom->ndihedraltypes = old->ndihedraltypes;
   atom->nimpropertypes = old->nimpropertypes;
 
   atom->bond_per_atom = old->bond_per_atom;
   atom->angle_per_atom = old->angle_per_atom;
   atom->dihedral_per_atom = old->dihedral_per_atom;
   atom->improper_per_atom = old->improper_per_atom;
 
   // store old simulation box
 
   int triclinic = domain->triclinic;
   double old_xprd = domain->xprd;
   double old_yprd = domain->yprd;
   double old_zprd = domain->zprd;
   double old_xy = domain->xy;
   double old_xz = domain->xz;
   double old_yz = domain->yz;
 
   // setup new simulation box
   // boxlo is kept, boxhi moves out by the replicated box lengths
 
   domain->boxhi[0] = domain->boxlo[0] + nx*old_xprd;
   domain->boxhi[1] = domain->boxlo[1] + ny*old_yprd;
   domain->boxhi[2] = domain->boxlo[2] + nz*old_zprd;
   if (triclinic) {
     domain->xy *= ny;
     domain->xz *= nz;
     domain->yz *= nz;
   }
 
   // new problem setup using new box boundaries
   // n = initial per-proc atom count passed to grow(),
   //   padded by LB_FACTOR when running on more than one proc
 
   if (nprocs == 1) n = static_cast<int> (atom->natoms);
   else n = static_cast<int> (LB_FACTOR * atom->natoms / nprocs);
 
   atom->allocate_type_arrays();
   atom->avec->grow(n);
   n = atom->nmax;
 
   domain->print_box("  ");
   domain->set_initial_box();
   domain->set_global_box();
   comm->set_proc_grid();
   domain->set_local_box();
 
   // copy type arrays to new atom class
 
   if (atom->mass) {
     for (int itype = 1; itype <= atom->ntypes; itype++) {
       atom->mass_setflag[itype] = old->mass_setflag[itype];
       if (atom->mass_setflag[itype]) atom->mass[itype] = old->mass[itype];
     }
   }
 
   // set bounds for my proc
   // if periodic and I am lo/hi proc, adjust bounds by EPSILON
   // insures all replicated atoms will be owned even with round-off
 
   double epsilon[3];
   if (triclinic) epsilon[0] = epsilon[1] = epsilon[2] = EPSILON;
   else {
     epsilon[0] = domain->prd[0] * EPSILON;
     epsilon[1] = domain->prd[1] * EPSILON;
     epsilon[2] = domain->prd[2] * EPSILON;
   }
 
   // triclinic bounds are taken from the lamda sub-domain values,
   //   matching the lamda coords tested in the ownership check below
 
   double sublo[3],subhi[3];
   if (triclinic == 0) {
     sublo[0] = domain->sublo[0]; subhi[0] = domain->subhi[0];
     sublo[1] = domain->sublo[1]; subhi[1] = domain->subhi[1];
     sublo[2] = domain->sublo[2]; subhi[2] = domain->subhi[2];
   } else {
     sublo[0] = domain->sublo_lamda[0]; subhi[0] = domain->subhi_lamda[0];
     sublo[1] = domain->sublo_lamda[1]; subhi[1] = domain->subhi_lamda[1];
     sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2];
   }
 
   // non-tiled layouts find the lo/hi procs via myloc and procgrid,
   // the tiled layout via mysplit values of exactly 0.0 and 1.0
 
   if (comm->layout != LAYOUT_TILED) {
     if (domain->xperiodic) {
       if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
       if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
     }
     if (domain->yperiodic) {
       if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
       if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
     }
     if (domain->zperiodic) {
       if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
       if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
     }
 
   } else {
     if (domain->xperiodic) {
       if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
       if (comm->mysplit[0][1] == 1.0) subhi[0] += epsilon[0];
     }
     if (domain->yperiodic) {
       if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
       if (comm->mysplit[1][1] == 1.0) subhi[1] += epsilon[1];
     }
     if (domain->zperiodic) {
       if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
       if (comm->mysplit[2][1] == 1.0) subhi[2] += epsilon[2];
     }
   }
 
   // loop over all procs
   // if this iteration of loop is me:
   //   pack my unmapped atom data into buf
   //   bcast it to all other procs
   // performs 3d replicate loop with while loop over atoms in buf
   //   x = new replicated position, remapped into simulation box
   //   unpack atom into new atom class from buf if I own it
   //   adjust tag, mol #, coord, topology info as needed
 
   AtomVec *old_avec = old->avec;
   AtomVec *avec = atom->avec;
 
   int ix,iy,iz;
   tagint atom_offset,mol_offset;
   imageint image;
   double x[3],lamda[3];
   double *coord;
   int tag_enable = atom->tag_enable;
 
   for (int iproc = 0; iproc < nprocs; iproc++) {
     if (me == iproc) {
       n = 0;
       for (i = 0; i < old->nlocal; i++) n += old_avec->pack_restart(i,&buf[n]);
     }
     MPI_Bcast(&n,1,MPI_INT,iproc,world);
     MPI_Bcast(buf,n,MPI_DOUBLE,iproc,world);
 
     for (ix = 0; ix < nx; ix++) {
       for (iy = 0; iy < ny; iy++) {
         for (iz = 0; iz < nz; iz++) {
 
           // while loop over one proc's atom list
           // each atom record starts at buf[m]:
           //   buf[m] is used as the record length when the atom is skipped
           //   buf[m+1],buf[m+2],buf[m+3] are read as the x,y,z coords
 
           m = 0;
           while (m < n) {
             image = ((imageint) IMGMAX << IMG2BITS) |
               ((imageint) IMGMAX << IMGBITS) | IMGMAX;
             if (triclinic == 0) {
               x[0] = buf[m+1] + ix*old_xprd;
               x[1] = buf[m+2] + iy*old_yprd;
               x[2] = buf[m+3] + iz*old_zprd;
             } else {
               x[0] = buf[m+1] + ix*old_xprd + iy*old_xy + iz*old_xz;
               x[1] = buf[m+2] + iy*old_yprd + iz*old_yz;
               x[2] = buf[m+3] + iz*old_zprd;
             }
             domain->remap(x,image);
             if (triclinic) {
               domain->x2lamda(x,lamda);
               coord = lamda;
             } else coord = x;
 
             if (coord[0] >= sublo[0] && coord[0] < subhi[0] &&
                 coord[1] >= sublo[1] && coord[1] < subhi[1] &&
                 coord[2] >= sublo[2] && coord[2] < subhi[2]) {
 
               m += avec->unpack_restart(&buf[m]);
 
               // the atom just unpacked is the last local atom
               // offsets are multiples of maxtag/maxmol, one per replica
 
               i = atom->nlocal - 1;
               if (tag_enable)
                 atom_offset = iz*ny*nx*maxtag + iy*nx*maxtag + ix*maxtag;
               else atom_offset = 0;
               mol_offset = iz*ny*nx*maxmol + iy*nx*maxmol + ix*maxmol;
 
               atom->x[i][0] = x[0];
               atom->x[i][1] = x[1];
               atom->x[i][2] = x[2];
 
               atom->tag[i] += atom_offset;
               atom->image[i] = image;
 
               if (atom->molecular) {
                 if (atom->molecule[i] > 0)
                   atom->molecule[i] += mol_offset;
                 if (atom->molecular == 1) {
                   if (atom->avec->bonds_allow)
                     for (j = 0; j < atom->num_bond[i]; j++)
                       atom->bond_atom[i][j] += atom_offset;
                   if (atom->avec->angles_allow)
                     for (j = 0; j < atom->num_angle[i]; j++) {
                       atom->angle_atom1[i][j] += atom_offset;
                       atom->angle_atom2[i][j] += atom_offset;
                       atom->angle_atom3[i][j] += atom_offset;
                     }
                   if (atom->avec->dihedrals_allow)
                     for (j = 0; j < atom->num_dihedral[i]; j++) {
                       atom->dihedral_atom1[i][j] += atom_offset;
                       atom->dihedral_atom2[i][j] += atom_offset;
                       atom->dihedral_atom3[i][j] += atom_offset;
                       atom->dihedral_atom4[i][j] += atom_offset;
                     }
                   if (atom->avec->impropers_allow)
                     for (j = 0; j < atom->num_improper[i]; j++) {
                       atom->improper_atom1[i][j] += atom_offset;
                       atom->improper_atom2[i][j] += atom_offset;
                       atom->improper_atom3[i][j] += atom_offset;
                       atom->improper_atom4[i][j] += atom_offset;
                     }
                 }
               }
             } else m += static_cast<int> (buf[m]);
           }
         }
       }
     }
   }
 
   // free communication buffer and old atom class
 
   memory->destroy(buf);
   delete old;
 
   // check that all atoms were assigned to procs
 
   bigint natoms;
   bigint nblocal = atom->nlocal;
   MPI_Allreduce(&nblocal,&natoms,1,MPI_LMP_BIGINT,MPI_SUM,world);
 
   if (me == 0) {
     if (screen) fprintf(screen,"  " BIGINT_FORMAT " atoms\n",natoms);
     if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " atoms\n",natoms);
   }
 
   if (natoms != atom->natoms)
     error->all(FLERR,"Replicate did not assign all atoms correctly");
 
   // topology counts are only printed when non-zero
 
   if (me == 0) {
     if (atom->nbonds) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " bonds\n",atom->nbonds);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " bonds\n",atom->nbonds);
     }
     if (atom->nangles) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " angles\n",
                           atom->nangles);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " angles\n",
                            atom->nangles);
     }
     if (atom->ndihedrals) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " dihedrals\n",
                           atom->ndihedrals);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " dihedrals\n",
                            atom->ndihedrals);
     }
     if (atom->nimpropers) {
       if (screen) fprintf(screen,"  " BIGINT_FORMAT " impropers\n",
                           atom->nimpropers);
       if (logfile) fprintf(logfile,"  " BIGINT_FORMAT " impropers\n",
                            atom->nimpropers);
     }
   }
 
   // check that atom IDs are valid
 
   atom->tag_check();
 
   // create global mapping of atoms
 
   if (atom->map_style) {
     atom->map_init();
     atom->map_set();
   }
 
   // create special bond lists for molecular systems
 
   if (atom->molecular == 1) {
     Special special(lmp);
     special.build();
   }
 }
diff --git a/src/suffix.h b/src/suffix.h
index 2a150ed5e..43493d620 100644
--- a/src/suffix.h
+++ b/src/suffix.h
@@ -1,29 +1,30 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_SUFFIX_H
 #define LMP_SUFFIX_H
 
 namespace LAMMPS_NS {
 
 namespace Suffix {
   static const int NONE = 0;
   static const int OPT  = 1<<0;
   static const int GPU  = 1<<1;
   static const int CUDA = 1<<2;
   static const int OMP  = 1<<3;
+  static const int INTEL  = 1<<4;   // next unused bit after OMP
 }
 
 }
 
 #endif
diff --git a/src/update.cpp b/src/update.cpp
index a2017db06..610cce1ab 100644
--- a/src/update.cpp
+++ b/src/update.cpp
@@ -1,478 +1,496 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "string.h"
 #include "stdlib.h"
 #include "update.h"
 #include "integrate.h"
 #include "min.h"
 #include "style_integrate.h"
 #include "style_minimize.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "force.h"
 #include "modify.h"
 #include "fix.h"
 #include "domain.h"
 #include "region.h"
 #include "compute.h"
 #include "output.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 Update::Update(LAMMPS *lmp) : Pointers(lmp)
 {
   char *str;
 
   ntimestep = 0;
   atime = 0.0;
   atimestep = 0;
   first_update = 0;
 
   whichflag = 0;
   firststep = laststep = 0;
   beginstep = endstep = 0;
   setupflag = 0;
   multireplica = 0;
 
   restrict_output = 0;
 
   eflag_global = vflag_global = -1;
 
   unit_style = NULL;
   set_units("lj");
 
   integrate_style = NULL;
   integrate = NULL;
   minimize_style = NULL;
   minimize = NULL;
 
   str = (char *) "verlet";
-  create_integrate(1,&str,lmp->suffix);
+  create_integrate(1,&str,1);   // last arg is trysuffix: 1 = allow suffixed variants of the style
 
   str = (char *) "cg";
   create_minimize(1,&str);
 }
 
 /* ---------------------------------------------------------------------- */
 
 Update::~Update()
 {
   delete [] unit_style;
 
   delete [] integrate_style;
   delete integrate;
 
   delete [] minimize_style;
   delete minimize;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Update::init()
 {
   // if USER-CUDA mode is enabled:
   // integrate/minimize style must be CUDA variant
 
   if (whichflag == 1 && lmp->cuda)
     if (strstr(integrate_style,"cuda") == NULL)
       error->all(FLERR,"USER-CUDA mode requires CUDA variant of run style");
   if (whichflag == 2 && lmp->cuda)
     if (strstr(minimize_style,"cuda") == NULL)
       error->all(FLERR,"USER-CUDA mode requires CUDA variant of min style");
 
   // init the appropriate integrate and/or minimize class
   // if neither (e.g. from write_restart) then just return
 
   if (whichflag == 0) return;
   if (whichflag == 1) integrate->init();
   else if (whichflag == 2) minimize->init();
 
   // only set first_update if a run or minimize is being performed
 
   first_update = 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Update::set_units(const char *style)
 {
   // physical constants from:
   // http://physics.nist.gov/cuu/Constants/Table/allascii.txt
   // using thermochemical calorie = 4.184 J
 
   if (strcmp(style,"lj") == 0) {
     force->boltz = 1.0;
     force->hplanck = 0.18292026;  // using LJ parameters for argon
     force->mvv2e = 1.0;
     force->ftm2v = 1.0;
     force->mv2d = 1.0;
     force->nktv2p = 1.0;
     force->qqr2e = 1.0;
     force->qe2f = 1.0;
     force->vxmu2f = 1.0;
     force->xxt2kmu = 1.0;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.0;
     force->femtosecond = 1.0;
     force->qelectron = 1.0;
 
     dt = 0.005;
     neighbor->skin = 0.3;
 
   } else if (strcmp(style,"real") == 0) {
     force->boltz = 0.0019872067;
     force->hplanck = 95.306976368;
     force->mvv2e = 48.88821291 * 48.88821291;
     force->ftm2v = 1.0 / 48.88821291 / 48.88821291;
     force->mv2d = 1.0 / 0.602214179;
     force->nktv2p = 68568.415;
     force->qqr2e = 332.06371;
     force->qe2f = 23.060549;
     force->vxmu2f = 1.4393264316e4;
     force->xxt2kmu = 0.1;
     force->e_mass = 1.0/1836.1527556560675;
     force->hhmrr2e = 0.0957018663603261;
     force->mvh2r = 1.5339009481951;
     force->angstrom = 1.0;
     force->femtosecond = 1.0;
     force->qelectron = 1.0;
 
     dt = 1.0;
     neighbor->skin = 2.0;
 
   } else if (strcmp(style,"metal") == 0) {
     force->boltz = 8.617343e-5;
     force->hplanck = 4.135667403e-3;
     force->mvv2e = 1.0364269e-4;
     force->ftm2v = 1.0 / 1.0364269e-4;
     force->mv2d = 1.0 / 0.602214179;
     force->nktv2p = 1.6021765e6;
     force->qqr2e = 14.399645;
     force->qe2f = 1.0;
     force->vxmu2f = 0.6241509647;
     force->xxt2kmu = 1.0e-4;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.0;
     force->femtosecond = 1.0e-3;
     force->qelectron = 1.0;
 
     dt = 0.001;
     neighbor->skin = 2.0;
 
   } else if (strcmp(style,"si") == 0) {
     force->boltz = 1.3806504e-23;
     force->hplanck = 6.62606896e-34;
     force->mvv2e = 1.0;
     force->ftm2v = 1.0;
     force->mv2d = 1.0;
     force->nktv2p = 1.0;
     force->qqr2e = 8.9876e9;
     force->qe2f = 1.0;
     force->vxmu2f = 1.0;
     force->xxt2kmu = 1.0;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.0e-10;
     force->femtosecond = 1.0e-15;
     force->qelectron = 1.6021765e-19;
 
     dt = 1.0e-8;
     neighbor->skin = 0.001;
 
   } else if (strcmp(style,"cgs") == 0) {
     force->boltz = 1.3806504e-16;
     force->hplanck = 6.62606896e-27;
     force->mvv2e = 1.0;
     force->ftm2v = 1.0;
     force->mv2d = 1.0;
     force->nktv2p = 1.0;
     force->qqr2e = 1.0;
     force->qe2f = 1.0;
     force->vxmu2f = 1.0;
     force->xxt2kmu = 1.0;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.0e-8;
     force->femtosecond = 1.0e-15;
     force->qelectron = 4.8032044e-10;
 
     dt = 1.0e-8;
     neighbor->skin = 0.1;
 
   } else if (strcmp(style,"electron") == 0) {
     force->boltz = 3.16681534e-6;
     force->hplanck = 0.1519829846;
     force->mvv2e = 1.06657236;
     force->ftm2v = 0.937582899;
     force->mv2d = 1.0;
     force->nktv2p = 2.94210108e13;
     force->qqr2e = 1.0;
     force->qe2f = 1.94469051e-10;
     force->vxmu2f = 3.39893149e1;
     force->xxt2kmu = 3.13796367e-2;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.88972612;
     force->femtosecond = 0.0241888428;
     force->qelectron = 1.0;
 
     dt = 0.001;
     neighbor->skin = 2.0;
 
   } else if (strcmp(style,"micro") == 0) {
     force->boltz = 1.3806504e-8;
     force->hplanck = 6.62606896e-13;
     force->mvv2e = 1.0;
     force->ftm2v = 1.0;
     force->mv2d = 1.0;
     force->nktv2p = 1.0;
     force->qqr2e = 8.987556e6;
     force->qe2f = 1.0;
     force->vxmu2f = 1.0;
     force->xxt2kmu = 1.0;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.0e-4;
     force->femtosecond = 1.0e-9;
     force->qelectron = 1.6021765e-7;
 
     dt = 2.0;
     neighbor->skin = 0.1;
                                               
   } else if (strcmp(style,"nano") == 0) {  
     force->boltz = 0.013806504;
     force->hplanck = 6.62606896e-4;
     force->mvv2e = 1.0;
     force->ftm2v = 1.0;
     force->mv2d = 1.0;
     force->nktv2p = 1.0;
     force->qqr2e = 230.7078669;
     force->qe2f = 1.0;
     force->vxmu2f = 1.0;
     force->xxt2kmu = 1.0;
     force->e_mass = 0.0;    // not yet set
     force->hhmrr2e = 0.0;
     force->mvh2r = 0.0;
     force->angstrom = 1.0e-1;
     force->femtosecond = 1.0e-6;
     force->qelectron = 1.0;
        
     dt = 0.00045;
     neighbor->skin = 0.1;
 
   } else error->all(FLERR,"Illegal units command");
 
   delete [] unit_style;
   int n = strlen(style) + 1;
   unit_style = new char[n];
   strcpy(unit_style,style);
 }
 
 /* ---------------------------------------------------------------------- */
 
-void Update::create_integrate(int narg, char **arg, char *suffix)
+void Update::create_integrate(int narg, char **arg, int trysuffix)
 {
   if (narg < 1) error->all(FLERR,"Illegal run_style command");
 
   delete [] integrate_style;
   delete integrate;
 
   int sflag;
-  new_integrate(arg[0],narg-1,&arg[1],suffix,sflag);
+  new_integrate(arg[0],narg-1,&arg[1],trysuffix,sflag);  // sflag out: 0 = plain style, 1 = suffix, 2 = suffix2
 
   if (sflag) {
     char estyle[256];
-    sprintf(estyle,"%s/%s",arg[0],suffix);
+    if (sflag == 1) snprintf(estyle,sizeof(estyle),"%s/%s",arg[0],lmp->suffix);  // bounded write into estyle
+    else snprintf(estyle,sizeof(estyle),"%s/%s",arg[0],lmp->suffix2);  // sflag == 2: second suffix matched
     int n = strlen(estyle) + 1;
     integrate_style = new char[n];
     strcpy(integrate_style,estyle);
   } else {
     int n = strlen(arg[0]) + 1;
     integrate_style = new char[n];
     strcpy(integrate_style,arg[0]);
   }
 }
 
 /* ----------------------------------------------------------------------
    create the Integrate style, first with suffix appended
 ------------------------------------------------------------------------- */
 
 void Update::new_integrate(char *style, int narg, char **arg,
-                           char *suffix, int &sflag)
+                           int trysuffix, int &sflag)
 {
-  int success = 0;
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;   // tentatively report a match on lmp->suffix
+      char estyle[256];
+      snprintf(estyle,sizeof(estyle),"%s/%s",style,lmp->suffix);   // bounded: never writes past estyle
+      int success = 1;
 
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    success = 1;
-
-    if (0) return;
+      if (0) return;   // dummy line to enable else-if macro expansion
 
 #define INTEGRATE_CLASS
 #define IntegrateStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
+      else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
 #include "style_integrate.h"
 #undef IntegrateStyle
 #undef INTEGRATE_CLASS
+
+      else success = 0;   // no style registered under "style/suffix"
+      if (success) return;
+    }
+
+    if (lmp->suffix2) {
+      sflag = 2;   // tentatively report a match on lmp->suffix2
+      char estyle[256];
+      snprintf(estyle,sizeof(estyle),"%s/%s",style,lmp->suffix2);   // bounded: never writes past estyle
+      int success = 1;
+
+      if (0) return;   // dummy line to enable else-if macro expansion
 
-    else success = 0;
+#define INTEGRATE_CLASS
+#define IntegrateStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
+#include "style_integrate.h"
+#undef IntegrateStyle
+#undef INTEGRATE_CLASS
+
+      else success = 0;   // no style registered under "style/suffix2"
+      if (success) return;
+    }
   }
 
-  if (!success) {
-    sflag = 0;
-
-    if (0) return;
+  sflag = 0;   // no suffixed variant was created: fall back to the plain style name
+  if (0) return;
 
 #define INTEGRATE_CLASS
 #define IntegrateStyle(key,Class) \
-    else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg);
+  else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg);
 #include "style_integrate.h"
 #undef IntegrateStyle
 #undef INTEGRATE_CLASS
 
-    else error->all(FLERR,"Illegal integrate style");
-  }
+  else error->all(FLERR,"Illegal integrate style");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void Update::create_minimize(int narg, char **arg)
 {
   if (narg != 1) error->all(FLERR,"Illegal min_style command");
 
   delete [] minimize_style;
   delete minimize;
 
   if (0) return;      // dummy line to enable else-if macro expansion
 
 #define MINIMIZE_CLASS
 #define MinimizeStyle(key,Class) \
   else if (strcmp(arg[0],#key) == 0) minimize = new Class(lmp);
 #include "style_minimize.h"
 #undef MINIMIZE_CLASS
 
   else error->all(FLERR,"Illegal min_style command");
 
   int n = strlen(arg[0]) + 1;
   minimize_style = new char[n];
   strcpy(minimize_style,arg[0]);
 }
 
 /* ----------------------------------------------------------------------
    reset timestep as called from input script
 ------------------------------------------------------------------------- */
 
 void Update::reset_timestep(int narg, char **arg)
 {
   if (narg != 1) error->all(FLERR,"Illegal reset_timestep command");
   bigint newstep = ATOBIGINT(arg[0]);
   reset_timestep(newstep);
 }
 
 /* ----------------------------------------------------------------------
    reset timestep
    called from rerun command and input script (indirectly)
 ------------------------------------------------------------------------- */
 
 void Update::reset_timestep(bigint newstep)
 {
   ntimestep = newstep;
   if (ntimestep < 0) error->all(FLERR,"Timestep must be >= 0");
   if (ntimestep > MAXBIGINT) error->all(FLERR,"Too big a timestep");
 
   // set atimestep to new timestep
   // so future update_time() calls will be correct
 
   atimestep = ntimestep;
 
   // trigger reset of timestep for output and for fixes that require it
   // do not allow any timestep-dependent fixes to be defined
 
   output->reset_timestep(ntimestep);
 
   for (int i = 0; i < modify->nfix; i++) {
     if (modify->fix[i]->time_depend)
       error->all(FLERR,
                  "Cannot reset timestep with a time-dependent fix defined");
     modify->fix[i]->reset_timestep(ntimestep);
   }
 
   // reset eflag/vflag global so no commands will think eng/virial are current
 
   eflag_global = vflag_global = -1;
 
   // reset invoked flags of computes,
   // so no commands will think they are current between runs
 
   for (int i = 0; i < modify->ncompute; i++) {
     modify->compute[i]->invoked_scalar = -1;
     modify->compute[i]->invoked_vector = -1;
     modify->compute[i]->invoked_array = -1;
     modify->compute[i]->invoked_peratom = -1;
     modify->compute[i]->invoked_local = -1;
   }
 
   // clear timestep list of computes that store future invocation times
 
   for (int i = 0; i < modify->ncompute; i++)
     if (modify->compute[i]->timeflag) modify->compute[i]->clearstep();
 
   // set last_build of all neigh lists to -1 to force rebuild
 
   for (int i = 0; i < neighbor->nlist; i++)
     neighbor->lists[i]->last_build = -1;
 
   // NOTE: 7Jun12, adding rerun command, don't think this is required
 
   //for (int i = 0; i < domain->nregion; i++)
   //  if (domain->regions[i]->dynamic_check())
   //    error->all(FLERR,"Cannot reset timestep with a dynamic region defined");
 }
 
 /* ----------------------------------------------------------------------
    update elapsed simulation time
    called at end of runs or when timestep size changes
 ------------------------------------------------------------------------- */
 
 void Update::update_time()
 {
   atime += (ntimestep-atimestep) * dt;
   atimestep = ntimestep;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of update and integrate/minimize
 ------------------------------------------------------------------------- */
 
 bigint Update::memory_usage()
 {
   bigint bytes = 0;
   if (whichflag == 1) bytes += integrate->memory_usage();
   else if (whichflag == 2) bytes += minimize->memory_usage();
   return bytes;
 }
diff --git a/src/update.h b/src/update.h
index 53bf041de..5c6c15bad 100644
--- a/src/update.h
+++ b/src/update.h
@@ -1,108 +1,108 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_UPDATE_H
 #define LMP_UPDATE_H
 
 #include "pointers.h"
 
 namespace LAMMPS_NS {
 
 class Update : protected Pointers {
  public:
   double dt;                      // timestep
   double etol,ftol;               // minimizer tolerances on energy/force
   bigint ntimestep;               // current step (dynamics or min iterations)
   int nsteps;                     // # of steps to run (dynamics or min iter)
   int whichflag;                  // 0 for unset, 1 for dynamics, 2 for min
   double atime;                   // simulation time at atime_step
   bigint atimestep;               // last timestep atime was updated
   bigint firststep,laststep;      // 1st & last step of this run
   bigint beginstep,endstep;       // 1st and last step of multiple runs
   int first_update;               // 0 before initial update, 1 after
   int max_eval;                   // max force evaluations for minimizer
   int restrict_output;            // 1 if output should not write dump/restart
   int setupflag;                  // set when setup() is computing forces
   int multireplica;               // 1 if min across replicas, else 0
 
   bigint eflag_global,eflag_atom;  // timestep global/peratom eng is tallied on
   bigint vflag_global,vflag_atom;  // ditto for virial
 
   char *unit_style;
 
   class Integrate *integrate;
   char *integrate_style;
 
   class Min *minimize;
   char *minimize_style;
 
   Update(class LAMMPS *);
   ~Update();
   void init();
   void set_units(const char *);
-  void create_integrate(int, char **, char *);
+  void create_integrate(int, char **, int);   // args: narg, arg, trysuffix
   void create_minimize(int, char **);
   void reset_timestep(int, char **);
   void reset_timestep(bigint);
   void update_time();
   bigint memory_usage();
 
  private:
-  void new_integrate(char *, int, char **, char *, int &);
+  void new_integrate(char *, int, char **, int, int &);   // args: style, narg, arg, trysuffix, sflag (output)
 
 };
 
 }
 
 #endif
 
 /* ERROR/WARNING messages:
 
 E: USER-CUDA mode requires CUDA variant of run style
 
 CUDA mode is enabled, so the run style must include a cuda suffix.
 
 E: USER-CUDA mode requires CUDA variant of min style
 
 CUDA mode is enabled, so the min style must include a cuda suffix.
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Illegal integrate style
 
 Self-explanatory.
 
 E: Timestep must be >= 0
 
 Specified timestep is invalid.
 
 E: Too big a timestep
 
 Specified timestep is too large.
 
 E: Cannot reset timestep with a time-dependent fix defined
 
 You cannot reset the timestep when a fix that keeps track of elapsed
 time is in place.
 
 E: Cannot reset timestep with a dynamic region defined
 
 Dynamic regions (see the region command) have a time dependence.
 Thus you cannot change the timestep when one or more of these
 are defined.
 
 */