diff --git a/src/REPLICA/fix_neb.cpp b/src/REPLICA/fix_neb.cpp
index 5b426465e..959db8426 100644
--- a/src/REPLICA/fix_neb.cpp
+++ b/src/REPLICA/fix_neb.cpp
@@ -1,584 +1,584 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include "fix_neb.h"
 #include "universe.h"
 #include "update.h"
 #include "atom.h"
 #include "domain.h"
 #include "comm.h"
 #include "modify.h"
 #include "compute.h"
 #include "group.h"
 #include "memory.h"
 #include "error.h"
 #include "force.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC};
 
 /* ---------------------------------------------------------------------- */
 
 FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg)
 {
   if (narg != 4) error->all(FLERR,"Illegal fix neb command");
 
   kspring = force->numeric(FLERR,arg[3]);
   if (kspring <= 0.0) error->all(FLERR,"Illegal fix neb command");
 
   // nreplica = number of partitions
   // ireplica = which world I am in universe
   // nprocs_universe = # of procs in all replicase
   // procprev,procnext = root proc in adjacent replicas
 
   me = comm->me;
   nprocs = comm->nprocs;
 
   nprocs_universe = universe->nprocs;
   nreplica = universe->nworlds;
   ireplica = universe->iworld;
 
   if (ireplica > 0) procprev = universe->root_proc[ireplica-1];
   else procprev = -1;
   if (ireplica < nreplica-1) procnext = universe->root_proc[ireplica+1];
   else procnext = -1;
   uworld = universe->uworld;
 
   // create a new compute pe style
   // id = fix-ID + pe, compute group = all
 
   int n = strlen(id) + 4;
   id_pe = new char[n];
   strcpy(id_pe,id);
   strcat(id_pe,"_pe");
 
   char **newarg = new char*[3];
   newarg[0] = id_pe;
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pe";
   modify->add_compute(3,newarg);
   delete [] newarg;
 
   // initialize local storage
 
   maxlocal = 0;
   ntotal = 0;
 
   xprev = xnext = tangent = NULL;
   xsend = xrecv = NULL;
   tagsend = tagrecv = NULL;
   xsendall = xrecvall = NULL;
   tagsendall = tagrecvall = NULL;
   counts = displacements = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixNEB::~FixNEB()
 {
   modify->delete_compute(id_pe);
   delete [] id_pe;
 
   memory->destroy(xprev);
   memory->destroy(xnext);
   memory->destroy(tangent);
 
   memory->destroy(xsend);
   memory->destroy(xrecv);
   memory->destroy(tagsend);
   memory->destroy(tagrecv);
 
   memory->destroy(xsendall);
   memory->destroy(xrecvall);
   memory->destroy(tagsendall);
   memory->destroy(tagrecvall);
 
   memory->destroy(counts);
   memory->destroy(displacements);
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixNEB::setmask()
 {
   int mask = 0;
   mask |= MIN_POST_FORCE;
   return mask;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixNEB::init()
 {
   int icompute = modify->find_compute(id_pe);
   if (icompute < 0)
     error->all(FLERR,"Potential energy ID for fix neb does not exist");
   pe = modify->compute[icompute];
 
   // turn off climbing mode, NEB command turns it on after init()
 
   rclimber = -1;
 
   // nebatoms = # of atoms in fix group = atoms with inter-replica forces
 
   bigint count = group->count(igroup);
   if (count > MAXSMALLINT) error->all(FLERR,"Too many active NEB atoms");
   nebatoms = count;
 
-  // comm style for inter-replica exchange of coords
+  // comm mode for inter-replica exchange of coords
 
   if (nreplica == nprocs_universe &&
       nebatoms == atom->natoms && atom->sortfreq == 0) 
     cmode = SINGLE_PROC_DIRECT;
   else if (nreplica == nprocs_universe) cmode = SINGLE_PROC_MAP;
   else cmode = MULTI_PROC;
 
   // ntotal = total # of atoms in system, NEB atoms or not
 
   if (atom->natoms > MAXSMALLINT) error->all(FLERR,"Too many atoms for NEB");
   ntotal = atom->natoms;
 
   if (atom->nlocal > maxlocal) reallocate();
 
   if (MULTI_PROC && counts == NULL) {
     memory->create(xsendall,ntotal,3,"neb:xsendall");
     memory->create(xrecvall,ntotal,3,"neb:xrecvall");
     memory->create(tagsendall,ntotal,"neb:tagsendall");
     memory->create(tagrecvall,ntotal,"neb:tagrecvall");
     memory->create(counts,nprocs,"neb:counts");
     memory->create(displacements,nprocs,"neb:displacements");
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixNEB::min_setup(int vflag)
 {
   min_post_force(vflag);
 
   // trigger potential energy computation on next timestep
 
   pe->addstep(update->ntimestep+1);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixNEB::min_post_force(int vflag)
 {
   double vprev,vnext,vmax,vmin;
   double delx,dely,delz;
   double delta1[3],delta2[3];
 
   // veng = PE of this replica
   // vprev,vnext = PEs of adjacent replicas
   // only proc 0 in each replica communicates
 
   vprev = vnext = veng = pe->compute_scalar();
 
   if (ireplica < nreplica-1 && me == 0) 
     MPI_Send(&veng,1,MPI_DOUBLE,procnext,0,uworld);
   if (ireplica > 0 && me == 0) 
     MPI_Recv(&vprev,1,MPI_DOUBLE,procprev,0,uworld,MPI_STATUS_IGNORE);
 
   if (ireplica > 0 && me == 0)
     MPI_Send(&veng,1,MPI_DOUBLE,procprev,0,uworld);
   if (ireplica < nreplica-1 && me == 0)
     MPI_Recv(&vnext,1,MPI_DOUBLE,procnext,0,uworld,MPI_STATUS_IGNORE);
 
   if (cmode == MULTI_PROC) {
     MPI_Bcast(&vprev,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&vnext,1,MPI_DOUBLE,0,world);
   }
 
   // communicate atoms to/from adjacent replicas to fill xprev,xnext
 
   inter_replica_comm();
 
   // trigger potential energy computation on next timestep
 
   pe->addstep(update->ntimestep+1);
 
   // compute norm of GradV for log output
 
   double **f = atom->f;
   int nlocal = atom->nlocal;
 
   double fsq = 0.0;
   for (int i = 0; i < nlocal; i++)
     fsq += f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2];
 
   MPI_Allreduce(&fsq,&gradvnorm,1,MPI_DOUBLE,MPI_SUM,world);
   gradvnorm = sqrt(gradvnorm);
 
   // first or last replica has no change to forces, just return
 
   if (ireplica == 0 || ireplica == nreplica-1) {
     plen = nlen = 0.0;
     return;
   }
 
   // tangent = unit tangent vector in 3N space
   // based on delta vectors between atoms and their images in adjacent replicas
   // use one or two delta vecs to compute tangent,
   // depending on relative PEs of 3 replicas
   // see Henkelman & Jonsson 2000 paper, eqs 8-11
 
   double **x = atom->x;
   int *mask = atom->mask;
 
   if (vnext > veng && veng > vprev) {
     for (int i = 0; i < nlocal; i++)
       if (mask[i] & groupbit) {
         tangent[i][0] = xnext[i][0] - x[i][0];
         tangent[i][1] = xnext[i][1] - x[i][1];
         tangent[i][2] = xnext[i][2] - x[i][2];
         domain->minimum_image(tangent[i]);
       }
   } else if (vnext < veng && veng < vprev) {
     for (int i = 0; i < nlocal; i++)
       if (mask[i] & groupbit) {
         tangent[i][0] = x[i][0] - xprev[i][0];
         tangent[i][1] = x[i][1] - xprev[i][1];
         tangent[i][2] = x[i][2] - xprev[i][2];
         domain->minimum_image(tangent[i]);
       }
   } else {
     vmax = MAX(fabs(vnext-veng),fabs(vprev-veng));
     vmin = MIN(fabs(vnext-veng),fabs(vprev-veng));
     for (int i = 0; i < nlocal; i++)
       if (mask[i] & groupbit) {
         delta1[0] = xnext[i][0] - x[i][0];
         delta1[1] = xnext[i][1] - x[i][1];
         delta1[2] = xnext[i][2] - x[i][2];
         domain->minimum_image(delta1);
         delta2[0] = x[i][0] - xprev[i][0];
         delta2[1] = x[i][1] - xprev[i][1];
         delta2[2] = x[i][2] - xprev[i][2];
         domain->minimum_image(delta2);
         if (vnext > vprev) {
           tangent[i][0] = vmax*delta1[0] + vmin*delta2[0];
           tangent[i][1] = vmax*delta1[1] + vmin*delta2[1];
           tangent[i][2] = vmax*delta1[2] + vmin*delta2[2];
         } else {
           tangent[i][0] = vmin*delta1[0] + vmax*delta2[0];
           tangent[i][1] = vmin*delta1[1] + vmax*delta2[1];
           tangent[i][2] = vmin*delta1[2] + vmax*delta2[2];
         }
       }
   }
 
   // tlen,plen,nlen = lengths of tangent, prev, next vectors
 
   double tlen = 0.0;
   plen = 0.0;
   nlen = 0.0;
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       tlen += tangent[i][0]*tangent[i][0] + tangent[i][1]*tangent[i][1] +
         tangent[i][2]*tangent[i][2];
 
       delx = x[i][0] - xprev[i][0];
       dely = x[i][1] - xprev[i][1];
       delz = x[i][2] - xprev[i][2];
       domain->minimum_image(delx,dely,delz);
       plen += delx*delx + dely*dely + delz*delz;
 
       delx = xnext[i][0] - x[i][0];
       dely = xnext[i][1] - x[i][1];
       delz = xnext[i][2] - x[i][2];
       domain->minimum_image(delx,dely,delz);
       nlen += delx*delx + dely*dely + delz*delz;
     }
 
   double lenall;
   MPI_Allreduce(&tlen,&lenall,1,MPI_DOUBLE,MPI_SUM,world);
   tlen = sqrt(lenall);
 
   MPI_Allreduce(&plen,&lenall,1,MPI_DOUBLE,MPI_SUM,world);
   plen = sqrt(lenall);
 
   MPI_Allreduce(&nlen,&lenall,1,MPI_DOUBLE,MPI_SUM,world);
   nlen = sqrt(lenall);
 
   // normalize tangent vector
 
   if (tlen > 0.0) {
     double tleninv = 1.0/tlen;
     for (int i = 0; i < nlocal; i++)
       if (mask[i] & groupbit) {
         tangent[i][0] *= tleninv;
         tangent[i][1] *= tleninv;
         tangent[i][2] *= tleninv;
       }
   }
 
   // reset force on each atom in this replica
   // regular NEB for all replicas except rclimber does hill-climbing NEB
   // currently have F = -Grad(V) = -Grad(V)_perp - Grad(V)_parallel
   // want F = -Grad(V)_perp + Fspring for regular NEB
   // thus Fdelta = Grad(V)_parallel + Fspring for regular NEB
   // want F = -Grad(V) + 2 Grad(V)_parallel for hill-climbing NEB
   // thus Fdelta = 2 Grad(V)_parallel for hill-climbing NEB
   // Grad(V)_parallel = (Grad(V) . utan) * utangent = -(F . utan) * utangent
   // Fspring = k (nlen - plen) * utangent
   // see Henkelman & Jonsson 2000 paper, eqs 3,4,12
   // see Henkelman & Jonsson 2000a paper, eq 5
 
   double dot = 0.0;
   for (int i = 0; i < nlocal; i++) {
     if (mask[i] & groupbit)
       dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
         f[i][2]*tangent[i][2];
   }
 
   double dotall;
   MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world);
 
   double prefactor;
   if (ireplica == rclimber) prefactor = -2.0*dotall;
   else prefactor = -dotall + kspring*(nlen-plen);
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       f[i][0] += prefactor*tangent[i][0];
       f[i][1] += prefactor*tangent[i][1];
       f[i][2] += prefactor*tangent[i][2];
     }
 }
 
 /* ----------------------------------------------------------------------
    send/recv NEB atoms to/from adjacent replicas
    received atoms matching my local atoms are stored in xprev,xnext
    replicas 0 and N-1 send but do not receive any atoms
 ------------------------------------------------------------------------- */
 
 void FixNEB::inter_replica_comm()
 {
   int i,m;
   MPI_Request request;
   MPI_Request requests[2];
   MPI_Status statuses[2];
 
   // reallocate memory if necessary
 
   if (atom->nlocal > maxlocal) reallocate();
 
   double **x = atom->x;
   tagint *tag = atom->tag;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
 
   // -----------------------------------------------------
   // 3 cases: two for single proc per replica
   //          one for multiple procs per replica
   // -----------------------------------------------------
 
   // single proc per replica
-  // all atoms are NEB atoms and no atom sorting is enabled
+  // all atoms are NEB atoms and no atom sorting
   // direct comm of x -> xprev and x -> xnext
 
   if (cmode == SINGLE_PROC_DIRECT) {
     if (ireplica > 0)
       MPI_Irecv(xprev[0],3*nlocal,MPI_DOUBLE,procprev,0,uworld,&request);
     if (ireplica < nreplica-1)
       MPI_Send(x[0],3*nlocal,MPI_DOUBLE,procnext,0,uworld);
     if (ireplica > 0) MPI_Wait(&request,MPI_STATUS_IGNORE);
     
     if (ireplica < nreplica-1)
       MPI_Irecv(xnext[0],3*nlocal,MPI_DOUBLE,procnext,0,uworld,&request);
     if (ireplica > 0)
       MPI_Send(x[0],3*nlocal,MPI_DOUBLE,procprev,0,uworld);
     if (ireplica < nreplica-1) MPI_Wait(&request,MPI_STATUS_IGNORE);
 
     return;
   }
 
   // single proc per replica
   // but only some atoms are NEB atoms or atom sorting is enabled
   // send atom IDs and coords of only NEB atoms to prev/next proc
-  // recv proc uses atom->map() to match received coords to owned atoms
+  // recv procs use atom->map() to match received coords to owned atoms
 
   if (cmode == SINGLE_PROC_MAP) {
     m = 0;
     for (i = 0; i < nlocal; i++)
       if (mask[i] & groupbit) {
         tagsend[m] = tag[i];
         xsend[m][0] = x[i][0];
         xsend[m][1] = x[i][1];
         xsend[m][2] = x[i][2];
         m++;
       }
 
     if (ireplica > 0) {
       MPI_Irecv(xrecv[0],3*nebatoms,MPI_DOUBLE,procprev,0,uworld,&requests[0]);
       MPI_Irecv(tagrecv,nebatoms,MPI_LMP_TAGINT,procprev,0,uworld,&requests[1]);
     }
     if (ireplica < nreplica-1) {
       MPI_Send(xsend[0],3*nebatoms,MPI_DOUBLE,procnext,0,uworld);
       MPI_Send(tagsend,nebatoms,MPI_LMP_TAGINT,procnext,0,uworld);
     }
 
     if (ireplica > 0) {
       MPI_Waitall(2,requests,statuses);
       for (i = 0; i < nebatoms; i++) {
         m = atom->map(tagrecv[i]);
         xprev[m][0] = xrecv[i][0];
         xprev[m][1] = xrecv[i][1];
         xprev[m][2] = xrecv[i][2];
       }
     }
       
     if (ireplica < nreplica-1) {
       MPI_Irecv(xrecv[0],3*nebatoms,MPI_DOUBLE,procnext,0,uworld,&requests[0]);
       MPI_Irecv(tagrecv,nebatoms,MPI_LMP_TAGINT,procnext,0,uworld,&requests[1]);
     }
     if (ireplica > 0) {
       MPI_Send(xsend[0],3*nebatoms,MPI_DOUBLE,procprev,0,uworld);
       MPI_Send(tagsend,nebatoms,MPI_LMP_TAGINT,procprev,0,uworld);
     }
 
     if (ireplica < nreplica-1) {
       MPI_Waitall(2,requests,statuses);
       for (i = 0; i < nebatoms; i++) {
         m = atom->map(tagrecv[i]);
         xnext[m][0] = xrecv[i][0];
         xnext[m][1] = xrecv[i][1];
         xnext[m][2] = xrecv[i][2];
       }
     }
 
     return;
   }
 
   // multiple procs per replica
   // MPI_Gather all coords and atom IDs to root proc of each replica
   // send to root of adjacent replicas
   // bcast within each replica
   // each proc extracts info for atoms it owns via atom->map()
 
   m = 0;
   for (i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       tagsend[m] = tag[i];
       xsend[m][0] = x[i][0];
       xsend[m][1] = x[i][1];
       xsend[m][2] = x[i][2];
       m++;
     }
 
   MPI_Gather(&m,1,MPI_INT,counts,1,MPI_INT,0,world);
   displacements[0] = 0;
   for (i = 0; i < nprocs-1; i++)
     displacements[i+1] = displacements[i] + counts[i];
   MPI_Gatherv(tagsend,m,MPI_LMP_TAGINT,
               tagsendall,counts,displacements,MPI_LMP_TAGINT,0,world);
   for (i = 0; i < nprocs; i++) counts[i] *= 3;
   for (i = 0; i < nprocs-1; i++)
     displacements[i+1] = displacements[i] + counts[i];
   if (xsend)
     MPI_Gatherv(xsend[0],3*m,MPI_DOUBLE,
                 xsendall[0],counts,displacements,MPI_DOUBLE,0,world);
   else
     MPI_Gatherv(NULL,3*m,MPI_DOUBLE,
                 xsendall[0],counts,displacements,MPI_DOUBLE,0,world);
 
   if (ireplica > 0 && me == 0) {
     MPI_Irecv(xrecvall[0],3*nebatoms,MPI_DOUBLE,procprev,0,uworld,&requests[0]);
     MPI_Irecv(tagrecvall,nebatoms,MPI_LMP_TAGINT,procprev,0,uworld,
               &requests[1]);
   }
   if (ireplica < nreplica-1 && me == 0) {
     MPI_Send(xsendall[0],3*nebatoms,MPI_DOUBLE,procnext,0,uworld);
     MPI_Send(tagsendall,nebatoms,MPI_LMP_TAGINT,procnext,0,uworld);
   }
 
   if (ireplica > 0) {
     if (me == 0) MPI_Waitall(2,requests,statuses);
 
     MPI_Bcast(tagrecvall,nebatoms,MPI_INT,0,world);
     MPI_Bcast(xrecvall[0],3*nebatoms,MPI_DOUBLE,0,world);
 
     for (i = 0; i < nebatoms; i++) {
       m = atom->map(tagrecvall[i]);
       if (m < 0 || m >= nlocal) continue;
       xprev[m][0] = xrecvall[i][0];
       xprev[m][1] = xrecvall[i][1];
       xprev[m][2] = xrecvall[i][2];
     }
   }
 
   if (ireplica < nreplica-1 && me == 0) {
     MPI_Irecv(xrecvall[0],3*nebatoms,MPI_DOUBLE,procnext,0,uworld,&requests[0]);
     MPI_Irecv(tagrecvall,nebatoms,MPI_LMP_TAGINT,procnext,0,uworld,
               &requests[1]);
   }
   if (ireplica > 0 && me == 0) {
     MPI_Send(xsendall[0],3*nebatoms,MPI_DOUBLE,procprev,0,uworld);
     MPI_Send(tagsendall,nebatoms,MPI_LMP_TAGINT,procprev,0,uworld);
   }
 
   if (ireplica < nreplica-1) {
     if (me == 0) MPI_Waitall(2,requests,statuses);
 
     MPI_Bcast(tagrecvall,nebatoms,MPI_INT,0,world);
     MPI_Bcast(xrecvall[0],3*nebatoms,MPI_DOUBLE,0,world);
 
     for (i = 0; i < nebatoms; i++) {
       m = atom->map(tagrecvall[i]);
       if (m < 0 || m >= nlocal) continue;
       xnext[m][0] = xrecvall[i][0];
       xnext[m][1] = xrecvall[i][1];
       xnext[m][2] = xrecvall[i][2];
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    reallocate xprev,xnext,tangent arrays if necessary
    reallocate communication arrays if necessary
 ------------------------------------------------------------------------- */
 
 void FixNEB::reallocate()
 {
   memory->destroy(xprev);
   memory->destroy(xnext);
   memory->destroy(tangent);
 
   if (cmode != SINGLE_PROC_DIRECT) {
     memory->destroy(xsend);
     memory->destroy(xrecv);
     memory->destroy(tagsend);
     memory->destroy(tagrecv);
   }
 
   maxlocal = atom->nmax;
 
   memory->create(xprev,maxlocal,3,"neb:xprev");
   memory->create(xnext,maxlocal,3,"neb:xnext");
   memory->create(tangent,maxlocal,3,"neb:tangent");
 
   if (cmode != SINGLE_PROC_DIRECT) {
     memory->create(xsend,maxlocal,3,"neb:xsend");
     memory->create(xrecv,maxlocal,3,"neb:xrecv");
     memory->create(tagsend,maxlocal,"neb:tagsend");
     memory->create(tagrecv,maxlocal,"neb:tagrecv");
   }
 }
diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp
index 9f13951f5..a928425df 100644
--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@@ -1,909 +1,957 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Mike Brown (SNL)
 ------------------------------------------------------------------------- */
 
 // lmptype.h must be first b/c this file uses MAXBIGINT and includes mpi.h
 // due to OpenMPI bug which sets INT64_MAX via its mpi.h
 //   before lmptype.h can set flags to insure it is done correctly
 
 #include "lmptype.h"
 #include <mpi.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include "prd.h"
 #include "universe.h"
 #include "update.h"
 #include "atom.h"
 #include "domain.h"
 #include "region.h"
 #include "comm.h"
 #include "velocity.h"
 #include "integrate.h"
 #include "min.h"
 #include "neighbor.h"
 #include "modify.h"
 #include "compute.h"
 #include "fix.h"
 #include "fix_event_prd.h"
 #include "force.h"
 #include "pair.h"
 #include "random_park.h"
 #include "random_mars.h"
 #include "output.h"
 #include "dump.h"
 #include "finish.h"
 #include "timer.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
+enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC};
+
 /* ---------------------------------------------------------------------- */
 
 PRD::PRD(LAMMPS *lmp) : Pointers(lmp) {}
 
 /* ----------------------------------------------------------------------
    perform PRD simulation on one or more replicas
 ------------------------------------------------------------------------- */
 
 void PRD::command(int narg, char **arg)
 {
   int ireplica;
 
   // error checks
 
   if (domain->box_exist == 0)
     error->all(FLERR,"PRD command before simulation box is defined");
   if (universe->nworlds != universe->nprocs &&
       atom->map_style == 0)
     error->all(FLERR,"Cannot use PRD with multi-processor replicas "
                "unless atom map exists");
   if (universe->nworlds == 1 && comm->me == 0)
     error->warning(FLERR,"Running PRD with only one replica");
 
   if (narg < 7) error->universe_all(FLERR,"Illegal prd command");
 
   // read as double so can cast to bigint
 
   int nsteps = force->inumeric(FLERR,arg[0]);
   t_event = force->inumeric(FLERR,arg[1]);
   n_dephase = force->inumeric(FLERR,arg[2]);
   t_dephase = force->inumeric(FLERR,arg[3]);
   t_corr = force->inumeric(FLERR,arg[4]);
 
   char *id_compute = new char[strlen(arg[5])+1];
   strcpy(id_compute,arg[5]);
   int seed = force->inumeric(FLERR,arg[6]);
 
   options(narg-7,&arg[7]);
 
   // total # of timesteps must be multiple of t_event
 
   if (t_event <= 0) error->universe_all(FLERR,"Invalid t_event in prd command");
   if (nsteps % t_event)
     error->universe_all(FLERR,"PRD nsteps must be multiple of t_event");
   if (t_corr % t_event)
     error->universe_all(FLERR,"PRD t_corr must be multiple of t_event");
 
   // local storage
 
   int me_universe = universe->me;
   int nprocs_universe = universe->nprocs;
   int nreplica = universe->nworlds;
   int iworld = universe->iworld;
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
   // comm_replica = communicator between all proc 0s across replicas
 
   int color = me;
   MPI_Comm_split(universe->uworld,color,0,&comm_replica);
 
-  // equal_size_replicas = 1 if all replicas have same # of procs
-  // no longer used
+  // comm mode for inter-replica exchange of coords
 
-  //flag = 0;
-  //if (nreplica*nprocs == nprocs_universe) flag = 1;
-  //MPI_Allreduce(&flag,&equal_size_replicas,1,MPI_INT,MPI_MIN,
-  //              universe->uworld);
+  if (nreplica == nprocs_universe && atom->sortfreq == 0) 
+    cmode = SINGLE_PROC_DIRECT;
+  else if (nreplica == nprocs_universe) cmode = SINGLE_PROC_MAP;
+  else cmode = MULTI_PROC;
 
-  // workspace for inter-replica communication via gathers
+  // workspace for inter-replica communication
 
   natoms = atom->natoms;
 
-  displacements = NULL;
   tagall = NULL;
   xall = NULL;
   imageall = NULL;
 
-  if (nreplica != nprocs_universe) {
-    displacements = new int[nprocs];
+  if (cmode != SINGLE_PROC_DIRECT) {
     memory->create(tagall,natoms,"prd:tagall");
     memory->create(xall,natoms,3,"prd:xall");
     memory->create(imageall,natoms,"prd:imageall");
   }
 
+  counts = NULL;
+  displacements = NULL;
+
+  if (cmode == MULTI_PROC) {
+    memory->create(counts,nprocs,"prd:counts");
+    memory->create(displacements,nprocs,"prd:displacements");
+  }
+
   // random_select = same RNG for each replica, for multiple event selection
   // random_clock = same RNG for each replica, for clock updates
   // random_dephase = unique RNG for each replica, for dephasing
 
   random_select = new RanPark(lmp,seed);
   random_clock = new RanPark(lmp,seed+1000);
   random_dephase = new RanMars(lmp,seed+iworld);
 
   // create ComputeTemp class to monitor temperature
 
   char **args = new char*[3];
   args[0] = (char *) "prd_temp";
   args[1] = (char *) "all";
   args[2] = (char *) "temp";
   modify->add_compute(3,args);
   temperature = modify->compute[modify->ncompute-1];
 
   // create Velocity class for velocity creation in dephasing
   // pass it temperature compute, loop_setting, dist_setting settings
 
   atom->check_mass();
   velocity = new Velocity(lmp);
   velocity->init_external("all");
 
   args[0] = (char *) "temp";
   args[1] = (char *) "prd_temp";
   velocity->options(2,args);
   args[0] = (char *) "loop";
   args[1] = (char *) loop_setting;
   if (loop_setting) velocity->options(2,args);
   args[0] = (char *) "dist";
   args[1] = (char *) dist_setting;
   if (dist_setting) velocity->options(2,args);
 
   // create FixEventPRD class to store event and pre-quench states
 
   args[0] = (char *) "prd_event";
   args[1] = (char *) "all";
   args[2] = (char *) "EVENT/PRD";
   modify->add_fix(3,args);
   fix_event = (FixEventPRD *) modify->fix[modify->nfix-1];
 
   // create Finish for timing output
 
   finish = new Finish(lmp);
 
   // string clean-up
 
   delete [] args;
   delete [] loop_setting;
   delete [] dist_setting;
 
   // assign FixEventPRD to event-detection compute
   // necessary so it will know atom coords at last event
 
   int icompute = modify->find_compute(id_compute);
   if (icompute < 0) error->all(FLERR,"Could not find compute ID for PRD");
   compute_event = modify->compute[icompute];
   compute_event->reset_extra_compute_fix("prd_event");
 
   // reset reneighboring criteria since will perform minimizations
 
   neigh_every = neighbor->every;
   neigh_delay = neighbor->delay;
   neigh_dist_check = neighbor->dist_check;
 
   if (neigh_every != 1 || neigh_delay != 0 || neigh_dist_check != 1) {
     if (me == 0)
       error->warning(FLERR,"Resetting reneighboring criteria during PRD");
   }
 
   neighbor->every = 1;
   neighbor->delay = 0;
   neighbor->dist_check = 1;
 
   // initialize PRD as if one long dynamics run
 
   update->whichflag = 1;
   update->nsteps = nsteps;
   update->beginstep = update->firststep = update->ntimestep;
   update->endstep = update->laststep = update->firststep + nsteps;
   update->restrict_output = 1;
   if (update->laststep < 0)
     error->all(FLERR,"Too many timesteps");
 
   lmp->init();
 
   // init minimizer settings and minimizer itself
 
   update->etol = etol;
   update->ftol = ftol;
   update->max_eval = maxeval;
 
   update->minimize->init();
 
   // cannot use PRD with a changing box
 
   if (domain->box_change)
     error->all(FLERR,"Cannot use PRD with a changing box");
 
-  // cannot use PRD with time-dependent fixes or regions or atom sorting
+  // cannot use PRD with time-dependent fixes or regions
 
   for (int i = 0; i < modify->nfix; i++)
     if (modify->fix[i]->time_depend)
       error->all(FLERR,"Cannot use PRD with a time-dependent fix defined");
 
   for (int i = 0; i < domain->nregion; i++)
     if (domain->regions[i]->dynamic_check())
       error->all(FLERR,"Cannot use PRD with a time-dependent region defined");
 
-  if (atom->sortfreq > 0)
-    error->all(FLERR,"Cannot use PRD with atom_modify sort enabled");
-
   // perform PRD simulation
 
   if (me_universe == 0 && universe->uscreen)
     fprintf(universe->uscreen,"Setting up PRD ...\n");
 
   if (me_universe == 0) {
     if (universe->uscreen)
       fprintf(universe->uscreen,"Step CPU Clock Event "
               "Correlated Coincident Replica\n");
     if (universe->ulogfile)
       fprintf(universe->ulogfile,"Step CPU Clock Event "
               "Correlated Coincident Replica\n");
   }
 
   // store hot state and quenched event for replica 0
   // use share_event() to copy that info to all replicas
   // this insures all start from same place
 
   // need this line if quench() does only setup_minimal()
   // update->minimize->setup();
 
   fix_event->store_state_quench();
   quench();
   ncoincident = 0;
   share_event(0,0,0);
 
   timer->init();
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 
   log_event();
 
   // do full init/setup since are starting all replicas after event
   // replica 0 bcasts temp to all replicas if temp_dephase is not set
 
   update->whichflag = 1;
   lmp->init();
   update->integrate->setup();
 
   if (temp_flag == 0) {
     if (universe->iworld == 0) temp_dephase = temperature->compute_scalar();
     MPI_Bcast(&temp_dephase,1,MPI_DOUBLE,universe->root_proc[0],
               universe->uworld);
   }
 
   // main loop: look for events until out of time
   // (1) dephase independently on each proc after event
   // (2) loop: dynamics, store state, quench, check event, restore state
   // (3) share and record event
 
   nbuild = ndanger = 0;
   time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0;
   bigint clock = 0;
 
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 
   int istep = 0;
 
   while (istep < nsteps) {
     dephase();
 
     if (stepmode == 0) istep = update->ntimestep - update->beginstep;
     else istep = clock;
 
     ireplica = -1;
     while (istep < nsteps) {
       dynamics(t_event,time_dynamics);
       fix_event->store_state_quench();
       quench();
       clock = clock + t_event*universe->nworlds;
       ireplica = check_event();
       if (ireplica >= 0) break;
       fix_event->restore_state_quench();
       if (stepmode == 0) istep = update->ntimestep - update->beginstep;
       else istep = clock;
     }
     if (ireplica < 0) break;
 
     // decrement clock by random time at which 1 or more events occurred
 
     int frac_t_event = t_event;
     for (int i = 0; i < fix_event->ncoincident; i++) {
       int frac_rand = static_cast<int> (random_clock->uniform() * t_event);
       frac_t_event = MIN(frac_t_event,frac_rand);
     }
     int decrement = (t_event - frac_t_event)*universe->nworlds;
     clock -= decrement;
 
     // share event across replicas
     // NOTE: would be potentially more efficient for correlated events
     //   if don't share until correlated check below has completed
     // this will complicate the dump (always on replica 0)
 
     share_event(ireplica,1,decrement);
     log_event();
 
     int restart_flag = 0;
     if (output->restart_flag && universe->iworld == 0) {
       if (output->restart_every_single &&
           fix_event->event_number % output->restart_every_single == 0)
         restart_flag = 1;
       if (output->restart_every_double &&
           fix_event->event_number % output->restart_every_double == 0)
         restart_flag = 1;
     }
 
     // correlated event loop
     // other procs could be dephasing during this time
 
     int corr_endstep = update->ntimestep + t_corr;
     while (update->ntimestep < corr_endstep) {
       if (update->ntimestep == update->endstep) {
         restart_flag = 0;
         break;
       }
       dynamics(t_event,time_dynamics);
       fix_event->store_state_quench();
       quench();
       clock += t_event;
       int corr_event_check = check_event(ireplica);
       if (corr_event_check >= 0) {
         share_event(ireplica,2,0);
         log_event();
         corr_endstep = update->ntimestep + t_corr;
       } else fix_event->restore_state_quench();
     }
 
     // full init/setup since are starting all replicas after event
     // event replica bcasts temp to all replicas if temp_dephase is not set
 
     update->whichflag = 1;
     lmp->init();
     update->integrate->setup();
 
     timer->barrier_start();
 
     if (t_corr > 0) replicate(ireplica);
     if (temp_flag == 0) {
       if (ireplica == universe->iworld)
         temp_dephase = temperature->compute_scalar();
       MPI_Bcast(&temp_dephase,1,MPI_DOUBLE,universe->root_proc[ireplica],
                       universe->uworld);
     }
 
     timer->barrier_stop();
     time_comm += timer->get_wall(Timer::TOTAL);
 
     // write restart file of hot coords
 
     if (restart_flag) {
       timer->barrier_start();
       output->write_restart(update->ntimestep);
       timer->barrier_stop();
       time_output += timer->get_wall(Timer::TOTAL);
     }
 
     if (stepmode == 0) istep = update->ntimestep - update->beginstep;
     else istep = clock;
   }
 
   if (stepmode) nsteps = update->ntimestep - update->beginstep;
 
   // set total timers and counters so Finish() will process them
 
   timer->set_wall(Timer::TOTAL, time_start);
   timer->barrier_stop();
 
   timer->set_wall(Timer::DEPHASE, time_dephase);
   timer->set_wall(Timer::DYNAMICS, time_dynamics);
   timer->set_wall(Timer::QUENCH, time_quench);
   timer->set_wall(Timer::REPCOMM, time_comm);
   timer->set_wall(Timer::REPOUT, time_output);
 
   neighbor->ncalls = nbuild;
   neighbor->ndanger = ndanger;
 
   if (me_universe == 0) {
     if (universe->uscreen)
       fprintf(universe->uscreen,
               "Loop time of %g on %d procs for %d steps with " BIGINT_FORMAT
               " atoms\n",
-              timer->get_wall(Timer::TOTAL),nprocs_universe,nsteps,atom->natoms);
+              timer->get_wall(Timer::TOTAL),nprocs_universe,
+              nsteps,atom->natoms);
     if (universe->ulogfile)
       fprintf(universe->ulogfile,
               "Loop time of %g on %d procs for %d steps with " BIGINT_FORMAT
               " atoms\n",
-              timer->get_wall(Timer::TOTAL),nprocs_universe,nsteps,atom->natoms);
+              timer->get_wall(Timer::TOTAL),nprocs_universe,
+              nsteps,atom->natoms);
   }
 
   if (me == 0) {
     if (screen) fprintf(screen,"\nPRD done\n");
     if (logfile) fprintf(logfile,"\nPRD done\n");
   }
 
   finish->end(2);
 
   update->whichflag = 0;
   update->firststep = update->laststep = 0;
   update->beginstep = update->endstep = 0;
   update->restrict_output = 0;
 
   // reset reneighboring criteria
 
   neighbor->every = neigh_every;
   neighbor->delay = neigh_delay;
   neighbor->dist_check = neigh_dist_check;
 
   // clean up
 
-  delete [] displacements;
   memory->destroy(tagall);
   memory->destroy(xall);
   memory->destroy(imageall);
+  memory->destroy(counts);
+  memory->destroy(displacements);
 
   delete [] id_compute;
   MPI_Comm_free(&comm_replica);
   delete random_select;
   delete random_clock;
   delete random_dephase;
   delete velocity;
   delete finish;
   modify->delete_compute("prd_temp");
   modify->delete_fix("prd_event");
 
   compute_event->reset_extra_compute_fix(NULL);
 }
 
 /* ----------------------------------------------------------------------
    dephasing = one or more short runs with new random velocities
 ------------------------------------------------------------------------- */
 
 void PRD::dephase()
 {
   bigint ntimestep_hold = update->ntimestep;
 
   // n_dephase iterations of dephasing, each of t_dephase steps
 
   for (int i = 0; i < n_dephase; i++) {
 
     fix_event->store_state_dephase();
 
     // do not proceed to next iteration until an event-free run occurs
 
     int done = 0;
     while (!done) {
       int seed = static_cast<int> (random_dephase->uniform() * MAXSMALLINT);
       if (seed == 0) seed = 1;
       velocity->create(temp_dephase,seed);
 
       dynamics(t_dephase,time_dephase);
       fix_event->store_state_quench();
       quench();
 
       if (compute_event->compute_scalar() > 0.0) {
         fix_event->restore_state_dephase();
         update->ntimestep -= t_dephase;
         log_event();
       } else {
         fix_event->restore_state_quench();
         done = 1;
       }
 
       if (temp_flag == 0) temp_dephase = temperature->compute_scalar();
     }
   }
 
   // reset timestep as if dephase did not occur
   // clear timestep storage from computes, since now invalid
 
   update->ntimestep = ntimestep_hold;
   for (int i = 0; i < modify->ncompute; i++)
     if (modify->compute[i]->timeflag) modify->compute[i]->clearstep();
 }
 
 /* ----------------------------------------------------------------------
    short dynamics run: for event search, decorrelation, or dephasing
 ------------------------------------------------------------------------- */
 
 void PRD::dynamics(int nsteps, double &time_category)
 {
   update->whichflag = 1;
   update->nsteps = nsteps;
 
   lmp->init();
   update->integrate->setup();
   // this may be needed if don't do full init
   //modify->addstep_compute_all(update->ntimestep);
   bigint ncalls = neighbor->ncalls;
 
   timer->barrier_start();
   update->integrate->run(nsteps);
   timer->barrier_stop();
   time_category += timer->get_wall(Timer::TOTAL);
 
   nbuild += neighbor->ncalls - ncalls;
   ndanger += neighbor->ndanger;
 
   update->integrate->cleanup();
   finish->end(0);
 }
 
 /* ----------------------------------------------------------------------
    quench minimization
 ------------------------------------------------------------------------- */
 
 void PRD::quench()
 {
   bigint ntimestep_hold = update->ntimestep;
   bigint endstep_hold = update->endstep;
 
   // need to change whichflag so that minimize->setup() calling
   // modify->setup() will call fix->min_setup()
 
   update->whichflag = 2;
   update->nsteps = maxiter;
   update->endstep = update->laststep = update->firststep + maxiter;
   if (update->laststep < 0)
     error->all(FLERR,"Too many iterations");
 
   // full init works
 
   lmp->init();
   update->minimize->setup();
 
   // partial init does not work
 
   //modify->addstep_compute_all(update->ntimestep);
   //update->minimize->setup_minimal(1);
 
   int ncalls = neighbor->ncalls;
 
   timer->barrier_start();
   update->minimize->run(maxiter);
   timer->barrier_stop();
   time_quench += timer->get_wall(Timer::TOTAL);
 
   if (neighbor->ncalls == ncalls) quench_reneighbor = 0;
   else quench_reneighbor = 1;
 
   update->minimize->cleanup();
   finish->end(0);
 
   // reset timestep as if quench did not occur
   // clear timestep storage from computes, since now invalid
 
   update->ntimestep = ntimestep_hold;
   update->endstep = update->laststep = endstep_hold;
   for (int i = 0; i < modify->ncompute; i++)
     if (modify->compute[i]->timeflag) modify->compute[i]->clearstep();
 }
 
 /* ----------------------------------------------------------------------
    check for an event in any replica
    if replica_num is non-negative only check for event on replica_num
    if multiple events, choose one at random
    return -1 if no event
    else return ireplica = world in which event occured
 ------------------------------------------------------------------------- */
 
 int PRD::check_event(int replica_num)
 {
   int worldflag,universeflag,scanflag,replicaflag,ireplica;
 
   worldflag = 0;
   if (compute_event->compute_scalar() > 0.0) worldflag = 1;
   if (replica_num >= 0 && replica_num != universe->iworld) worldflag = 0;
 
   timer->barrier_start();
 
   if (me == 0) MPI_Allreduce(&worldflag,&universeflag,1,
                              MPI_INT,MPI_SUM,comm_replica);
   MPI_Bcast(&universeflag,1,MPI_INT,0,world);
 
   ncoincident = universeflag;
 
   if (!universeflag) ireplica = -1;
   else {
 
     // multiple events, choose one at random
     // iwhich = random # from 1 to N, N = # of events to choose from
     // scanflag = 1 to N on replicas with an event, 0 on non-event replicas
     // exit with worldflag = 1 on chosen replica, 0 on all others
     // note worldflag is already 0 on replicas that didn't perform event
 
     if (universeflag > 1) {
       int iwhich = static_cast<int>
         (universeflag*random_select->uniform()) + 1;
 
       if (me == 0)
         MPI_Scan(&worldflag,&scanflag,1,MPI_INT,MPI_SUM,comm_replica);
       MPI_Bcast(&scanflag,1,MPI_INT,0,world);
 
       if (scanflag != iwhich) worldflag = 0;
     }
 
     if (worldflag) replicaflag = universe->iworld;
     else replicaflag = 0;
 
     if (me == 0) MPI_Allreduce(&replicaflag,&ireplica,1,
                                MPI_INT,MPI_SUM,comm_replica);
     MPI_Bcast(&ireplica,1,MPI_INT,0,world);
   }
 
   timer->barrier_stop();
   time_comm += timer->get_wall(Timer::TOTAL);
 
   return ireplica;
 }
 
 /* ----------------------------------------------------------------------
    share quenched and hot coords owned by ireplica with all replicas
    all replicas store event in fix_event
    replica 0 dumps event snapshot
    flag = 0 = called before PRD run
    flag = 1 = called during PRD run = not correlated event
    flag = 2 = called during PRD run = correlated event
 ------------------------------------------------------------------------- */
 
 void PRD::share_event(int ireplica, int flag, int decrement)
 {
   timer->barrier_start();
 
   // communicate quenched coords to all replicas and store as event
   // decrement event counter if flag = 0 since not really an event
 
   replicate(ireplica);
   timer->barrier_stop();
   time_comm += timer->get_wall(Timer::TOTAL);
 
   // adjust time for last correlated event check (not on first event)
 
   int corr_adjust = t_corr;
   if (fix_event->event_number < 1 || flag == 2) corr_adjust = 0;
 
   // delta = time since last correlated event check
 
   int delta = update->ntimestep - fix_event->event_timestep - corr_adjust;
 
   // if this is a correlated event, time elapsed only on one partition
 
   if (flag != 2) delta *= universe->nworlds;
   if (delta > 0 && flag != 2) delta -= decrement;
   delta += corr_adjust;
 
   // delta passed to store_event_prd() should make its clock update
   //   be consistent with clock in main PRD loop
   // don't change the clock or timestep if this is a restart
 
   if (flag == 0 && fix_event->event_number != 0)
     fix_event->store_event_prd(fix_event->event_timestep,0);
   else {
     fix_event->store_event_prd(update->ntimestep,delta);
     fix_event->replica_number = ireplica;
     fix_event->correlated_event = 0;
     if (flag == 2) fix_event->correlated_event = 1;
     fix_event->ncoincident = ncoincident;
   }
   if (flag == 0) fix_event->event_number--;
 
   // dump snapshot of quenched coords, only on replica 0
   // must reneighbor and compute forces before dumping
   // since replica 0 possibly has new state from another replica
   // addstep_compute_all insures eng/virial are calculated if needed
 
   if (output->ndump && universe->iworld == 0) {
     timer->barrier_start();
     modify->addstep_compute_all(update->ntimestep);
     update->integrate->setup_minimal(1);
     output->write_dump(update->ntimestep);
     timer->barrier_stop();
     time_output += timer->get_wall(Timer::TOTAL);
   }
 
   // restore and communicate hot coords to all replicas
 
   fix_event->restore_state_quench();
   timer->barrier_start();
   replicate(ireplica);
   timer->barrier_stop();
   time_comm += timer->get_wall(Timer::TOTAL);
 }
 
 /* ----------------------------------------------------------------------
    universe proc 0 prints event info
 ------------------------------------------------------------------------- */
 
 void PRD::log_event()
 {
   timer->set_wall(Timer::TOTAL, time_start);
   if (universe->me == 0) {
     if (universe->uscreen)
       fprintf(universe->uscreen,
               BIGINT_FORMAT " %.3f " BIGINT_FORMAT " %d %d %d %d\n",
               fix_event->event_timestep,
               timer->elapsed(Timer::TOTAL),
               fix_event->clock,
               fix_event->event_number,fix_event->correlated_event,
               fix_event->ncoincident,
               fix_event->replica_number);
     if (universe->ulogfile)
       fprintf(universe->ulogfile,
               BIGINT_FORMAT " %.3f " BIGINT_FORMAT " %d %d %d %d\n",
               fix_event->event_timestep,
               timer->elapsed(Timer::TOTAL),
               fix_event->clock,
               fix_event->event_number,fix_event->correlated_event,
               fix_event->ncoincident,
               fix_event->replica_number);
   }
 }
 
 /* ----------------------------------------------------------------------
   communicate atom coords and image flags in ireplica to all other replicas
   if one proc per replica:
     direct overwrite via bcast
   else atoms could be stored in different order on a proc or on different procs:
     gather to root proc of event replica
     bcast to roots of other replicas
     bcast within each replica
     each proc extracts info for atoms it owns using atom IDs
 ------------------------------------------------------------------------- */
 
 void PRD::replicate(int ireplica)
 {
   int nreplica = universe->nworlds;
   int nprocs_universe = universe->nprocs;
   int i,m;
 
-  if (nreplica == nprocs_universe) {
-    MPI_Bcast(atom->image,atom->nlocal,MPI_INT,ireplica,comm_replica);
-    MPI_Bcast(atom->x[0],3*atom->nlocal,MPI_DOUBLE,ireplica,comm_replica);
-
-  } else {
-    int *counts = new int[nprocs];
+  // -----------------------------------------------------
+  // 3 cases: two for single proc per replica
+  //          one for multiple procs per replica
+  // -----------------------------------------------------
 
-    if (universe->iworld == ireplica) {
-      MPI_Gather(&atom->nlocal,1,MPI_INT,counts,1,MPI_INT,0,world);
-      displacements[0] = 0;
-      for (i = 0; i < nprocs-1; i++)
-        displacements[i+1] = displacements[i] + counts[i];
-      MPI_Gatherv(atom->tag,atom->nlocal,MPI_LMP_TAGINT,
-                  tagall,counts,displacements,MPI_LMP_TAGINT,0,world);
-      MPI_Gatherv(atom->image,atom->nlocal,MPI_INT,
-                        imageall,counts,displacements,MPI_INT,0,world);
-      for (i = 0; i < nprocs; i++) counts[i] *= 3;
-      for (i = 0; i < nprocs-1; i++)
-        displacements[i+1] = displacements[i] + counts[i];
-      MPI_Gatherv(atom->x[0],3*atom->nlocal,MPI_DOUBLE,
-                        xall[0],counts,displacements,MPI_DOUBLE,0,world);
-    }
+  // single proc per replica, no atom sorting
+  // direct bcast of image and x
 
-    if (me == 0) {
-      MPI_Bcast(tagall,natoms,MPI_INT,ireplica,comm_replica);
-      MPI_Bcast(imageall,natoms,MPI_INT,ireplica,comm_replica);
-      MPI_Bcast(xall[0],3*natoms,MPI_DOUBLE,ireplica,comm_replica);
-    }
+  if (cmode == SINGLE_PROC_DIRECT) {
+    MPI_Bcast(atom->x[0],3*atom->nlocal,MPI_DOUBLE,ireplica,comm_replica);
+    MPI_Bcast(atom->image,atom->nlocal,MPI_INT,ireplica,comm_replica);
+    return;
+  }
 
-    MPI_Bcast(tagall,natoms,MPI_INT,0,world);
-    MPI_Bcast(imageall,natoms,MPI_INT,0,world);
-    MPI_Bcast(xall[0],3*natoms,MPI_DOUBLE,0,world);
+  // single proc per replica, atom sorting is enabled
+  // bcast atom IDs, x, image via tagall, xall, imageall
+  // recv procs use atom->map() to match received info to owned atoms
 
+  if (cmode == SINGLE_PROC_MAP) {
     double **x = atom->x;
+    tagint *tag = atom->tag;
+    imageint *image = atom->image;
     int nlocal = atom->nlocal;
 
-    for (i = 0; i < natoms; i++) {
+    if (universe->iworld == ireplica) {
+      memcpy(tagall,tag,nlocal*sizeof(tagint));
+      memcpy(xall[0],x[0],3*nlocal*sizeof(double));
+      memcpy(imageall,image,nlocal*sizeof(imageint));
+    }
+
+    MPI_Bcast(tagall,natoms,MPI_INT,ireplica,comm_replica);
+    MPI_Bcast(xall[0],3*natoms,MPI_DOUBLE,ireplica,comm_replica);
+    MPI_Bcast(imageall,natoms,MPI_INT,ireplica,comm_replica);
+
+    for (i = 0; i < nlocal; i++) {
       m = atom->map(tagall[i]);
-      if (m >= 0 && m < nlocal) {
-        x[m][0] = xall[i][0];
-        x[m][1] = xall[i][1];
-        x[m][2] = xall[i][2];
-        atom->image[m] = imageall[i];
-      }
+      x[m][0] = xall[i][0];
+      x[m][1] = xall[i][1];
+      x[m][2] = xall[i][2];
+      atom->image[m] = imageall[i];
     }
 
-    delete [] counts;
+    return;
+  }
+
+  // multiple procs per replica
+  // MPI_Gather all atom IDs, x, image to root proc of ireplica
+  // bcast to root of other replicas
+  // bcast within each replica
+  // each proc extracts info for atoms it owns via atom->map()
+  // NOTE: assumes imagint and tagint are always the same size
+
+  if (universe->iworld == ireplica) {
+    MPI_Gather(&atom->nlocal,1,MPI_INT,counts,1,MPI_INT,0,world);
+    displacements[0] = 0;
+    for (i = 0; i < nprocs-1; i++)
+      displacements[i+1] = displacements[i] + counts[i];
+    MPI_Gatherv(atom->tag,atom->nlocal,MPI_LMP_TAGINT,
+                tagall,counts,displacements,MPI_LMP_TAGINT,0,world);
+    MPI_Gatherv(atom->image,atom->nlocal,MPI_LMP_TAGINT,
+                imageall,counts,displacements,MPI_LMP_TAGINT,0,world);
+    for (i = 0; i < nprocs; i++) counts[i] *= 3;
+    for (i = 0; i < nprocs-1; i++)
+      displacements[i+1] = displacements[i] + counts[i];
+    MPI_Gatherv(atom->x[0],3*atom->nlocal,MPI_DOUBLE,
+                xall[0],counts,displacements,MPI_DOUBLE,0,world);
+  }
+  
+  if (me == 0) {
+    MPI_Bcast(tagall,natoms,MPI_INT,ireplica,comm_replica);
+    MPI_Bcast(imageall,natoms,MPI_INT,ireplica,comm_replica);
+    MPI_Bcast(xall[0],3*natoms,MPI_DOUBLE,ireplica,comm_replica);
+  }
+  
+  MPI_Bcast(tagall,natoms,MPI_INT,0,world);
+  MPI_Bcast(imageall,natoms,MPI_INT,0,world);
+  MPI_Bcast(xall[0],3*natoms,MPI_DOUBLE,0,world);
+  
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+  
+  for (i = 0; i < natoms; i++) {
+    m = atom->map(tagall[i]);
+    if (m < 0 || m >= nlocal) continue;
+    x[m][0] = xall[i][0];
+    x[m][1] = xall[i][1];
+    x[m][2] = xall[i][2];
+    atom->image[m] = imageall[i];
   }
 }
 
 /* ----------------------------------------------------------------------
    parse optional parameters at end of PRD input line
 ------------------------------------------------------------------------- */
 
 void PRD::options(int narg, char **arg)
 {
   if (narg < 0) error->all(FLERR,"Illegal prd command");
 
   // set defaults
 
   etol = 0.1;
   ftol = 0.1;
   maxiter = 40;
   maxeval = 50;
   temp_flag = 0;
   stepmode = 0;
 
   char *str = (char *) "geom";
   int n = strlen(str) + 1;
   loop_setting = new char[n];
   strcpy(loop_setting,str);
 
   str = (char *) "gaussian";
   n = strlen(str) + 1;
   dist_setting = new char[n];
   strcpy(dist_setting,str);
 
   int iarg = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"min") == 0) {
       if (iarg+5 > narg) error->all(FLERR,"Illegal prd command");
       etol = force->numeric(FLERR,arg[iarg+1]);
       ftol = force->numeric(FLERR,arg[iarg+2]);
       maxiter = force->inumeric(FLERR,arg[iarg+3]);
       maxeval = force->inumeric(FLERR,arg[iarg+4]);
       if (maxiter < 0) error->all(FLERR,"Illegal prd command");
       iarg += 5;
 
     } else if (strcmp(arg[iarg],"temp") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal prd command");
       temp_flag = 1;
       temp_dephase = force->numeric(FLERR,arg[iarg+1]);
       if (temp_dephase <= 0.0) error->all(FLERR,"Illegal prd command");
       iarg += 2;
 
     } else if (strcmp(arg[iarg],"vel") == 0) {
       if (iarg+3 > narg) error->all(FLERR,"Illegal prd command");
       delete [] loop_setting;
       delete [] dist_setting;
 
       if (strcmp(arg[iarg+1],"all") == 0) loop_setting = NULL;
       else if (strcmp(arg[iarg+1],"local") == 0) loop_setting = NULL;
       else if (strcmp(arg[iarg+1],"geom") == 0) loop_setting = NULL;
       else error->all(FLERR,"Illegal prd command");
       int n = strlen(arg[iarg+1]) + 1;
       loop_setting = new char[n];
       strcpy(loop_setting,arg[iarg+1]);
 
       if (strcmp(arg[iarg+2],"uniform") == 0) dist_setting = NULL;
       else if (strcmp(arg[iarg+2],"gaussian") == 0) dist_setting = NULL;
       else error->all(FLERR,"Illegal prd command");
       n = strlen(arg[iarg+2]) + 1;
       dist_setting = new char[n];
       strcpy(dist_setting,arg[iarg+2]);
 
       iarg += 3;
 
     } else if (strcmp(arg[iarg],"time") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal prd command");
       if (strcmp(arg[iarg+1],"steps") == 0) stepmode = 0;
       else if (strcmp(arg[iarg+1],"clock") == 0) stepmode = 1;
       else error->all(FLERR,"Illegal prd command");
       iarg += 2;
 
     } else error->all(FLERR,"Illegal prd command");
   }
 }
diff --git a/src/REPLICA/prd.h b/src/REPLICA/prd.h
index f74facd5d..cc866007d 100644
--- a/src/REPLICA/prd.h
+++ b/src/REPLICA/prd.h
@@ -1,148 +1,149 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef COMMAND_CLASS
 
 CommandStyle(prd,PRD)
 
 #else
 
 #ifndef LMP_PRD_H
 #define LMP_PRD_H
 
 #include "pointers.h"
 
 namespace LAMMPS_NS {
 
 class PRD : protected Pointers {
  public:
   PRD(class LAMMPS *);
   ~PRD() {}
   void command(int, char **);
 
  private:
   int me,nprocs;
   int t_event,n_dephase,t_dephase,t_corr;
   double etol,ftol,temp_dephase;
-  int maxiter,maxeval,temp_flag,stepmode;
+  int maxiter,maxeval,temp_flag,stepmode,cmode;
   char *loop_setting,*dist_setting;
 
   int equal_size_replicas,natoms;
   int neigh_every,neigh_delay,neigh_dist_check;
   int quench_reneighbor;
   bigint nbuild,ndanger;
 
   double time_dephase,time_dynamics,time_quench,time_comm,time_output;
   double time_start;
 
   MPI_Comm comm_replica;
+  int *counts,*displacements;
   tagint *tagall;
-  int *displacements,*imageall;
   double **xall;
+  imageint *imageall;
 
   int ncoincident;
 
   class RanPark *random_select,*random_clock;
   class RanMars *random_dephase;
   class Compute *compute_event;
   class FixEventPRD *fix_event;
   class Velocity *velocity;
   class Compute *temperature;
   class Finish *finish;
 
   void dephase();
   void dynamics(int, double &);
   void quench();
   int check_event(int replica = -1);
   void share_event(int, int, int);
   void log_event();
   void replicate(int);
   void options(int, char **);
 };
 
 }
 
 #endif
 #endif
 
 /* ERROR/WARNING messages:
 
 E: PRD command before simulation box is defined
 
 The prd command cannot be used before a read_data,
 read_restart, or create_box command.
 
 E: Cannot use PRD with multi-processor replicas unless atom map exists
 
 Use the atom_modify command to create an atom map.
 
 W: Running PRD with only one replica
 
 This is allowed, but you will get no parallel speed-up.
 
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
 E: Invalid t_event in prd command
 
 Self-explanatory.
 
 E: PRD nsteps must be multiple of t_event
 
 Self-explanatory.
 
 E: PRD t_corr must be multiple of t_event
 
 Self-explanatory.
 
 E: Could not find compute ID for PRD
 
 Self-explanatory.
 
 W: Resetting reneighboring criteria during PRD
 
 A PRD simulation requires that neigh_modify settings be delay = 0,
 every = 1, check = yes.  Since these settings were not in place,
 LAMMPS changed them and will restore them to their original values
 after the PRD simulation.
 
 E: Too many timesteps
 
 The cummulative timesteps must fit in a 64-bit integer.
 
 E: Cannot use PRD with a changing box
 
 The current box dimensions are not copied between replicas
 
 E: Cannot use PRD with a time-dependent fix defined
 
 PRD alters the timestep in ways that will mess up these fixes.
 
 E: Cannot use PRD with a time-dependent region defined
 
 PRD alters the timestep in ways that will mess up these regions.
 
 E: Cannot use PRD with atom_modify sort enabled
 
 This is a current restriction of PRD.  You must turn off sorting,
 which is enabled by default, via the atom_modify command.
 
 E: Too many iterations
 
 You must use a number of iterations that fit in a 32-bit integer
 for minimization.
 
 */
diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp
index 871901293..6c20deed6 100644
--- a/src/REPLICA/tad.cpp
+++ b/src/REPLICA/tad.cpp
@@ -1,1045 +1,1045 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Aidan Thompson (SNL)
 ------------------------------------------------------------------------- */
 
 // lmptype.h must be first b/c this file uses MAXBIGINT and includes mpi.h
 // due to OpenMPI bug which sets INT64_MAX via its mpi.h
 //   before lmptype.h can set flags to insure it is done correctly
 
 #include "lmptype.h"
 #include <mpi.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include "tad.h"
 #include "universe.h"
 #include "update.h"
 #include "atom.h"
 #include "domain.h"
 #include "region.h"
 #include "comm.h"
 #include "velocity.h"
 #include "integrate.h"
 #include "min.h"
 #include "neighbor.h"
 #include "modify.h"
 #include "neb.h"
 #include "compute.h"
 #include "fix.h"
 #include "fix_event_tad.h"
 #include "fix_store.h"
 #include "force.h"
 #include "pair.h"
 #include "random_park.h"
 #include "random_mars.h"
 #include "output.h"
 #include "dump.h"
 #include "finish.h"
 #include "timer.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 TAD::TAD(LAMMPS *lmp) : Pointers(lmp) {}
 
 /* ---------------------------------------------------------------------- */
 
 TAD::~TAD()
 {
   memory->sfree(fix_event_list);
   if (neb_logfilename != NULL) delete [] neb_logfilename;
   delete [] min_style;
   delete [] min_style_neb;
 }
 
 /* ----------------------------------------------------------------------
    perform TAD simulation on root proc
    other procs only used for NEB calcs
 ------------------------------------------------------------------------- */
 
 void TAD::command(int narg, char **arg)
 {
   fix_event_list = NULL;
   n_event_list = 0;
   nmax_event_list = 0;
   nmin_event_list = 10;
 
   // error checks
 
   if (domain->box_exist == 0)
     error->all(FLERR,"Tad command before simulation box is defined");
   if (universe->nworlds == 1)
     error->all(FLERR,"Cannot use TAD with a single replica for NEB");
   if (universe->nworlds != universe->nprocs)
     error->all(FLERR,"Can only use TAD with 1-processor replicas for NEB");
   if (atom->sortfreq > 0)
     error->all(FLERR,"Cannot use TAD with atom_modify sort enabled for NEB");
   if (atom->map_style == 0)
     error->all(FLERR,"Cannot use TAD unless atom map exists for NEB");
 
   if (narg < 7) error->universe_all(FLERR,"Illegal tad command");
 
   nsteps = force->inumeric(FLERR,arg[0]);
   t_event = force->inumeric(FLERR,arg[1]);
   templo = force->numeric(FLERR,arg[2]);
   temphi = force->numeric(FLERR,arg[3]);
   delta_conf = force->numeric(FLERR,arg[4]);
   tmax = force->numeric(FLERR,arg[5]);
 
   char *id_compute = new char[strlen(arg[6])+1];
   strcpy(id_compute,arg[6]);
 
   // quench minimizer is set by min_style command
   // NEB minimizer is set by options, default = quickmin
 
   int n = strlen(update->minimize_style) + 1;
   min_style = new char[n];
   strcpy(min_style,update->minimize_style);
 
   options(narg-7,&arg[7]);
 
   // total # of timesteps must be multiple of t_event
 
   if (t_event <= 0) error->universe_all(FLERR,"Invalid t_event in tad command");
   if (nsteps % t_event)
     error->universe_all(FLERR,"TAD nsteps must be multiple of t_event");
 
   if (delta_conf <= 0.0 || delta_conf >= 1.0)
     error->universe_all(FLERR,"Invalid delta_conf in tad command");
 
   if (tmax <= 0.0)
     error->universe_all(FLERR,"Invalid tmax in tad command");
 
   // deltconf = (ln(1/delta))/freq_min (timestep units)
 
   deltconf = -log(delta_conf)*tmax/update->dt;
 
   // local storage
 
   int me_universe = universe->me;
   int nprocs_universe = universe->nprocs;
 
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
   delta_beta = (1.0/templo - 1.0/temphi) / force->boltz;
   ratio_beta = templo/temphi;
 
   // create FixEventTAD object to store last event
 
   int narg2 = 3;
   char **args = new char*[narg2];
   args[0] = (char *) "tad_event";
   args[1] = (char *) "all";
   args[2] = (char *) "EVENT/TAD";
   modify->add_fix(narg2,args);
   fix_event = (FixEventTAD *) modify->fix[modify->nfix-1];
   delete [] args;
 
   // create FixStore object to store revert state
 
   narg2 = 6;
   args = new char*[narg2];
   args[0] = (char *) "tad_revert";
   args[1] = (char *) "all";
   args[2] = (char *) "STORE";
   args[3] = (char *) "peratom";
   args[4] = (char *) "0";
   args[5] = (char *) "7";
   modify->add_fix(narg2,args);
   fix_revert = (FixStore *) modify->fix[modify->nfix-1];
   delete [] args;
 
   // create Finish for timing output
 
   finish = new Finish(lmp);
 
   // assign FixEventTAD to event-detection compute
   // necessary so it will know atom coords at last event
 
   int icompute = modify->find_compute(id_compute);
   if (icompute < 0) error->all(FLERR,"Could not find compute ID for TAD");
   compute_event = modify->compute[icompute];
   compute_event->reset_extra_compute_fix("tad_event");
 
   // reset reneighboring criteria since will perform minimizations
 
   neigh_every = neighbor->every;
   neigh_delay = neighbor->delay;
   neigh_dist_check = neighbor->dist_check;
 
   if (neigh_every != 1 || neigh_delay != 0 || neigh_dist_check != 1) {
     if (me_universe == 0)
       error->warning(FLERR,"Resetting reneighboring criteria during TAD");
   }
 
   neighbor->every = 1;
   neighbor->delay = 0;
   neighbor->dist_check = 1;
 
   // initialize TAD as if one long dynamics run
 
   update->whichflag = 1;
   update->nsteps = nsteps;
   update->beginstep = update->firststep = update->ntimestep;
   update->endstep = update->laststep = update->firststep + nsteps;
   update->restrict_output = 1;
   if (update->laststep < 0)
     error->all(FLERR,"Too many timesteps");
 
   lmp->init();
 
   // set minimize style for quench
 
   narg2 = 1;
   args = new char*[narg2];
   args[0] = min_style;
 
   update->create_minimize(narg2,args);
 
   delete [] args;
 
   // init minimizer settings and minimizer itself
 
   update->etol = etol;
   update->ftol = ftol;
   update->max_eval = maxeval;
 
   update->minimize->init();
 
   // perform TAD simulation
 
   if (me_universe == 0 && universe->uscreen)
     fprintf(universe->uscreen,"Setting up TAD ...\n");
 
   if (me_universe == 0) {
     if (universe->uscreen)
       fprintf(universe->uscreen,
               "Step CPU N M Status Barrier Margin t_lo delt_lo\n");
     if (universe->ulogfile)
       fprintf(universe->ulogfile,
               "Step CPU N M Status Barrier Margin t_lo delt_lo\n");
   }
 
   ulogfile_lammps = universe->ulogfile;
   uscreen_lammps = universe->uscreen;
   ulogfile_neb = NULL;
   uscreen_neb = NULL;
   if (me_universe == 0 && neb_logfilename)
     ulogfile_neb = fopen(neb_logfilename,"w");
 
   // store hot state and quenched event, only on replica 0
 
   // need this line if quench() does only setup_minimal()
   // update->minimize->setup();
 
   // This should work with if uncommented, but does not
   // if (universe->iworld == 0) {
 
   fix_event->store_state_quench();
   quench();
 
   timer->init();
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
   fix_event->store_event_tad(update->ntimestep);
   log_event(0);
   fix_event->restore_state_quench();
 
   // do full init/setup
 
   update->whichflag = 1;
   lmp->init();
   update->integrate->setup();
 
   // main loop: look for events until out of time
   // (1) dynamics, store state, quench, check event, restore state
   // (2) if event, perform NEB, record in fix_event_list
   // (3) if confident, pick earliest event
 
   nbuild = ndanger = 0;
   time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0;
 
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 
   int confident_flag, event_flag;
 
   if (universe->iworld == 0) {
     while (update->ntimestep < update->endstep) {
 
       // initialize list of possible events
 
       initialize_event_list();
       confident_flag = 0;
 
       while (update->ntimestep < update->endstep) {
         event_flag = 0;
         while (update->ntimestep < update->endstep) {
 
           dynamics();
           fix_event->store_state_quench();
           quench();
 
           event_flag = check_event();
           MPI_Bcast(&event_flag,1,MPI_INT,0,universe->uworld);
 
           if (event_flag) break;
 
           // restore hot state
 
           fix_event->restore_state_quench();
 
           // store hot state in revert
 
           store_state();
         }
         if (!event_flag) break;
 
         add_event();
 
         perform_neb(n_event_list-1);
         compute_tlo(n_event_list-1);
         confident_flag = check_confidence();
         MPI_Bcast(&confident_flag,1,MPI_INT,0,universe->uworld);
         if (confident_flag) break;
         if (universe->iworld == 0) revert_state();
       }
       if (!confident_flag) break;
 
       perform_event(event_first);
 
       // need to sync timestep with TAD
 
       MPI_Bcast(&(update->ntimestep),1,MPI_INT,0,universe->uworld);
 
       int restart_flag = 0;
       if (output->restart_flag && universe->iworld == 0) {
         if (output->restart_every_single &&
             fix_event->event_number % output->restart_every_single == 0)
           restart_flag = 1;
         if (output->restart_every_double &&
             fix_event->event_number % output->restart_every_double == 0)
           restart_flag = 1;
       }
 
       // full init/setup since are starting after event
 
       update->whichflag = 1;
       lmp->init();
       update->integrate->setup();
 
     // write restart file of hot coords
 
       if (restart_flag) {
         timer->barrier_start();
         output->write_restart(update->ntimestep);
         timer->barrier_stop();
         time_output += timer->get_wall(Timer::TOTAL);
       }
     }
 
   } else {
 
     while (update->ntimestep < update->endstep) {
       confident_flag = 0;
       while (update->ntimestep < update->endstep) {
         event_flag = 0;
         while (update->ntimestep < update->endstep) {
           update->ntimestep += t_event;
           MPI_Bcast(&event_flag,1,MPI_INT,0,universe->uworld);
 
           if (event_flag) break;
         }
         if (!event_flag) break;
         perform_neb(-1);
         MPI_Bcast(&confident_flag,1,MPI_INT,0,universe->uworld);
         if (confident_flag) break;
       }
       if (!confident_flag) break;
 
       // need to sync timestep with TAD
 
       MPI_Bcast(&(update->ntimestep),1,MPI_INT,0,universe->uworld);
     }
   }
 
   // set total timers and counters so Finish() will process them
 
   timer->set_wall(Timer::TOTAL, time_start);
   timer->barrier_stop();
 
   timer->set_wall(Timer::NEB, time_neb);
   timer->set_wall(Timer::DYNAMICS, time_dynamics);
   timer->set_wall(Timer::QUENCH, time_quench);
   timer->set_wall(Timer::REPCOMM, time_comm);
   timer->set_wall(Timer::REPOUT, time_output);
 
   neighbor->ncalls = nbuild;
   neighbor->ndanger = ndanger;
 
   if (me_universe == 0) {
     if (universe->uscreen)
       fprintf(universe->uscreen,
               "Loop time of %g on %d procs for %d steps with " BIGINT_FORMAT
               " atoms\n",
               timer->get_wall(Timer::TOTAL),nprocs_universe,nsteps,atom->natoms);
     if (universe->ulogfile)
       fprintf(universe->ulogfile,
               "Loop time of %g on %d procs for %d steps with " BIGINT_FORMAT
               " atoms\n",
               timer->get_wall(Timer::TOTAL),nprocs_universe,nsteps,atom->natoms);
   }
 
   if ((me_universe == 0) && ulogfile_neb) fclose(ulogfile_neb);
 
   if (me == 0) {
     if (screen) fprintf(screen,"\nTAD done\n");
     if (logfile) fprintf(logfile,"\nTAD done\n");
   }
 
   finish->end(3);
 
   update->whichflag = 0;
   update->firststep = update->laststep = 0;
   update->beginstep = update->endstep = 0;
   update->restrict_output = 0;
 
   // reset reneighboring criteria
 
   neighbor->every = neigh_every;
   neighbor->delay = neigh_delay;
   neighbor->dist_check = neigh_dist_check;
 
 
   delete [] id_compute;
   delete finish;
   modify->delete_fix("tad_event");
   modify->delete_fix("tad_revert");
   delete_event_list();
 
   compute_event->reset_extra_compute_fix(NULL);
 }
 
 /* ----------------------------------------------------------------------
    single short dynamics run
 ------------------------------------------------------------------------- */
 
 void TAD::dynamics()
 {
   update->whichflag = 1;
   update->nsteps = t_event;
 
   lmp->init();
   update->integrate->setup();
   // this may be needed if don't do full init
   //modify->addstep_compute_all(update->ntimestep);
   int ncalls = neighbor->ncalls;
 
   timer->barrier_start();
   update->integrate->run(t_event);
   timer->barrier_stop();
   time_dynamics += timer->get_wall(Timer::TOTAL);
 
   nbuild += neighbor->ncalls - ncalls;
   ndanger += neighbor->ndanger;
 
   update->integrate->cleanup();
   finish->end(0);
 }
 
 /* ----------------------------------------------------------------------
    quench minimization
 ------------------------------------------------------------------------- */
 
 void TAD::quench()
 {
   bigint ntimestep_hold = update->ntimestep;
   bigint endstep_hold = update->endstep;
 
   // need to change whichflag so that minimize->setup() calling
   // modify->setup() will call fix->min_setup()
 
   update->whichflag = 2;
   update->nsteps = maxiter;
   update->endstep = update->laststep = update->firststep + maxiter;
   if (update->laststep < 0)
     error->all(FLERR,"Too many iterations");
 
   // full init works
 
   lmp->init();
   update->minimize->setup();
 
   // partial init does not work
 
   //modify->addstep_compute_all(update->ntimestep);
   //update->minimize->setup_minimal(1);
 
   int ncalls = neighbor->ncalls;
 
   timer->barrier_start();
   update->minimize->run(maxiter);
   timer->barrier_stop();
   time_quench += timer->get_wall(Timer::TOTAL);
 
   if (neighbor->ncalls == ncalls) quench_reneighbor = 0;
   else quench_reneighbor = 1;
 
   update->minimize->cleanup();
   finish->end(1);
 
   // reset timestep as if quench did not occur
   // clear timestep storage from computes, since now invalid
 
   update->ntimestep = ntimestep_hold;
   update->endstep = update->laststep = endstep_hold;
   for (int i = 0; i < modify->ncompute; i++)
     if (modify->compute[i]->timeflag) modify->compute[i]->clearstep();
 }
 
 /* ----------------------------------------------------------------------
    check for an event
    return 0 if no event
    return 1 if event
 ------------------------------------------------------------------------- */
 
 int TAD::check_event()
 {
   int flag;
 
   flag = 0;
   if (compute_event->compute_scalar() > 0.0) flag = 1;
 
   return flag;
 }
 
 /* ----------------------------------------------------------------------
    universe proc 0 prints event info
 ------------------------------------------------------------------------- */
 
 void TAD::log_event(int ievent)
 {
   timer->set_wall(Timer::TOTAL, time_start);
   if (universe->me == 0) {
     double tfrac = 0.0;
     if (universe->uscreen)
       fprintf(universe->uscreen,
               BIGINT_FORMAT " %.3f %d %d %s %.3f %.3f %.3f %.3f\n",
               fix_event->event_timestep,
               timer->elapsed(Timer::TOTAL),
               fix_event->event_number,ievent,
               "E ",
               fix_event->ebarrier,tfrac,
               fix_event->tlo,deltfirst);
     if (universe->ulogfile)
       fprintf(universe->ulogfile,
               BIGINT_FORMAT " %.3f %d %d %s %.3f %.3f %.3f %.3f\n",
               fix_event->event_timestep,
               timer->elapsed(Timer::TOTAL),
               fix_event->event_number,ievent,
               "E ",
               fix_event->ebarrier,tfrac,
               fix_event->tlo,deltfirst);
   }
 
   // dump snapshot of quenched coords
   // must reneighbor and compute forces before dumping
   // addstep_compute_all insures eng/virial are calculated if needed
 
   if (output->ndump && universe->iworld == 0) {
     timer->barrier_start();
     modify->addstep_compute_all(update->ntimestep);
     update->integrate->setup_minimal(1);
     output->write_dump(update->ntimestep);
     timer->barrier_stop();
     time_output += timer->get_wall(Timer::TOTAL);
   }
 
 }
 
 /* ----------------------------------------------------------------------
    parse optional parameters at end of TAD input line
 ------------------------------------------------------------------------- */
 
 void TAD::options(int narg, char **arg)
 {
   if (narg < 0) error->all(FLERR,"Illegal tad command");
 
   // set defaults
 
   etol = 0.1;
   ftol = 0.1;
   maxiter = 40;
   maxeval = 50;
 
   etol_neb = 0.01;
   ftol_neb = 0.01;
   n1steps_neb = 100;
   n2steps_neb = 100;
   nevery_neb = 10;
 
   int n = strlen("quickmin") + 1;
   min_style_neb = new char[n];
   strcpy(min_style_neb,"quickmin");
   dt_neb = update->dt;
   neb_logfilename = NULL;
 
   int iarg = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"min") == 0) {
       if (iarg+5 > narg) error->all(FLERR,"Illegal tad command");
       etol = force->numeric(FLERR,arg[iarg+1]);
       ftol = force->numeric(FLERR,arg[iarg+2]);
       maxiter = force->inumeric(FLERR,arg[iarg+3]);
       maxeval = force->inumeric(FLERR,arg[iarg+4]);
       if (maxiter < 0 || maxeval < 0 ||
           etol < 0.0 || ftol < 0.0 )
         error->all(FLERR,"Illegal tad command");
       iarg += 5;
 
     } else if (strcmp(arg[iarg],"neb") == 0) {
       if (iarg+6 > narg) error->all(FLERR,"Illegal tad command");
       etol_neb = force->numeric(FLERR,arg[iarg+1]);
       ftol_neb = force->numeric(FLERR,arg[iarg+2]);
       n1steps_neb = force->inumeric(FLERR,arg[iarg+3]);
       n2steps_neb = force->inumeric(FLERR,arg[iarg+4]);
       nevery_neb = force->inumeric(FLERR,arg[iarg+5]);
       if (etol_neb < 0.0 || ftol_neb < 0.0 ||
           n1steps_neb < 0 || n2steps_neb < 0 ||
           nevery_neb < 0) error->all(FLERR,"Illegal tad command");
       iarg += 6;
 
     } else if (strcmp(arg[iarg],"neb_style") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal tad command");
       delete [] min_style_neb;
       int n = strlen(arg[iarg+1]) + 1;
       min_style_neb = new char[n];
       strcpy(min_style_neb,arg[iarg+1]);
       iarg += 2;
 
     } else if (strcmp(arg[iarg],"neb_step") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal tad command");
       dt_neb = force->numeric(FLERR,arg[iarg+1]);
       if (dt_neb <= 0.0) error->all(FLERR,"Illegal tad command");
       iarg += 2;
 
     } else if (strcmp(arg[iarg],"neb_log") == 0) {
       delete [] neb_logfilename;
       if (iarg+2 > narg) error->all(FLERR,"Illegal tad command");
       if (strcmp(arg[iarg+1],"none") == 0) neb_logfilename = NULL;
       else {
         int n = strlen(arg[iarg+1]) + 1;
         neb_logfilename = new char[n];
         strcpy(neb_logfilename,arg[iarg+1]);
       }
       iarg += 2;
     } else error->all(FLERR,"Illegal tad command");
   }
 }
 
 /* ----------------------------------------------------------------------
    perform NEB calculation
 ------------------------------------------------------------------------- */
 
 void TAD::perform_neb(int ievent)
 {
 
   double **x = atom->x;
   int nlocal = atom->nlocal;
 
   double *buf_final;
   memory->create(buf_final,3*nlocal,"tad:buffinal");
 
   // set system to quenched state of event ievent
 
   if (universe->iworld == 0) {
 
     fix_event_list[ievent]->restore_event();
 
     int ii = 0;
     for (int i = 0; i < nlocal; i++) {
       buf_final[ii++] = x[i][0];
       buf_final[ii++] = x[i][1];
       buf_final[ii++] = x[i][2];
     }
   }
 
   MPI_Bcast(buf_final,3*nlocal,MPI_DOUBLE,universe->root_proc[0],
             universe->uworld);
 
   double *buf_init;
   memory->create(buf_init,3*nlocal,"tad:bufinit");
 
   // set system to quenched state of fix_event
 
   if (universe->iworld == 0) {
 
     fix_event->restore_event();
 
     int ii = 0;
     for (int i = 0; i < nlocal; i++) {
       buf_init[ii++] = x[i][0];
       buf_init[ii++] = x[i][1];
       buf_init[ii++] = x[i][2];
     }
   }
 
   MPI_Bcast(buf_init,3*nlocal,MPI_DOUBLE,universe->root_proc[0],
             universe->uworld);
 
   // create FixNEB object to support NEB
 
   int narg2 = 4;
   char **args = new char*[narg2];
   args[0] = (char *) "neb";
   args[1] = (char *) "all";
   args[2] = (char *) "neb";
   char str[128];
   args[3] = str;
   double kspring = 1.0;
   sprintf(args[3],"%f",kspring);
   modify->add_fix(narg2,args);
   fix_neb = (Fix *) modify->fix[modify->nfix-1];
   delete [] args;
 
   // switch minimize style to quickmin for NEB
 
   narg2 = 1;
   args = new char*[narg2];
   args[0] = min_style_neb;
 
   update->create_minimize(narg2,args);
 
   delete [] args;
 
   // create NEB object
 
   neb = new NEB(lmp,etol_neb,ftol_neb,n1steps_neb,
                 n2steps_neb,nevery_neb,buf_init,buf_final);
 
   // free up temporary arrays
 
   memory->destroy(buf_init);
   memory->destroy(buf_final);
 
   // run NEB with NEB timestep
 
   int beginstep_hold = update->beginstep;
   int endstep_hold = update->endstep;
   int ntimestep_hold = update->ntimestep;
   int nsteps_hold = update->nsteps;
 
   if (universe->me == 0) {
     universe->ulogfile = ulogfile_neb;
     universe->uscreen = uscreen_neb;
   }
 
   // had to bypass timer interface
   // because timer->array is reset inside neb->run()
 
   //    timer->barrier_start();
   //    neb->run();
   //    timer->barrier_stop();
   //    time_neb += timer->get_wall(Timer::TOTAL);
 
   MPI_Barrier(world);
   double time_tmp = MPI_Wtime();
 
   double dt_hold = update->dt;
   update->dt = dt_neb;
   neb->run();
   update->dt = dt_hold;
 
   MPI_Barrier(world);
   time_neb += MPI_Wtime() - time_tmp;
 
   if (universe->me == 0) {
     universe->ulogfile = ulogfile_lammps;
     universe->uscreen = uscreen_lammps;
   }
 
   // extract barrier energy from NEB
 
   if (universe->iworld == 0)
     fix_event_list[ievent]->ebarrier = neb->ebf;
 
   update->beginstep = update->firststep = beginstep_hold;
   update->endstep = update->laststep = endstep_hold;
   update->ntimestep = ntimestep_hold;
   update->nsteps = nsteps_hold;
 
   // switch minimize style back for quench
 
   narg2 = 1;
   args = new char*[narg2];
   args[0] = min_style;
 
   update->create_minimize(narg2,args);
 
   update->etol = etol;
   update->ftol = ftol;
 
   delete [] args;
 
   // clean up
 
   modify->delete_fix("neb");
   delete neb;
 }
 
 /* ----------------------------------------------------------------------
    check if confidence criterion for tstop is satisfied
    return 0 if not satisfied
    return 1 if satisfied
 ------------------------------------------------------------------------- */
 
 int TAD::check_confidence()
 {
   int flag;
 
   // update stopping time
 
   deltstop = deltconf*pow(deltfirst/deltconf, ratio_beta);
 
   flag = 0;
   if (deltstop < update->ntimestep - fix_event->event_timestep) flag = 1;
 
   return flag;
 }
 
 /* ----------------------------------------------------------------------
    store state in fix_revert
 ------------------------------------------------------------------------- */
 
 void TAD::store_state()
 {
   double **x = atom->x;
   double **v = atom->v;
   imageint *image = atom->image;
   int nlocal = atom->nlocal;
 
   double **astore = fix_revert->astore;
 
   for (int i = 0; i < nlocal; i++) {
     astore[i][0] = x[i][0];
     astore[i][1] = x[i][1];
     astore[i][2] = x[i][2];
     astore[i][3] = v[i][0];
     astore[i][4] = v[i][1];
     astore[i][5] = v[i][2];
     *((imageint *) &astore[i][6]) = image[i];
   }
 }
 
 /* ----------------------------------------------------------------------
    restore state archived in fix_revert
    flip sign of velocities to reflect back to starting state
 ------------------------------------------------------------------------- */
 
 void TAD::revert_state()
 {
   double **x = atom->x;
   double **v = atom->v;
   imageint *image = atom->image;
   int nlocal = atom->nlocal;
 
   double **astore = fix_revert->astore;
 
   for (int i = 0; i < nlocal; i++) {
     x[i][0] = astore[i][0];
     x[i][1] = astore[i][1];
     x[i][2] = astore[i][2];
     v[i][0] = -astore[i][3];
     v[i][1] = -astore[i][4];
     v[i][2] = -astore[i][5];
     image[i] = *((imageint *) &astore[i][6]);
   }
 }
 
 /* ----------------------------------------------------------------------
-   Initialize list of possible events
+   initialize list of possible events
 ------------------------------------------------------------------------- */
 
 void TAD::initialize_event_list() {
 
   // First delete old events, if any
 
   delete_event_list();
 
   // Create new list
 
   n_event_list = 0;
   grow_event_list(nmin_event_list);
 }
 
 /* ----------------------------------------------------------------------
-   Delete list of possible events
+   delete list of possible events
 ------------------------------------------------------------------------- */
 
 void TAD::delete_event_list() {
 
   for (int i = 0; i < n_event_list; i++) {
     char str[128];
     sprintf(str,"tad_event_%d",i);
     modify->delete_fix(str);
   }
   memory->sfree(fix_event_list);
   fix_event_list = NULL;
   n_event_list = 0;
   nmax_event_list = 0;
 
 }
 
 /* ----------------------------------------------------------------------
    add event
 ------------------------------------------------------------------------- */
 
 void TAD::add_event()
 {
 
   // create FixEventTAD object to store possible event
 
   int narg = 3;
   char **args = new char*[narg];
 
   char str[128];
   sprintf(str,"tad_event_%d",n_event_list);
 
   args[0] = str;
   args[1] = (char *) "all";
   args[2] = (char *) "EVENT/TAD";
   modify->add_fix(narg,args);
 
   if (n_event_list == nmax_event_list)
     grow_event_list(nmax_event_list+nmin_event_list);
   n_event_list += 1;
   int ievent = n_event_list-1;
   fix_event_list[ievent] = (FixEventTAD *) modify->fix[modify->nfix-1];
 
   // store quenched state for new event
 
   fix_event_list[ievent]->store_event_tad(update->ntimestep);
 
   // store hot state for new event
 
   fix_event->restore_state_quench();
   fix_event_list[ievent]->store_state_quench();
 
   // string clean-up
 
   delete [] args;
 
 }
 
 /* ----------------------------------------------------------------------
    compute cold time for event ievent
 ------------------------------------------------------------------------- */
 
 void TAD::compute_tlo(int ievent)
 {
   double deltlo,delthi,ebarrier;
 
   ebarrier = fix_event_list[ievent]->ebarrier;
   delthi = fix_event_list[ievent]->event_timestep
     - fix_event->event_timestep;
   deltlo = delthi*exp(ebarrier*delta_beta);
   fix_event_list[ievent]->tlo = fix_event->tlo + deltlo;
 
   // update first event
 
   char* statstr = (char *) "D ";
 
   if (ievent == 0) {
     deltfirst = deltlo;
     event_first = ievent;
     statstr = (char *) "DF";
   } else if (deltlo < deltfirst) {
     deltfirst = deltlo;
     event_first = ievent;
     statstr = (char *) "DF";
   }
 
   // first-replica output about each event
 
   timer->set_wall(Timer::TOTAL, time_start);
   if (universe->me == 0) {
     double tfrac = 0.0;
     if (ievent > 0) tfrac = delthi/deltstop;
 
     if (universe->uscreen)
       fprintf(universe->uscreen,
               BIGINT_FORMAT " %.3f %d %d %s %.3f %.3f %.3f %.3f\n",
               fix_event_list[ievent]->event_timestep,
               timer->elapsed(Timer::TOTAL),
               fix_event->event_number,
               ievent,statstr,ebarrier,tfrac,
               fix_event->tlo,deltlo);
 
     if (universe->ulogfile)
       fprintf(universe->ulogfile,
               BIGINT_FORMAT " %.3f %d %d %s %.3f %.3f %.3f %.3f\n",
               fix_event_list[ievent]->event_timestep,
               timer->elapsed(Timer::TOTAL),
               fix_event->event_number,
               ievent,statstr,ebarrier,tfrac,
               fix_event->tlo,deltlo);
   }
 
 }
 
 /* ----------------------------------------------------------------------
    perform event
 ------------------------------------------------------------------------- */
 
 void TAD::perform_event(int ievent)
 {
   // reset timestep to that of event
 
   update->ntimestep = fix_event_list[ievent]->event_timestep;
 
   // Copy event to current event
   // Should really use copy constructor for this
   fix_event->tlo = fix_event_list[ievent]->tlo;
   fix_event->ebarrier = fix_event_list[ievent]->ebarrier;
   fix_event->event_number++;
   fix_event->event_timestep = update->ntimestep;
   fix_event_list[ievent]->restore_event();
   fix_event->store_event_tad(fix_event_list[ievent]->event_timestep);
 
   // output stats and dump for quenched state
 
   log_event(ievent);
 
   // load and store hot state
 
   fix_event_list[ievent]->restore_state_quench();
   fix_event->store_state_quench();
 }
 
 /* ----------------------------------------------------------------------
    Allocate list of pointers to events
 ------------------------------------------------------------------------- */
 
 void TAD::grow_event_list(int nmax) {
   if (nmax_event_list > nmax) return;
   fix_event_list = (FixEventTAD **)
     memory->srealloc(fix_event_list,nmax*sizeof(FixEventTAD *),"tad:eventlist");
   nmax_event_list = nmax;
 }