diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp
index c142a4867..33b73568b 100644
--- a/lib/gpu/lal_buck.cpp
+++ b/lib/gpu/lal_buck.cpp
@@ -1,153 +1,153 @@
 /***************************************************************************
-                                   lal_buck.cpp
+                                  buck.cpp
                              -------------------
-                            W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the lj/cut pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifdef USE_OPENCL
 #include "buck_cl.h"
 #else
 #include "buck_ptx.h"
 #endif
 
 #include "lal_buck.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define BuckT Buck<numtyp, acctyp>
 
 extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
 BuckT::Buck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
 BuckT::~Buck() { 
   clear();
 }
  
 template <class numtyp, class acctyp>
 int BuckT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
 int BuckT::init(const int ntypes, double **host_cutsq,
            double **host_rhoinv, double **host_buck1, double **host_buck2, 
            double **host_a, double **host_c, 
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                             _screen,buck);
   if (success!=0)
     return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
 
   coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
                          host_buck1,host_buck2,host_cutsq);
 
   coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
 		         host_offset);
 
   UCL_H_Vec<double> dview;
   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   dview.view(host_special_lj,4,*(this->ucl_device));
   ucl_copy(sp_lj,dview,false);
 
   _allocated=true;
   this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes();
   return 0;
 }
 
 template <class numtyp, class acctyp>
 void BuckT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
 
   coeff1.clear();
   coeff2.clear();
   sp_lj.clear();
   this->clear_atomic();
 }
 
 template <class numtyp, class acctyp>
 double BuckT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(Buck<numtyp,acctyp>);
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void BuckT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
                           &coeff2.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
 
 template class Buck<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu
index 011a116a6..1281fef64 100644
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@@ -1,195 +1,195 @@
 // **************************************************************************
-//                                   lj.cu
+//                                   buck.cu
 //                             -------------------
-//                           W. Michael Brown (ORNL)
+//                           Trung Dac Nguyen (ORNL)
 //
-//  Device code for acceleration of the lj/cut pair style
+//  Device code for acceleration of the buck pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
-//    email                : brownw@ornl.gov
+//    email                : nguyentd@ornl.gov
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
 ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
                           __global numtyp4* coeff2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
                           const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
     for ( ; nbor<list_end; nbor+=n_stride) {
   
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
         
       int mtype=itype*lj_types+jtype;
       if (r2inv<coeff1[mtype].w) {
         numtyp r=ucl_sqrt(r2inv);
         numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
         r2inv=ucl_recip(r2inv);
         numtyp r6inv = r2inv*r2inv*r2inv;
         numtyp force = r2inv*(coeff1[mtype].y*r*rexp 
                 - coeff1[mtype].z*r6inv);
         force*=factor_lj;
       
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
           energy+=factor_lj*(e-coeff2[mtype].z); 
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                                __global numtyp4* coeff2_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
                                const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     coeff1[tid]=coeff1_in[tid];
     if (eflag>0)
       coeff2[tid]=coeff2_in[tid];
   }
   
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
   __syncthreads();
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
     for ( ; nbor<list_end; nbor+=n_stride) {
   
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
         
       if (r2inv<coeff1[mtype].w) {
         numtyp r=ucl_sqrt(r2inv);
         numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
         r2inv=ucl_recip(r2inv);
         numtyp r6inv = r2inv*r2inv*r2inv;
         numtyp force = r2inv*(coeff1[mtype].y*r*rexp 
                 - coeff1[mtype].z*r6inv);
         force*=factor_lj;
 
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
           energy+=factor_lj*(e-coeff2[mtype].z); 
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h
index b60df32fd..b0b6347c4 100644
--- a/lib/gpu/lal_buck.h
+++ b/lib/gpu/lal_buck.h
@@ -1,80 +1,80 @@
 /***************************************************************************
-                                  lal_buck.h
+                                   buck.h
                              -------------------
-                            W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the buck pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_BUCK_H
 #define LAL_BUCK_H
 
 #include "lal_base_atomic.h"
 
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
 class Buck : public BaseAtomic<numtyp, acctyp> {
  public:
   Buck();
   ~Buck(); 
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq,
            double **host_rhoinv, double **host_buck1, double **host_buck2, 
            double **host_a, double **host_c, 
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
 
   // --------------------------- TYPE DATA --------------------------
 
   /// coeff1.x = rhoinv, coeff1.y = buck1, coeff1.z = buck2, coeff1.w = cutsq
   UCL_D_Vec<numtyp4> coeff1;
   /// coeff2.x = a, coeff2.y = c, coeff2.z = offset
   UCL_D_Vec<numtyp4> coeff2;
   /// Special LJ values
   UCL_D_Vec<numtyp> sp_lj;
 
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
   /// Number of atom types 
   int _lj_types;
 
  private:
   bool _allocated;
   void loop(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif
diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp
index 86ca89cc4..42dbfb3e7 100644
--- a/lib/gpu/lal_buck_coul.cpp
+++ b/lib/gpu/lal_buck_coul.cpp
@@ -1,166 +1,166 @@
 /***************************************************************************
-                              lal_buck_coul.cpp
+                                buck_coul.cpp
                              -------------------
-                      Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the buck/coul/cut pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifdef USE_OPENCL
 #include "buck_coul_cl.h"
 #else
 #include "buck_coul_ptx.h"
 #endif
 
 #include "lal_buck_coul.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define BuckCoulT BuckCoul<numtyp, acctyp>
 
 extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
 BuckCoulT::BuckCoul() : BaseCharge<numtyp,acctyp>(), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
 BuckCoulT::~BuckCoul() { 
   clear();
 }
  
 template <class numtyp, class acctyp>
 int BuckCoulT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
 int BuckCoulT::init(const int ntypes, double **host_cutsq,
                    double **host_rhoinv, double **host_buck1, double **host_buck2, 
                    double **host_a, double **host_c, 
                    double **host_offset, double *host_special_lj,
                    const int nlocal, const int nall, const int max_nbors, 
                    const int maxspecial, const double cell_size, 
                    const double gpu_split, FILE *_screen, double **host_cut_ljsq,
                    double **host_cut_coulsq, double *host_special_coul,
                    const double qqrd2e) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                             _screen,buck_coul);
   if (success!=0)
     return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
 
   coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
                          host_buck1,host_buck2);
 
   coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
 		         host_offset);
   
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,
             host_cut_ljsq, host_cut_coulsq);
   
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
     host_write[i]=host_special_lj[i];
     host_write[i+4]=host_special_coul[i];
   }
   ucl_copy(sp_lj,host_write,8,false);
   
   _qqrd2e = qqrd2e;
   
   _allocated=true;
   this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes();
   return 0;
 }
 
 template <class numtyp, class acctyp>
 void BuckCoulT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
 
   coeff1.clear();
   coeff2.clear();
   sp_lj.clear();
   this->clear_atomic();
 }
 
 template <class numtyp, class acctyp>
 double BuckCoulT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(BuckCoul<numtyp,acctyp>);
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
                           &coeff2.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &nbor_pitch,
                           &this->atom->dev_q.begin(), &cutsq.begin(),
                           &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &nbor_pitch, &this->atom->dev_q.begin(),
                      &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
 
 template class BuckCoul<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu
index 39e3a11cc..df4a824b4 100644
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@@ -1,244 +1,244 @@
 // **************************************************************************
-//                             lal_buck_coul.cu
+//                                buck_coul.cu
 //                             -------------------
-//                    Trung Dac Nguyen, W. Michael Brown (ORNL)
+//                           Trung Dac Nguyen (ORNL)
 //
 //  Device code for acceleration of the buck/coul/cut pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
-//    email                : brownw@ornl.gov nguyentd@ornl.gov
+//    email                : nguyentd@ornl.gov
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
 ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
   { return tex1Dfetch(pos_tex, i); }
 ucl_inline float fetch_q(const int& i, const float *q) 
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
                           __global numtyp4* coeff2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
                           const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_ ,
                           __global numtyp4 *cutsq, const numtyp qqrd2e,
                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
   sp_lj[4]=sp_lj_in[4];
   sp_lj[5]=sp_lj_in[5];
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int itype=ix.w;
     
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
       factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       int mtype=itype*lj_types+jtype;
       if (rsq<cutsq[mtype].x) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp forcecoul, forcebuck, force, r6inv;
         numtyp rexp = (numtyp)0.0;
         
         if (rsq < cutsq[mtype].y) { // buckingham
           numtyp r=ucl_sqrt(rsq);
           rexp = ucl_exp(-r*coeff1[mtype].x);
           r6inv = r2inv*r2inv*r2inv;
           forcebuck = (coeff1[mtype].y*r*rexp 
                   - coeff1[mtype].z*r6inv)*factor_lj;
         } else
           forcebuck = (numtyp)0.0;
         
         if (rsq < coeff2[mtype].z)  // coul
           forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
         else
           forcecoul = (numtyp)0.0;
         
         force = (forcebuck + forcecoul) * r2inv;
         
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           e_coul += forcecoul;
           if (rsq < cutsq[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
             energy+=factor_lj*(e-coeff2[mtype].z);
           }
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                     vflag,ans,engv);
   } // if ii
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                                __global numtyp4* coeff2_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
                                const int nbor_pitch, __global numtyp *q_,
                                __global numtyp4 *_cutsq, const numtyp qqrd2e, 
                                const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     coeff1[tid]=coeff1_in[tid];
     cutsq[tid]=_cutsq[tid];
     if (eflag>0)
       coeff2[tid]=coeff2_in[tid];
   }
   
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
   __syncthreads();
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
       factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       if (rsq<cutsq[mtype].x) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp forcecoul, forcebuck, force, r6inv;
         numtyp rexp = (numtyp)0.0;
         
         if (rsq < cutsq[mtype].y) { // buckingham
           numtyp r=ucl_sqrt(rsq);
           rexp = ucl_exp(-r*coeff1[mtype].x);
           r6inv = r2inv*r2inv*r2inv;
           forcebuck = (coeff1[mtype].y*r*rexp 
                   - coeff1[mtype].z*r6inv)*factor_lj;
         } else
           forcebuck = (numtyp)0.0;
         
         if (rsq < cutsq[mtype].z)  // coul
           forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
         else
           forcecoul = (numtyp)0.0;
         
         force = (forcebuck + forcecoul) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           e_coul += forcecoul;
           if (rsq < cutsq[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
             energy+=factor_lj*(e-coeff2[mtype].z);
           } 
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                     vflag,ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h
index 44ec43988..e4bf59107 100644
--- a/lib/gpu/lal_buck_coul.h
+++ b/lib/gpu/lal_buck_coul.h
@@ -1,86 +1,86 @@
 /***************************************************************************
-                               lal_buck_coul.h
+                                 buck_coul.h
                              -------------------
-                        Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the buck/coul/cut pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_BUCK_COUL_H
 #define LAL_BUCK_COUL_H
 
 #include "lal_base_charge.h"
 
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
 class BuckCoul : public BaseCharge<numtyp, acctyp> {
  public:
   BuckCoul();
   ~BuckCoul(); 
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq,
            double **host_rhoinv, double **host_buck1, double **host_buck2, 
            double **host_a, double **host_c, 
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            double **host_cut_coulsq, double *host_special_coul,
            const double qqrd2e);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
 
   // --------------------------- TYPE DATA --------------------------
 
   /// coeff1.x = rhoinv, coeff1.y = buck1, coeff1.z = buck2
   UCL_D_Vec<numtyp4> coeff1;
   /// coeff2.x = a, coeff2.y = c, coeff2.z = offset
   UCL_D_Vec<numtyp4> coeff2;
   /// cutsq.x = cutsq, cutsq.y = cutsq_lj, cutsq.z = cutsq_coul
   UCL_D_Vec<numtyp4> cutsq;
   /// Special LJ values
   UCL_D_Vec<numtyp> sp_lj;
 
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
   /// Number of atom types 
   int _lj_types;
   
   numtyp _qqrd2e;
   
  private:
   bool _allocated;
   void loop(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif
diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp
index 9755f852f..ac3e6b891 100644
--- a/lib/gpu/lal_buck_coul_ext.cpp
+++ b/lib/gpu/lal_buck_coul_ext.cpp
@@ -1,131 +1,131 @@
 /***************************************************************************
-                            lal_buck_coul_ext.cpp
+                              buck_coul_ext.cpp
                              -------------------
-                      Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Functions for LAMMPS access to buck/coul/cut acceleration routines.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "lal_buck_coul.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
 
 static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                  double **host_buck1, double **host_buck2, 
                  double **host_a, double **host_c,       
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors,  const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, double **host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e) {
   BUCKCMF.clear();
   gpu_mode=BUCKCMF.device->gpu_mode();
   double gpu_split=BUCKCMF.device->particle_split();
   int first_gpu=BUCKCMF.device->first_device();
   int last_gpu=BUCKCMF.device->last_device();
   int world_me=BUCKCMF.device->world_me();
   int gpu_rank=BUCKCMF.device->gpu_rank();
   int procs_per_gpu=BUCKCMF.device->procs_per_gpu();
 
   BUCKCMF.device->init_message(screen,"buck",first_gpu,last_gpu);
 
   bool message=false;
   if (BUCKCMF.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
     init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, 
                        host_a, host_c, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen, 
                        host_cut_ljsq, host_cut_coulsq,
                        host_special_coul, qqrd2e);
 
   BUCKCMF.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, 
                        host_a, host_c, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen, 
                        host_cut_ljsq, host_cut_coulsq,
                        host_special_coul, qqrd2e);
 
     BUCKCMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
 
   if (init_ok==0)
     BUCKCMF.estimate_gpu_overhead();
   return init_ok;
 }
 
 void buckc_gpu_clear() {
   BUCKCMF.clear(); 
 }
 
 int ** buckc_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *sublo, double *subhi, int *tag, int **nspecial, 
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success, double *host_q, double *boxlo,
                         double *prd) {
   return BUCKCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
 }  
 			
 void buckc_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success, double *host_q,
                      const int nlocal, double *boxlo, double *prd) {
   BUCKCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
                 vflag,eatom,vatom,host_start,cpu_time,success,host_q,
                 nlocal,boxlo,prd);
 }
 
 double buckc_gpu_bytes() {
   return BUCKCMF.host_memory_usage();
 }
 
 
diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp
index 681983a81..c47f46d1b 100644
--- a/lib/gpu/lal_buck_coul_long.cpp
+++ b/lib/gpu/lal_buck_coul_long.cpp
@@ -1,171 +1,171 @@
 /***************************************************************************
-                            lal_buck_coul_long.cpp
+                             buck_coul_long.cpp
                              -------------------
-                      Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the buck/coul/long pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifdef USE_OPENCL
 #include "buck_coul_long_cl.h"
 #else
 #include "buck_coul_long_ptx.h"
 #endif
 
 #include "lal_buck_coul_long.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define BuckCoulLongT BuckCoulLong<numtyp, acctyp>
 
 extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
 BuckCoulLongT::BuckCoulLong() : BaseCharge<numtyp,acctyp>(),
                                     _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
 BuckCoulLongT::~BuckCoulLongT() {
   clear();
 }
  
 template <class numtyp, class acctyp>
 int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
 int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
                        double **host_rhoinv, double **host_buck1, double **host_buck2, 
                        double **host_a, double **host_c, double **host_offset, 
                        double *host_special_lj, const int nlocal,
                        const int nall, const int max_nbors,
                        const int maxspecial, const double cell_size,
                        const double gpu_split, FILE *_screen,
                        double **host_cut_ljsq, const double host_cut_coulsq,
                        double *host_special_coul, const double qqrd2e,
                        const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                             _screen,buck_coul_long);
   if (success!=0)
     return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
 
   coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
                          host_buck1,host_buck2,host_cut_ljsq);
 
   coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
 		         host_offset);
   
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
   
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
     host_write[i]=host_special_lj[i];
     host_write[i+4]=host_special_coul[i];
   }
   ucl_copy(sp_lj,host_write,8,false);
 
   _cut_coulsq=host_cut_coulsq;
   _qqrd2e=qqrd2e;
   _g_ewald=g_ewald;
 
   _allocated=true;
   this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes();
   return 0;
 }
 
 template <class numtyp, class acctyp>
 void BuckCoulLongT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
 
   coeff1.clear();
   coeff2.clear();
   sp_lj.clear();
   this->clear_atomic();
 }
 
 template <class numtyp, class acctyp>
 double BuckCoulLongT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(BuckCoulLong<numtyp,acctyp>);
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
                           &coeff2.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
                           &cutsq.begin(), &_cut_coulsq, &_qqrd2e, 
                           &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), 
                    &coeff2.begin(), &_lj_types, &sp_lj.begin(), 
                    &this->nbor->dev_nbor.begin(),
                    &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                    &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                    &nbor_pitch, &this->atom->dev_q.begin(), 
                    &cutsq.begin(), &_cut_coulsq,
                    &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
 
 template class BuckCoulLong<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu
index 0884741f3..95c13dc26 100644
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@@ -1,258 +1,258 @@
 // **************************************************************************
-//                             lal_buck_coul_long.cu
+//                              buck_coul_long.cu
 //                             -------------------
-//                        Trung Dac Nguyen, W. Michael Brown (ORNL)
+//                           Trung Dac Nguyen (ORNL)
 //
 //  Device code for acceleration of the buck/coul/long pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
-//    email                : brownw@ornl.gov nguyentd@ornl.gov
+//    email                : nguyentd@ornl.gov
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
 ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
   { return tex1Dfetch(pos_tex, i); }
 ucl_inline float fetch_q(const int& i, const float *q) 
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
                           __global numtyp4* coeff2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
                           const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_,
                           __global numtyp *cutsq, 
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
   sp_lj[4]=sp_lj_in[4];
   sp_lj[5]=sp_lj_in[5];
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       int mtype=itype*lj_types+jtype;
       if (rsq<cutsq[mtype]) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
         numtyp rexp = (numtyp)0.0;
         
         if (rsq < coeff1[mtype].w) { // cut_ljsq
           numtyp r=ucl_sqrt(rsq);
           rexp = ucl_exp(-r*coeff1[mtype].x);
           r6inv = r2inv*r2inv*r2inv;
           force_lj = (coeff1[mtype].y*r*rexp 
                   - coeff1[mtype].z*r6inv)*factor_lj;
         } else
           force_lj = (numtyp)0.0;
 
         if (rsq < cut_coulsq) {
           numtyp r = ucl_rsqrt(r2inv);
           numtyp grij = g_ewald * r;
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
 
         force = (force_lj + forcecoul) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           if (rsq < cut_coulsq)
             e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < coeff1[mtype].w) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
             energy+=factor_lj*(e-coeff2[mtype].z);
           } 
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                     vflag,ans,engv);
   } // if ii
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                                __global numtyp4* coeff2_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
                                const int nbor_pitch, __global numtyp *q_,
                                __global numtyp *cutsq,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     coeff1[tid]=coeff1_in[tid];
     if (eflag>0)
       coeff2[tid]=coeff2_in[tid];
   }
   
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   __syncthreads();
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq<cutsq[mtype]) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
         numtyp rexp = (numtyp)0.0;
         
         if (rsq < coeff1[mtype].w) {
           numtyp r=ucl_sqrt(rsq);
           rexp = ucl_exp(-r*coeff1[mtype].x);
           r6inv = r2inv*r2inv*r2inv;
           force_lj = (coeff1[mtype].y*r*rexp 
                   - coeff1[mtype].z*r6inv)*factor_lj;
         } else
           force_lj = (numtyp)0.0;
 
         if (rsq < cut_coulsq) {
           numtyp r = ucl_rsqrt(r2inv);
           numtyp grij = g_ewald * r;
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
 
         force = (force_lj + forcecoul) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           if (rsq < cut_coulsq)
             e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < coeff1[mtype].w) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
             energy+=factor_lj*(e-coeff2[mtype].z);
           }
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                     vflag,ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_buck_coul_long.h b/lib/gpu/lal_buck_coul_long.h
index 098dece75..dc59d7ad4 100644
--- a/lib/gpu/lal_buck_coul_long.h
+++ b/lib/gpu/lal_buck_coul_long.h
@@ -1,86 +1,86 @@
 /***************************************************************************
-                            lal_buck_coul_long.h
+                              buck_coul_long.h
                              -------------------
-                    Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the buck/coul/long pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_BUCK_COUL_LONG_H
 #define LAL_BUCK_COUL_LONG_H
 
 #include "lal_base_charge.h"
 
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
 class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
  public:
   BuckCoulLong();
   ~BuckCoulLong();
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq,
            double **host_rhoinv, double **host_buck1, double **host_buck2, 
            double **host_a, double **host_c, 
            double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
 
   // --------------------------- TYPE DATA --------------------------
 
   /// coeff1.x = rhoinv, coeff1.y = buck1, coeff1.z = buck2, coeff1.w = cutsq_lj
   UCL_D_Vec<numtyp4> coeff1;
   /// coeff2.x = a, coeff2.y = c, coeff2.z = offset
   UCL_D_Vec<numtyp4> coeff2;
   /// cutsq
   UCL_D_Vec<numtyp> cutsq;
   /// Special LJ values [0-3] and Special Coul values [4-7]
   UCL_D_Vec<numtyp> sp_lj;
 
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
   /// Number of atom types 
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e, _g_ewald;
 
  private:
   bool _allocated;
   void loop(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif
diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp
index 2c3d3e5a9..d9328a921 100644
--- a/lib/gpu/lal_buck_coul_long_ext.cpp
+++ b/lib/gpu/lal_buck_coul_long_ext.cpp
@@ -1,130 +1,130 @@
 /***************************************************************************
-                           lal_buck_coul_long_ext.cpp
+                           buck_coul_long_ext.cpp
                              -------------------
-                      Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Functions for LAMMPS access to buck/coul/long acceleration routines.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "lal_buck_coul_long.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
 
 static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                  double **host_buck1, double **host_buck2, 
                  double **host_a, double **host_c,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   double **host_cut_ljsq, double host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e,
                   const double g_ewald) {
   BUCKCLMF.clear();
   gpu_mode=BUCKCLMF.device->gpu_mode();
   double gpu_split=BUCKCLMF.device->particle_split();
   int first_gpu=BUCKCLMF.device->first_device();
   int last_gpu=BUCKCLMF.device->last_device();
   int world_me=BUCKCLMF.device->world_me();
   int gpu_rank=BUCKCLMF.device->gpu_rank();
   int procs_per_gpu=BUCKCLMF.device->procs_per_gpu();
 
   BUCKCLMF.device->init_message(screen,"buck/coul/long",first_gpu,last_gpu);
 
   bool message=false;
   if (BUCKCLMF.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
     init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, 
                         host_a, host_c, offset, special_lj, inum, nall, 300, 
                         maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
   BUCKCLMF.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, 
                         host_a, host_c, offset, special_lj, inum, nall, 300, 
                         maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
     BUCKCLMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
 
   if (init_ok==0)
     BUCKCLMF.estimate_gpu_overhead();
   return init_ok;
 }
 
 void buckcl_gpu_clear() {
   BUCKCLMF.clear();
 }
 
 int** buckcl_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial, 
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
                          bool &success, double *host_q, double *boxlo,
                          double *prd) {
   return BUCKCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
 }  
 			
 void buckcl_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success, double *host_q,
                       const int nlocal, double *boxlo, double *prd) {
   BUCKCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 host_q,nlocal,boxlo,prd);
 }
 
 double buckcl_gpu_bytes() {
   return BUCKCLMF.host_memory_usage();
 }
 
 
diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp
index 1c4aaa222..9f7f725ae 100644
--- a/lib/gpu/lal_buck_ext.cpp
+++ b/lib/gpu/lal_buck_ext.cpp
@@ -1,121 +1,121 @@
 /***************************************************************************
-                                 lal_buck_ext.cpp
+                                 buck_ext.cpp
                              -------------------
-                            W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Functions for LAMMPS access to buck acceleration routines.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "lal_buck.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
 
 static Buck<PRECISION,ACC_PRECISION> BUCKMF;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                  double **host_buck1, double **host_buck2, 
                  double **host_a, double **host_c,       
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors,  const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen) {
   BUCKMF.clear();
   gpu_mode=BUCKMF.device->gpu_mode();
   double gpu_split=BUCKMF.device->particle_split();
   int first_gpu=BUCKMF.device->first_device();
   int last_gpu=BUCKMF.device->last_device();
   int world_me=BUCKMF.device->world_me();
   int gpu_rank=BUCKMF.device->gpu_rank();
   int procs_per_gpu=BUCKMF.device->procs_per_gpu();
 
   BUCKMF.device->init_message(screen,"buck",first_gpu,last_gpu);
 
   bool message=false;
   if (BUCKMF.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
     init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, 
                        host_a, host_c, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);
 
   BUCKMF.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, 
                        host_a, host_c, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);
 
     BUCKMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
 
   if (init_ok==0)
     BUCKMF.estimate_gpu_overhead();
   return init_ok;
 }
 
 void buck_gpu_clear() {
   BUCKMF.clear(); 
 }
 
 int ** buck_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *sublo, double *subhi, int *tag, int **nspecial,
                         int **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
                         bool &success) {
   return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
 }  
 			
 void buck_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success) {
   BUCKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
 double buck_gpu_bytes() {
   return BUCKMF.host_memory_usage();
 }
 
 
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index d9d027935..971725764 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -1,582 +1,578 @@
 /***************************************************************************
-                                lal_eam.cpp
+                                   eam.cpp
                              -------------------
-                      W. Michael Brown, Trung Dac Nguyen (ORNL)
+                   Trung Dac Nguyen, W. Michael Brown (ORNL)
 
   Class for acceleration of the eam pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
  
 #ifdef USE_OPENCL
 #include "eam_cl.h"
 #else
 #include "eam_ptx.h"
 #endif
 
 #include "lal_eam.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define EAMT EAM<numtyp, acctyp>
 
 
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MAX(A,B) ((A) > (B) ? (A) : (B))
 
 extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
 EAMT::EAM() : BaseAtomic<numtyp,acctyp>(), 
   _compiled_energy(false), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
 EAMT::~EAM() {
   clear();
 }
  
 template <class numtyp, class acctyp>
-int EAMT::init(const int ntypes, double host_cutforcesq,
-              int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
-              double ***host_rhor_spline, double ***host_z2r_spline,
-              double ***host_frho_spline,
-              double rdr, double rdrho, int nrhor, int nrho, 
-              int nz2r, int nfrho, int nr,
-              const int nlocal, const int nall, const int max_nbors,
-              const int maxspecial, const double cell_size,
-              const double gpu_split, FILE *_screen) 
+int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
+               int **host_type2z2r, int *host_type2frho,
+               double ***host_rhor_spline, double ***host_z2r_spline,
+               double ***host_frho_spline, double rdr, double rdrho, int nrhor,
+               int nrho, int nz2r, int nfrho, int nr, const int nlocal,
+               const int nall, const int max_nbors, const int maxspecial,
+               const double cell_size, const double gpu_split, FILE *_screen) 
 {
   int success;
-  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,eam);
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,eam);
   
   if (success!=0)
     return success;
   
   // allocate fp
   
   bool cpuview=false;
   if (this->ucl_device->device_type()==UCL_CPU)
     cpuview=true;
   
   int ef_nall=nall;
   if (ef_nall==0)
     ef_nall=2000;
 
   _max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
   host_fp.alloc(_max_fp_size,*(this->ucl_device));
   if (cpuview)
     dev_fp.view(host_fp);
   else 
     dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
                                      
   k_energy.set_function(*(this->pair_program),"kernel_energy");
   k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
   fp_tex.get_texture(*(this->pair_program),"fp_tex");
   fp_tex.bind_float(dev_fp,1);
   _compiled_energy = true;
   
   // Initialize timers for selected GPU
   time_pair2.init(*(this->ucl_device));
   time_pair2.zero();
   
   time_fp1.init(*(this->ucl_device));
   time_fp1.zero();
   
   time_fp2.init(*(this->ucl_device));
   time_fp2.zero();
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
 
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   
   _ntypes=lj_types;
   _cutforcesq=host_cutforcesq;
   _rdr=rdr;
   _rdrho = rdrho;
   _nrhor=nrhor;
   _nrho=nrho;
   _nz2r=nz2r;
   _nfrho=nfrho;
   _nr=nr;
   
   UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
   
   for (int i=0; i<lj_types*lj_types; i++) {
     dview_type[i].x=0; dview_type[i].y=0;
   }
                                 
   // pack type2rhor and type2z2r
   type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   
   for (int i=0; i<ntypes; i++) {
     for (int j=0; j<ntypes; j++) {
       dview_type[i*lj_types+j].x=host_type2rhor[i][j];
       dview_type[i*lj_types+j].y=host_type2z2r[i][j];
     }
   }
   
   ucl_copy(type2rhor_z2r,dview_type,false);
   
   // pack type2frho
   UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                                 UCL_WRITE_OPTIMIZED);
 
   type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<ntypes; i++)
     dview_type2frho[i]=host_type2frho[i];
   ucl_copy(type2frho,dview_type2frho,false);
 
-  
-
   // pack frho_spline
   UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nfrho; ix++)
     for (int iy=0; iy<nrho+1; iy++) {
     dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][0];
     dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][1];
     dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][2];
     dview_frho_spline[ix*(nrho+1)+iy].w=0;
   }
 
   frho_spline1.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(frho_spline1,dview_frho_spline,false);
   frho_spline1_tex.get_texture(*(this->pair_program),"frho_sp1_tex");
   frho_spline1_tex.bind_float(frho_spline1,4);
  
 
   for (int ix=0; ix<nfrho; ix++)
     for (int iy=0; iy<nrho+1; iy++) {
     dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][3];
     dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][4];
     dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][5];
     dview_frho_spline[ix*(nrho+1)+iy].w=host_frho_spline[ix][iy][6];
   }
 
   frho_spline2.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(frho_spline2,dview_frho_spline,false);
   frho_spline2_tex.get_texture(*(this->pair_program),"frho_sp2_tex");
   frho_spline2_tex.bind_float(frho_spline2,4);
 
   // pack rhor_spline
   UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][0];
     dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][1];
     dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][2];
     dview_rhor_spline[ix*(nr+1)+iy].w=(numtyp)0;
   }
 
   rhor_spline1.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(rhor_spline1,dview_rhor_spline,false);
   rhor_spline1_tex.get_texture(*(this->pair_program),"rhor_sp1_tex");
   rhor_spline1_tex.bind_float(rhor_spline1,4);
 
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][3];
     dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][4];
     dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][5];
     dview_rhor_spline[ix*(nr+1)+iy].w=host_rhor_spline[ix][iy][6];
   }
 
   rhor_spline2.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(rhor_spline2,dview_rhor_spline,false);
   rhor_spline2_tex.get_texture(*(this->pair_program),"rhor_sp2_tex");
   rhor_spline2_tex.bind_float(rhor_spline2,4);
 
   // pack z2r_spline
   UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][0];
     dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][1];
     dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][2];
     dview_z2r_spline[ix*(nr+1)+iy].w=(numtyp)0;
   }
   
   z2r_spline1.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(z2r_spline1,dview_z2r_spline,false);
   z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex");
   z2r_spline1_tex.bind_float(z2r_spline1,4);
   
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][3];
     dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][4];
     dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][5];
     dview_z2r_spline[ix*(nr+1)+iy].w=host_z2r_spline[ix][iy][6];
   }
   
   z2r_spline2.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(z2r_spline2,dview_z2r_spline,false);
   z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex");
   z2r_spline2_tex.bind_float(z2r_spline2,4);
 
   _allocated=true;
   this->_max_bytes=type2rhor_z2r.row_bytes()
         + type2frho.row_bytes()
         + rhor_spline1.row_bytes()
         + rhor_spline2.row_bytes()
         + frho_spline1.row_bytes()
         + frho_spline2.row_bytes()
         + z2r_spline1.row_bytes()
         + z2r_spline2.row_bytes()
         + dev_fp.row_bytes();
   return 0;
 }
 
 template <class numtyp, class acctyp>
 void EAMT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
   
   type2rhor_z2r.clear();
   type2frho.clear();
   rhor_spline1.clear();
   rhor_spline2.clear();
   frho_spline1.clear();
   frho_spline2.clear();
   z2r_spline1.clear();
   z2r_spline2.clear();
   
   host_fp.clear();
   dev_fp.clear();
   
   time_pair2.clear();
   time_fp1.clear();
   time_fp2.clear();
   
   if (_compiled_energy) {
     k_energy_fast.clear();
     k_energy.clear();
     _compiled_energy=false;
   }
 
   this->clear_atomic();
 }
 
 template <class numtyp, class acctyp>
 double EAMT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(EAM<numtyp,acctyp>);
 }
 
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then compute atom energies/forces
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
                    const int nall, double **host_x, int *host_type,
                    int *ilist, int *numj, int **firstneigh,
                    const bool eflag, const bool vflag,
                    const bool eatom, const bool vatom,
                    int &host_start, const double cpu_time,
                    bool &success, void **fp_ptr) {
   this->acc_timers();
   
   if (this->device->time_device()) {
     // Put time from the second part to the total time_pair
     this->time_pair.add_time_to_total(time_pair2.time());
     
     // Add transfer time from device -> host after part 1
     this->atom->add_transfer_time(time_fp1.time());
     
     // Add transfer time from host -> device before part 2
     this->atom->add_transfer_time(time_fp2.time());
   }
   
   // ------------------- Resize FP Array for EAM --------------------
   
   if (nall>_max_fp_size) {
     dev_fp.clear();
     host_fp.clear();
     
     _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
     host_fp.alloc(_max_fp_size,*(this->ucl_device));
     if (this->ucl_device->device_type()==UCL_CPU)
       dev_fp.view(host_fp);
     else 
       dev_fp.alloc(_max_fp_size,*(this->ucl_device));
     
     fp_tex.bind_float(dev_fp,1);
   }
   *fp_ptr=host_fp.begin();
 
   // ----------------------------------------------------------------
 
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     this->resize_atom(0,nall,success);
     this->zero_timers();
     return;
   }
   
   int ago=this->hd_balancer.ago_first(f_ago);
   int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
   this->ans->inum(inum);
   host_start=inum;
 
   // -----------------------------------------------------------------
 
   if (ago==0) {
     this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
     if (!success)
       return;
   }
   
   this->atom->cast_x_data(host_x,host_type);
   this->atom->add_x_data(host_x,host_type);
 
   loop(eflag,vflag);
 
   // copy fp from device to host for comm
   _nlocal=nlocal;
   time_fp1.start();
   ucl_copy(host_fp,dev_fp,nlocal,false);
   time_fp1.stop();
 
 }
 
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU and then compute per-atom densities
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int** EAMT::compute(const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     double *sublo, double *subhi, int *tag,
                     int **nspecial, int **special, const bool eflag, 
                     const bool vflag, const bool eatom,
                     const bool vatom, int &host_start,
                     int **ilist, int **jnum,
                     const double cpu_time, bool &success,
                     int &inum, void **fp_ptr) {
   this->acc_timers();
   
   if (this->device->time_device()) {
     // Put time from the second part to the total time_pair
     this->time_pair.add_time_to_total(time_pair2.time());
     
     // Add transfer time from device -> host after part 1
     this->atom->add_transfer_time(time_fp1.time());
     
     // Add transfer time from host -> device before part 2
     this->atom->add_transfer_time(time_fp2.time());
   }
 
   // ------------------- Resize FP Array for EAM --------------------
   
   if (nall>_max_fp_size) {
     dev_fp.clear();
     host_fp.clear();
     
     _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
     host_fp.alloc(_max_fp_size,*(this->ucl_device));
     if (this->ucl_device->device_type()==UCL_CPU)
       dev_fp.view(host_fp);
     else 
       dev_fp.alloc(_max_fp_size,*(this->ucl_device));
     
     fp_tex.bind_float(dev_fp,1);
   }      
   *fp_ptr=host_fp.begin();  
 
   // -----------------------------------------------------------------
   
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     this->resize_atom(0,nall,success);
     this->zero_timers();
     return NULL;
   }
   
   // load balance, returning the atom count on the device (inum)
   this->hd_balancer.balance(cpu_time);
   inum=this->hd_balancer.get_gpu_count(ago,inum_full);
   this->ans->inum(inum);
   host_start=inum;
  
   // Build neighbor list on GPU if necessary 
   if (ago==0) {
     this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
   } else {
     this->atom->cast_x_data(host_x,host_type);
     this->atom->add_x_data(host_x,host_type);
   }
   *ilist=this->nbor->host_ilist.begin();
   *jnum=this->nbor->host_acc.begin();
 
   loop(eflag,vflag);
   
   // copy fp from device to host for comm
   _nlocal=inum_full;
   time_fp1.start();
   ucl_copy(host_fp,dev_fp,inum_full,false);
   time_fp1.stop();
   
   return this->nbor->host_jlist.begin()-host_start;
 }
 
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute2(int *ilist, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom) {
   if (this->ans->inum()==0) 
     return;
   
   this->hd_balancer.start_timer();
   time_fp2.start();
   this->add_fp_data();
   time_fp2.stop();
   
   loop2(eflag,vflag);
   if (ilist == NULL)
     this->ans->copy_answers(eflag,vflag,eatom,vatom);
   else
     this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist);
   
   this->device->add_ans_object(this->ans);
   this->hd_balancer.stop_timer();
 }
 
 // ---------------------------------------------------------------------------
 // Calculate per-atom energies and forces
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   
   if (shared_types) {
     this->k_energy_fast.set_size(GX,BX);
     this->k_energy_fast.run(&this->atom->dev_x.begin(), 
                  &type2rhor_z2r.begin(), &type2frho.begin(),
                  &rhor_spline2.begin(),
                  &frho_spline1.begin(),&frho_spline2.begin(), 
                  &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(),
                  &dev_fp.begin(), 
                  &this->ans->dev_engv.begin(),
                  &eflag, &ainum,
                  &nbor_pitch, 
                  &_ntypes, &_cutforcesq, 
                  &_rdr, &_rdrho,
                  &_nrho, &_nr,
                  &this->_threads_per_atom);
   } else {
     this->k_energy.set_size(GX,BX);
     this->k_energy.run(&this->atom->dev_x.begin(), 
                  &type2rhor_z2r.begin(), &type2frho.begin(),
                  &rhor_spline2.begin(),
                  &frho_spline1.begin(),&frho_spline2.begin(), 
                  &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(),
                  &dev_fp.begin(), 
                  &this->ans->dev_engv.begin(),
                  &eflag, &ainum,
                  &nbor_pitch, 
                  &_ntypes, &_cutforcesq, 
                  &_rdr, &_rdrho,
                  &_nrho, &_nr,
                  &this->_threads_per_atom);
   }
 
   this->time_pair.stop();
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::loop2(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair2.start();
   
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(), 
                    &type2rhor_z2r.begin(),
                    &rhor_spline1.begin(), 
                    &z2r_spline1.begin(),
                    &z2r_spline2.begin(), 
                    &this->nbor->dev_nbor.begin(),
                    &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                    &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                    &nbor_pitch, &_cutforcesq, &_rdr, &_nr,
                    &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(), 
                    &type2rhor_z2r.begin(),
                    &rhor_spline1.begin(), 
                    &z2r_spline1.begin(),
                    &z2r_spline2.begin(),
                    &this->nbor->dev_nbor.begin(),
                    &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                    &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                    &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
                    &this->_threads_per_atom);
   }
 
   this->time_pair2.stop();
 }
 
 template class EAM<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu
index e705695bb..84417c251 100644
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@@ -1,493 +1,493 @@
 // **************************************************************************
-//                             lal_eam.cu
+//                                   eam.cu
 //                             -------------------
-//                     Trung Dac Nguyen, W. Michael Brown (ORNL)
+//                   Trung Dac Nguyen, W. Michael Brown (ORNL)
 //
 //  Device code for acceleration of the eam pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
 //    email                : brownw@ornl.gov nguyentd@ornl.gov
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 texture<float4> pos_tex;
 texture<float> fp_tex;
 
 texture<float4> rhor_sp1_tex;
 texture<float4> rhor_sp2_tex;
 texture<float4> frho_sp1_tex;
 texture<float4> frho_sp2_tex;
 texture<float4> z2r_sp1_tex;
 texture<float4> z2r_sp2_tex;
 
 #ifdef _DOUBLE_DOUBLE
 ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) { 
   return rhor_spline1[i]; 
 }
 ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) { 
   return rhor_spline2[i]; 
 }
 ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) { 
   return frho_spline1[i]; 
 }
 ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) { 
   return frho_spline2[i]; 
 }
 ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) { 
   return z2r_spline1[i]; 
 }
 ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) { 
   return z2r_spline2[i]; 
 }
 #endif
 
 #ifndef _DOUBLE_DOUBLE
 ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
   { return tex1Dfetch(pos_tex, i); }
 ucl_inline float fetch_q(const int& i, const float *fp) 
   { return tex1Dfetch(fp_tex, i); }
 
 ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1) 
   { return tex1Dfetch(rhor_sp1_tex, i); }
 ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2) 
   { return tex1Dfetch(rhor_sp2_tex, i); }
 ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1) 
   { return tex1Dfetch(frho_sp1_tex, i); }
 ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2) 
   { return tex1Dfetch(frho_sp2_tex, i); }
 ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1) 
   { return tex1Dfetch(z2r_sp1_tex, i); }
 ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2) 
   { return tex1Dfetch(z2r_sp2_tex, i); }
 #endif
 
 
 #endif
 
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MAX(A,B) ((A) > (B) ? (A) : (B))
 
 #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,           \
                      eflag,vflag,engv,rdrho,nrho,i)                           \
   if (t_per_atom>1) {                                                       \
     __local acctyp red_acc[BLOCK_PAIR];                                     \
     red_acc[tid]=rho;                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       if (offset < s)                                                       \
          red_acc[tid] += red_acc[tid+s];                                    \
       }                                                                     \
       rho=red_acc[tid];                                                     \
   }                                                                         \
   if (offset==0) {                                                          \
     numtyp p = rho*rdrho + (numtyp)1.0;                                     \
     int m=p;                                                                \
     m = MAX(1,MIN(m,nrho-1));                                               \
     p -= m;                                                                 \
     p = MIN(p,(numtyp)1.0);                                                 \
     int index = type2frho[itype]*(nrho+1)+m;                                \
     numtyp4 coeff = fetch_frho_sp1(index, frho_spline1);                    \
     numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z;                          \
     fp_[i]=fp;                                                             \
     if (eflag>0) {                                                          \
       coeff = fetch_frho_sp2(index, frho_spline2);                          \
       energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;             \
       engv[ii]=(acctyp)2.0*energy;                                          \
     }                                                                       \
   }
 
 #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom,     \
                       offset, elag, vflag, ans, engv)                       \
   if (t_per_atom>1) {                                                       \
     __local acctyp red_acc[6][BLOCK_PAIR];                                  \
     red_acc[0][tid]=f.x;                                                    \
     red_acc[1][tid]=f.y;                                                    \
     red_acc[2][tid]=f.z;                                                    \
     red_acc[3][tid]=energy;                                                 \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       if (offset < s) {                                                     \
         for (int r=0; r<4; r++)                                             \
           red_acc[r][tid] += red_acc[r][tid+s];                             \
       }                                                                     \
     }                                                                       \
     f.x=red_acc[0][tid];                                                    \
     f.y=red_acc[1][tid];                                                    \
     f.z=red_acc[2][tid];                                                    \
     energy=red_acc[3][tid];                                                 \
     if (vflag>0) {                                                          \
       for (int r=0; r<6; r++)                                               \
         red_acc[r][tid]=virial[r];                                          \
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
         if (offset < s) {                                                   \
           for (int r=0; r<6; r++)                                           \
             red_acc[r][tid] += red_acc[r][tid+s];                           \
         }                                                                   \
       }                                                                     \
       for (int r=0; r<6; r++)                                               \
         virial[r]=red_acc[r][tid];                                          \
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
     if (eflag>0) {                                                          \
       engv[ii]+=energy;                                                     \
       engv+=inum;                                                           \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
         engv[ii]=virial[i];                                                 \
         engv+=inum;                                                         \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
   }
 
 __kernel void kernel_energy(__global numtyp4 *x_, 
            __global int2 *type2rhor_z2r, __global int *type2frho,
            __global numtyp4 *rhor_spline2, __global numtyp4 *frho_spline1,
            __global numtyp4 *frho_spline2,
            __global int *dev_nbor, __global int *dev_packed,
            __global numtyp *fp_, 
            __global acctyp *engv, const int eflag, 
            const int inum, const int nbor_pitch,
            const int ntypes, const numtyp cutforcesq, 
            const numtyp rdr, const numtyp rdrho,
            const int nrho, const int nr,
            const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   acctyp rho = (acctyp)0;
   acctyp energy = (acctyp)0;
    
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
     
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
       
       if (rsq<cutforcesq) {
         numtyp p = ucl_sqrt(rsq)*rdr + (numtyp)1.0;
         int m=p;
         m = MIN(m,nr-1);
         p -= m;
         p = MIN(p,(numtyp)1.0);
         
         int mtype = jtype*ntypes+itype;
         int index = type2rhor_z2r[mtype].x*(nr+1)+m;
         numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
         rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
       }
     } // for nbor
     
     store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
         eflag,vflag,engv,rdrho,nrho,i);
   } // if ii
 }
 
 __kernel void kernel_energy_fast(__global numtyp4 *x_, 
            __global int2 *type2rhor_z2r_in, __global int *type2frho_in,
            __global numtyp4 *rhor_spline2, __global numtyp4 *frho_spline1,
            __global numtyp4 *frho_spline2,
            __global int *dev_nbor, __global int *dev_packed,
            __global numtyp *fp_, 
            __global acctyp *engv, const int eflag, 
            const int inum, const int nbor_pitch,
            const int ntypes, const numtyp cutforcesq, 
            const numtyp rdr, const numtyp rdrho,
            const int nrho, const int nr,
            const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local int type2frho[MAX_SHARED_TYPES];
 
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     type2rhor_z2r[tid]=type2rhor_z2r_in[tid];
   }
   
   if (tid<MAX_SHARED_TYPES) {
     type2frho[tid]=type2frho_in[tid];
   }
 
   acctyp rho = (acctyp)0;
   acctyp energy = (acctyp)0;
   
   __syncthreads(); 
 
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
     
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
       
       if (rsq<cutforcesq) {
         numtyp p = ucl_sqrt(rsq)*rdr + (numtyp)1.0;
         int m=p;
         m = MIN(m,nr-1);
         p -= m;
         p = MIN(p,(numtyp)1.0);
         
         int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w);
         int mtype = jtype+itype;
         int index = type2rhor_z2r[mtype].x*(nr+1)+m;
         numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
         rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
       }
     } // for nbor
     
     store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
         eflag,vflag,engv,rdrho,nrho,i);
   } // if ii
 }
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
                           __global int2 *type2rhor_z2r,
                           __global numtyp4 *rhor_spline1, 
                           __global numtyp4 *z2r_spline1,
                           __global numtyp4 *z2r_spline2,
                           __global int *dev_nbor, __global int *dev_packed, 
                           __global acctyp4 *ans, __global acctyp *engv, 
                           const int eflag, const int vflag, 
                           const int inum, const int nbor_pitch,
                           const int ntypes, const numtyp cutforcesq, 
                           const numtyp rdr, const int nr,
                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0;
   f.y=(acctyp)0;
   f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp ifp=fetch_q(i,fp_);  //fp_[i];
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
       
       if (rsq<cutforcesq) {
         numtyp r = ucl_sqrt(rsq);
         numtyp p = r*rdr + (numtyp)1.0;
         int m=p;
         m = MIN(m,nr-1);
         p -= m;
         p = MIN(p,(numtyp)1.0);
         
         int mtype,index;
         numtyp4 coeff;
 
         mtype = itype*ntypes+jtype;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
         coeff = fetch_rhor_sp1(index, rhor_spline1); 
         numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
 
         mtype = jtype*ntypes+itype;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
         coeff = fetch_rhor_sp1(index, rhor_spline1); 
         numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
               
         mtype = itype*ntypes+jtype;
         index = type2rhor_z2r[mtype].y*(nr+1)+m;
         coeff = fetch_z2r_sp1(index, z2r_spline1);
         numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
         coeff = fetch_z2r_sp2(index, z2r_spline2);
         numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
         
         numtyp recip = ucl_recip(r);
         numtyp phi = z2*recip;
         numtyp phip = z2p*recip - phi*recip;
         numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip; 
         numtyp force = -psip*recip;
         
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           energy += phi;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
     } // for nbor
     store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
                           __global int2 *type2rhor_z2r_in,
                           __global numtyp4 *rhor_spline1, 
                           __global numtyp4 *z2r_spline1,
                           __global numtyp4 *z2r_spline2,
                           __global int *dev_nbor, __global int *dev_packed, 
                           __global acctyp4 *ans, __global acctyp *engv, 
                           const int eflag, const int vflag, const int inum, 
                           const int nbor_pitch,
                           const numtyp cutforcesq, 
                           const numtyp rdr, const int nr,
                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
 
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     type2rhor_z2r[tid]=type2rhor_z2r_in[tid];
   }
 
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
   __syncthreads();
 
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp ifp=fetch_q(i,fp_); //fp_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jw=jx.w;
       int jtype=fast_mul((int)MAX_SHARED_TYPES,jw);
       
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       if (rsq<cutforcesq) {
         numtyp r = ucl_sqrt(rsq);
         numtyp p = r*rdr + (numtyp)1.0;
         int m=p;
         m = MIN(m,nr-1);
         p -= m;
         p = MIN(p,(numtyp)1.0);
         
         numtyp4 coeff;
         int mtype,index;
         
         mtype = itype+jw;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
         coeff = fetch_rhor_sp1(index, rhor_spline1); 
         numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
         
         mtype = jtype+iw;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
         coeff = fetch_rhor_sp1(index, rhor_spline1); 
         numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
         
         mtype = itype+jw;
         index = type2rhor_z2r[mtype].y*(nr+1)+m;
         coeff = fetch_z2r_sp1(index, z2r_spline1);
         numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
         coeff = fetch_z2r_sp2(index, z2r_spline2);
         numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
       
         numtyp recip = ucl_recip(r);
         numtyp phi = z2*recip;
         numtyp phip = z2p*recip - phi*recip;
         numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
         numtyp force = -psip*recip;
         
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           energy += phi;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
     } // for nbor
     store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h
index 26d9c6fa3..9a7baf6f1 100644
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@@ -1,146 +1,146 @@
 /***************************************************************************
-                              lal_eam.h
+                                    eam.h
                              -------------------
-                      W. Michael Brown, Trung Dac Nguyen (ORNL)
+                   Trung Dac Nguyen, W. Michael Brown (ORNL)
 
   Class for acceleration of the eam pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_EAM_H
 #define LAL_EAM_H
 
 #include "lal_precision.h"
 #include "lal_base_atomic.h"
 
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
 class EAM : public BaseAtomic<numtyp, acctyp> {
  public:
   EAM();
   ~EAM();
                   
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double host_cutforcesq,
           int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
           double ***host_rhor_spline, double ***host_z2r_spline,
           double ***host_frho_spline,
           double rdr, double rdrho, int nrhor, int nrho, 
           int nz2r, int nfrho, int nr,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *_screen);
   
   // Copy charges to device asynchronously
   inline void add_fp_data() {
     int nghost=this->atom->nall()-_nlocal;
     if (nghost>0) {
       UCL_H_Vec<numtyp> host_view;
       UCL_D_Vec<numtyp> dev_view;
       host_view.view_offset(_nlocal,host_fp);
       dev_view.view_offset(_nlocal,dev_fp);
       ucl_copy(dev_view,host_view,nghost,true);
     }
   }
   
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
   
   /// Pair loop with host neighboring
   void compute(const int f_ago, const int inum_full, const int, const int nall,
                double **host_x, int *host_type, int *ilist, int *numj,
                int **firstneigh, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success,
                void **fp_ptr);
                
   /// Pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag, 
                 const bool eatom, const bool vatom, int &host_start, 
                 int **ilist, int **numj, const double cpu_time, bool &success,
                 int &inum, void **fp_ptr);
 
   /// Pair loop with host neighboring
   void compute2(int *ilist, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom);
   
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Kernel k_energy, k_energy_fast;
   
   // --------------------------- TEXTURES -----------------------------
   UCL_Texture fp_tex;
   UCL_Texture rhor_spline1_tex, rhor_spline2_tex;
   UCL_Texture frho_spline1_tex, frho_spline2_tex;
   UCL_Texture z2r_spline1_tex, z2r_spline2_tex;
 
   // --------------------------- DEVICE DATA --------------------------
   
   /// Device Timers
   UCL_Timer time_pair2, time_fp1, time_fp2;
   
   // --------------------------- TYPE DATA --------------------------
     
   UCL_D_Vec<int2> type2rhor_z2r;
   UCL_D_Vec<int> type2frho;
   
   UCL_D_Vec<numtyp4> z2r_spline1, z2r_spline2;
   UCL_D_Vec<numtyp4> frho_spline1, frho_spline2;
   UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2;
     
   numtyp _cutforcesq,_rdr,_rdrho;
   
   int _nfrho,_nrhor,_nrho,_nz2r,_nr;
   
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
   
   /// Number of atom types 
   int _ntypes;
   
   int _max_fp_size;
   
   /// True of energy kernels are compiled
   bool _compiled_energy;
   
   /// Per-atom arrays
   UCL_H_Vec<numtyp> host_fp;
   UCL_D_Vec<numtyp> dev_fp;
   
 protected:
   bool _allocated;
   int _nlocal;
   void loop(const bool _eflag, const bool _vflag);
   void loop2(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif
 
diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp
index 9fab47451..d8f6b3cee 100644
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@@ -1,147 +1,147 @@
-// **************************************************************************
-//                               lal_eam_ext.cpp
-//                             -------------------
-//                     W. Michael Brown, Trung Dac Nguyen (ORNL)
-//
-//  Class for acceleration of the eam pair style
-//
-// __________________________________________________________________________
-//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
-// __________________________________________________________________________
-//
-//    begin                : 
-//    email                : brownw@ornl.gov nguyentd@ornl.gov
-// ***************************************************************************/
+/***************************************************************************
+                                 eam_ext.cpp
+                             -------------------
+                   Trung Dac Nguyen, W. Michael Brown (ORNL)
+
+  Functions for LAMMPS access to buck acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : 
+    email                : brownw@ornl.gov nguyentd@ornl.gov
+ ***************************************************************************/
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "lal_eam.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
 
 static EAM<PRECISION,ACC_PRECISION> EAMMF;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int eam_gpu_init(const int ntypes, double host_cutforcesq, 
                  int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
                  double ***host_rhor_spline, double ***host_z2r_spline,
                  double ***host_frho_spline,
                  double rdr, double rdrho, int nrhor, 
                  int nrho, int nz2r, int nfrho, int nr, 
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
                  int &gpu_mode, FILE *screen, int &fp_size) {
   EAMMF.clear();
   gpu_mode=EAMMF.device->gpu_mode();
   double gpu_split=EAMMF.device->particle_split();
   int first_gpu=EAMMF.device->first_device();
   int last_gpu=EAMMF.device->last_device();
   int world_me=EAMMF.device->world_me();
   int gpu_rank=EAMMF.device->gpu_rank();
   int procs_per_gpu=EAMMF.device->procs_per_gpu();
 
   // disable host/device split for now
   if (gpu_split != 1.0) 
     return -8;
     
   fp_size=sizeof(PRECISION);
     
   EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu);
 
   bool message=false;
   if (EAMMF.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
     init_ok=EAMMF.init(ntypes, host_cutforcesq,
                     host_type2rhor, host_type2z2r, host_type2frho,
                     host_rhor_spline, host_z2r_spline,
                     host_frho_spline,
                     rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, 
                     nlocal, nall, 300, maxspecial,
                     cell_size, gpu_split, screen);
 
   EAMMF.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=EAMMF.init(ntypes, host_cutforcesq,
                     host_type2rhor, host_type2z2r, host_type2frho,
                     host_rhor_spline, host_z2r_spline,
                     host_frho_spline,
                     rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, 
                     nlocal, nall, 300, maxspecial,
                     cell_size, gpu_split, screen);
 
     EAMMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
 
   if (init_ok==0)
     EAMMF.estimate_gpu_overhead();
   return init_ok;
 }
 
 void eam_gpu_clear() {
   EAMMF.clear();
 }
 
 int ** eam_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial, 
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
                          bool &success, int &inum, void **fp_ptr) {
   return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         inum, fp_ptr);
 }  
 
 void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, 
                      const int nall, double **host_x, int *host_type, 
                      int *ilist, int *numj, int **firstneigh, const bool eflag,
                      const bool vflag, const bool eatom, const bool vatom,
                      int &host_start, const double cpu_time, bool &success,
                      void **fp_ptr) {
   EAMMF.compute(ago,inum_full,nlocal,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 fp_ptr);
 }
 
 void eam_gpu_compute_force(int *ilist, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom) {
   EAMMF.compute2(ilist, eflag, vflag, eatom, vatom);
 }
 
 
 double eam_gpu_bytes() {
   return EAMMF.host_memory_usage();
 }
 
 
diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp
index c75bd089b..5ab94ae81 100644
--- a/lib/gpu/lal_yukawa.cpp
+++ b/lib/gpu/lal_yukawa.cpp
@@ -1,150 +1,150 @@
 /***************************************************************************
-                                lal_yukawa.cpp
+                                 yukawa.cpp
                              -------------------
-                      Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the yukawa pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifdef USE_OPENCL
 #include "yukawa_cl.h"
 #else
 #include "yukawa_ptx.h"
 #endif
 
 #include "lal_yukawa.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define YukawaT Yukawa<numtyp, acctyp>
 
 extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
 YukawaT::Yukawa() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
 YukawaT::~Yukawa() { 
   clear();
 }
  
 template <class numtyp, class acctyp>
 int YukawaT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
 int YukawaT::init(const int ntypes, 
                   double **host_cutsq, double kappa,
                   double **host_a, double **host_offset, 
                   double *host_special_lj, const int nlocal,
                   const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                             _screen,yukawa);
   if (success!=0)
     return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
 
   coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset,
 			 host_cutsq);
 
   UCL_H_Vec<double> dview;
   sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
   dview.view(host_special_lj,4,*(this->ucl_device));
   ucl_copy(sp_lj,dview,false);
 
   _kappa = kappa;
   
   _allocated=true;
   this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
   return 0;
 }
 
 template <class numtyp, class acctyp>
 void YukawaT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
 
   coeff.clear();
   sp_lj.clear();
   this->clear_atomic();
 }
 
 template <class numtyp, class acctyp>
 double YukawaT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(Yukawa<numtyp,acctyp>);
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void YukawaT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa, 
                           &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
 
 template class Yukawa<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index 39c571455..e2fa11aa3 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -1,189 +1,189 @@
 // **************************************************************************
-//                                   yukawa.cu
+//                                  yukawa.cu
 //                             -------------------
-//                      Trung Dac Nguyen, W. Michael Brown (ORNL)
+//                           Trung Dac Nguyen (ORNL)
 //
 //  Device code for acceleration of the yukawa pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
-//    email                : brownw@ornl.gov nguyentd@ornl.gov
+//    email                : nguyentd@ornl.gov
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
 ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
   { return tex1Dfetch(pos_tex, i); }
 #endif
 #endif
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
                           const numtyp kappa, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
                           const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
     for ( ; nbor<list_end; nbor+=n_stride) {
   
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       int mtype=itype*lj_types+jtype;
       if (rsq<coeff[mtype].z) {
         numtyp r2inv = (numtyp)1.0/rsq;
         numtyp r = ucl_rsqrt(r2inv);
         numtyp rinv = 1.0/r;
         numtyp screening = exp(-kappa*r);
         numtyp force = coeff[mtype].x*screening*(kappa + rinv)*r2inv;
         force*=factor_lj;
       
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           numtyp e=coeff[mtype].x*screening*rinv;
           energy+=factor_lj*(e-coeff[mtype].y); 
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
                                const numtyp kappa, __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
                                const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     coeff[tid]=coeff_in[tid];
   }
   
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
   __syncthreads();
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
     for ( ; nbor<list_end; nbor+=n_stride) {
   
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       if (rsq<coeff[mtype].z) {
         numtyp r2inv = (numtyp)1.0/rsq;
         numtyp r = ucl_rsqrt(r2inv);
         numtyp rinv = 1.0/r;
         numtyp screening = exp(-kappa*r);
         numtyp force = coeff[mtype].x*screening*(kappa + rinv)*r2inv;
         force*=factor_lj;
 
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           numtyp e=coeff[mtype].x*screening*rinv;
           energy+=factor_lj*(e-coeff[mtype].y); 
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h
index 257bd9da5..720dc903d 100644
--- a/lib/gpu/lal_yukawa.h
+++ b/lib/gpu/lal_yukawa.h
@@ -1,80 +1,80 @@
 /***************************************************************************
-                                lal_yukawa.h
+                                  yukawa.h
                              -------------------
-                    Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the yukawa pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_YUKAWA_H
 #define LAL_YUKAWA_H
 
 #include "lal_base_atomic.h"
 
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
 class Yukawa : public BaseAtomic<numtyp, acctyp> {
  public:
   Yukawa();
   ~Yukawa(); 
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double kappa,
            double **host_a, double **host_offset, double *host_special_lj,
            const int nlocal, const int nall, const int max_nbors, 
            const int maxspecial, const double cell_size, 
            const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
 
   // --------------------------- TYPE DATA --------------------------
 
   /// coeff.x = a, coeff.y = offset, coeff.z = cutsq
   UCL_D_Vec<numtyp4> coeff;
   
   /// Special LJ values
   UCL_D_Vec<numtyp> sp_lj;
 
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
   /// Number of atom types 
   int _lj_types;
   
   /// kappa
   numtyp _kappa;
 
  private:
   bool _allocated;
   void loop(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif
diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp
index 1ba2d0902..36f390ab9 100644
--- a/lib/gpu/lal_yukawa_ext.cpp
+++ b/lib/gpu/lal_yukawa_ext.cpp
@@ -1,120 +1,120 @@
 /***************************************************************************
-                              lal_yukawa_ext.cpp
+                               yukawa_ext.cpp
                              -------------------
-                     Trung Dac Nguyen, W. Michael Brown (ORNL)
+                           Trung Dac Nguyen (ORNL)
 
   Functions for LAMMPS access to yukawa acceleration routines.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
-    email                : brownw@ornl.gov nguyentd@ornl.gov
+    email                : nguyentd@ornl.gov
  ***************************************************************************/
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "lal_yukawa.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
 
 static Yukawa<PRECISION,ACC_PRECISION> YKMF;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
                  double **host_a, double **offset, double *special_lj, 
                  const int inum, const int nall, const int max_nbors,  
                  const int maxspecial, const double cell_size, 
                  int &gpu_mode, FILE *screen) {
   YKMF.clear();
   gpu_mode=YKMF.device->gpu_mode();
   double gpu_split=YKMF.device->particle_split();
   int first_gpu=YKMF.device->first_device();
   int last_gpu=YKMF.device->last_device();
   int world_me=YKMF.device->world_me();
   int gpu_rank=YKMF.device->gpu_rank();
   int procs_per_gpu=YKMF.device->procs_per_gpu();
 
   YKMF.device->init_message(screen,"yukawa",first_gpu,last_gpu);
 
   bool message=false;
   if (YKMF.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
     init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, 
                       inum, nall, 300, maxspecial, cell_size, 
                       gpu_split, screen);
 
   YKMF.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, 
                       inum, nall, 300, maxspecial, cell_size, 
                       gpu_split, screen);
 
     YKMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
 
   if (init_ok==0)
     YKMF.estimate_gpu_overhead();
   return init_ok;
 }
 
 void yukawa_gpu_clear() {
   YKMF.clear();
 }
 
 int ** yukawa_gpu_compute_n(const int ago, const int inum_full,
                             const int nall, double **host_x, int *host_type,
                             double *sublo, double *subhi, int *tag, int **nspecial,
                             int **special, const bool eflag, const bool vflag,
                             const bool eatom, const bool vatom, int &host_start,
                             int **ilist, int **jnum, const double cpu_time,
                             bool &success) {
   return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
 }  
 			
 void yukawa_gpu_compute(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, int *ilist, int *numj,
                        int **firstneigh, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success) {
   YKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
 double yukawa_gpu_bytes() {
   return YKMF.host_memory_usage();
 }
 
 
diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp
index ceaf1090b..d4e00811c 100644
--- a/src/GPU/pair_buck_coul_cut_gpu.cpp
+++ b/src/GPU/pair_buck_coul_cut_gpu.cpp
@@ -1,257 +1,260 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
    
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
+   Contributing authors: Trung Dac Nguyen (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "pair_buck_coul_cut_gpu.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "integrate.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "universe.h"
 #include "update.h"
 #include "domain.h"
 #include "string.h"
 #include "gpu_extra.h"
 
 // External functions from cuda library for atom decomposition
 
 int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
-                 double **host_buck1, double **host_buck2, 
-                 double **host_a, double **host_c,       
-                 double **offset, double *special_lj, const int inum,
-                 const int nall, const int max_nbors,  const int maxspecial,
-                 const double cell_size, int &gpu_mode, FILE *screen,
-                 double **host_cut_ljsq, double **host_cut_coulsq,
-                 double *host_special_coul, const double qqrd2e);
+		   double **host_buck1, double **host_buck2, double **host_a,
+		   double **host_c, double **offset, double *special_lj,
+		   const int inum, const int nall, const int max_nbors,
+		   const int maxspecial, const double cell_size,
+		   int &gpu_mode, FILE *screen, double **host_cut_ljsq,
+		   double **host_cut_coulsq, double *host_special_coul,
+		   const double qqrd2e);
 void buckc_gpu_clear();
-int ** buckc_gpu_compute_n(const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *sublo, double *subhi, int *tag, int **nspecial, 
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        int **ilist, int **jnum, const double cpu_time,
-                        bool &success, double *host_q, double *boxlo,
-                        double *prd);
+int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall,
+			   double **host_x, int *host_type, double *sublo,
+			   double *subhi, int *tag, int **nspecial, 
+			   int **special, const bool eflag, const bool vflag,
+			   const bool eatom, const bool vatom, int &host_start,
+			   int **ilist, int **jnum, const double cpu_time,
+			   bool &success, double *host_q, double *boxlo,
+			   double *prd);
 void buckc_gpu_compute(const int ago, const int inum_full, const int nall,
-                     double **host_x, int *host_type, int *ilist, int *numj,
-                     int **firstneigh, const bool eflag, const bool vflag,
-                     const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double *host_q,
-                     const int nlocal, double *boxlo, double *prd);
+		       double **host_x, int *host_type, int *ilist, int *numj,
+		       int **firstneigh, const bool eflag, const bool vflag,
+		       const bool eatom, const bool vatom, int &host_start,
+		       const double cpu_time, bool &success, double *host_q,
+		       const int nlocal, double *boxlo, double *prd);
 double buckc_gpu_bytes();
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairBuckCoulCutGPU::PairBuckCoulCutGPU(LAMMPS *lmp) : PairBuckCoulCut(lmp), 
-  gpu_mode(GPU_FORCE)
+						      gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    free all arrays
 ------------------------------------------------------------------------- */
 
 PairBuckCoulCutGPU::~PairBuckCoulCutGPU()
 {
   buckc_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulCutGPU::compute(int eflag, int vflag)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
   
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh;
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
     firstneigh = buckc_gpu_compute_n(neighbor->ago, inum, nall,
-				   atom->x, atom->type, domain->sublo,
-				   domain->subhi, atom->tag, atom->nspecial,
-				   atom->special, eflag, vflag, eflag_atom,
-				   vflag_atom, host_start, 
-				   &ilist, &numneigh, cpu_time, success, 
-           atom->q, domain->boxlo, domain->prd);
+				     atom->x, atom->type, domain->sublo,
+				     domain->subhi, atom->tag, atom->nspecial,
+				     atom->special, eflag, vflag, eflag_atom,
+				     vflag_atom, host_start, 
+				     &ilist, &numneigh, cpu_time, success, 
+				     atom->q, domain->boxlo, domain->prd);
   } else {
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     buckc_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		    vflag_atom, host_start, cpu_time, success, atom->q,
-		    atom->nlocal, domain->boxlo, domain->prd);
+		      ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+		      vflag_atom, host_start, cpu_time, success, atom->q,
+		      atom->nlocal, domain->boxlo, domain->prd);
   }
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
   if (host_start<inum) {
     cpu_time = MPI_Wtime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
     cpu_time = MPI_Wtime() - cpu_time;
   }
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairBuckCoulCutGPU::init_style()
 {
   if (force->newton_pair) 
-    error->all(FLERR,"Cannot use newton pair with buck/gpu pair style");
+    error->all(FLERR,
+	       "Cannot use newton pair with buck/coul/cut/gpu pair style");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
 
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int success = buckc_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, 
-                             a, c, offset, force->special_lj, atom->nlocal,
-                             atom->nlocal+atom->nghost, 300, maxspecial,
-                             cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
-                             force->special_coul, force->qqrd2e);
+			       a, c, offset, force->special_lj, atom->nlocal,
+			       atom->nlocal+atom->nghost, 300, maxspecial,
+			       cell_size, gpu_mode, screen, cut_ljsq,
+			       cut_coulsq, force->special_coul, force->qqrd2e);
   GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   } 
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairBuckCoulCutGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + buckc_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
-void PairBuckCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, 
-			       int *ilist, int *numneigh, int **firstneigh) {
+void PairBuckCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag,
+				     int *ilist, int *numneigh,
+				     int **firstneigh) {
   int i,j,ii,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,qtmp,delx,dely,delz,evdwl,ecoul,fpair;
   double rsq,r2inv,r6inv,forcecoul,forcebuck,factor_coul,factor_lj;
   double r,rexp;
   int *jlist;
 
   evdwl = ecoul = 0.0;
   
   double **x = atom->x;
   double **f = atom->f;
   double *q = atom->q;
   int *type = atom->type;
   double *special_coul = force->special_coul;
   double *special_lj = force->special_lj;
   double qqrd2e = force->qqrd2e;
   
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     qtmp = q[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       factor_coul = special_coul[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
 	r2inv = 1.0/rsq;
 	r = sqrt(rsq);
   
-  if (rsq < cut_coulsq[itype][jtype])
-    forcecoul = qqrd2e * qtmp*q[j]/r;
-  else forcecoul = 0.0;
-
-  if (rsq < cut_ljsq[itype][jtype]) {
-    r6inv = r2inv*r2inv*r2inv;
-    rexp = exp(-r*rhoinv[itype][jtype]);
-    forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
-  } else forcebuck = 0.0;
+	if (rsq < cut_coulsq[itype][jtype])
+	  forcecoul = qqrd2e * qtmp*q[j]/r;
+	else forcecoul = 0.0;
 
+	if (rsq < cut_ljsq[itype][jtype]) {
+	  r6inv = r2inv*r2inv*r2inv;
+	  rexp = exp(-r*rhoinv[itype][jtype]);
+	  forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
+	} else forcebuck = 0.0;
+	
 	fpair = (factor_coul*forcecoul + factor_lj*forcebuck) * r2inv;
-  
+	
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) {
 	  if (rsq < cut_coulsq[itype][jtype])
 	    ecoul = factor_coul * qqrd2e * qtmp*q[j]/r;
 	  else ecoul = 0.0;
 	  if (rsq < cut_ljsq[itype][jtype]) {
 	    evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv -
 	      offset[itype][jtype];
 	    evdwl *= factor_lj;
 	  } else evdwl = 0.0;
 	}
 
 	if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
       }
     }
   }
 }
diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp
index 92d54181a..a5f640111 100644
--- a/src/GPU/pair_buck_coul_long_gpu.cpp
+++ b/src/GPU/pair_buck_coul_long_gpu.cpp
@@ -1,288 +1,291 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
    
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Trung Dac Nguyen, Mike Brown (ORNL)
+   Contributing author: Trung Dac Nguyen (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "pair_buck_coul_long_gpu.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "integrate.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "universe.h"
 #include "update.h"
 #include "domain.h"
 #include "string.h"
 #include "kspace.h"
 #include "gpu_extra.h"
 
 #define EWALD_F   1.12837917
 #define EWALD_P   0.3275911
 #define A1        0.254829592
 #define A2       -0.284496736
 #define A3        1.421413741
 #define A4       -1.453152027
 #define A5        1.061405429
 
 // External functions from cuda library for atom decomposition
 
 int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
-       double **host_buck1, double **host_buck2, 
-       double **host_a, double **host_c,
-        double **offset, double *special_lj, const int inum,
-        const int nall, const int max_nbors, const int maxspecial,
-        const double cell_size, int &gpu_mode, FILE *screen,
-        double **host_cut_ljsq, double host_cut_coulsq,
-        double *host_special_coul, const double qqrd2e,
-        const double g_ewald);
+		    double **host_buck1, double **host_buck2, double **host_a,
+		    double **host_c, double **offset, double *special_lj,
+		    const int inum, const int nall, const int max_nbors,
+		    const int maxspecial, const double cell_size,
+		    int &gpu_mode, FILE *screen, double **host_cut_ljsq,
+		    double host_cut_coulsq, double *host_special_coul,
+		    const double qqrd2e, const double g_ewald);
 void buckcl_gpu_clear();
-int** buckcl_gpu_compute_n(const int ago, const int inum_full,
-         const int nall, double **host_x, int *host_type,
-         double *sublo, double *subhi, int *tag, int **nspecial, 
-         int **special, const bool eflag, const bool vflag,
-         const bool eatom, const bool vatom, int &host_start,
-         int **ilist, int **jnum,  const double cpu_time,
-         bool &success, double *host_q, double *boxlo,
-         double *prd);
+int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall,
+			   double **host_x, int *host_type, double *sublo,
+			   double *subhi, int *tag, int **nspecial, 
+			   int **special, const bool eflag, const bool vflag,
+			   const bool eatom, const bool vatom, int &host_start,
+			   int **ilist, int **jnum,  const double cpu_time,
+			   bool &success, double *host_q, double *boxlo,
+			   double *prd);
 void buckcl_gpu_compute(const int ago, const int inum_full, const int nall,
-        double **host_x, int *host_type, int *ilist, int *numj,
-        int **firstneigh, const bool eflag, const bool vflag,
-        const bool eatom, const bool vatom, int &host_start,
-        const double cpu_time, bool &success, double *host_q,
-        const int nlocal, double *boxlo, double *prd);
+			double **host_x, int *host_type, int *ilist, int *numj,
+			int **firstneigh, const bool eflag, const bool vflag,
+			const bool eatom, const bool vatom, int &host_start,
+			const double cpu_time, bool &success, double *host_q,
+			const int nlocal, double *boxlo, double *prd);
 double buckcl_gpu_bytes();
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairBuckCoulLongGPU::PairBuckCoulLongGPU(LAMMPS *lmp) : 
   PairBuckCoulLong(lmp), gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    free all arrays
 ------------------------------------------------------------------------- */
 
 PairBuckCoulLongGPU::~PairBuckCoulLongGPU()
 {
   buckcl_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulLongGPU::compute(int eflag, int vflag)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
   
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh;    
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
     firstneigh = buckcl_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
-				    atom->type, domain->sublo, domain->subhi,
-				    atom->tag, atom->nspecial, atom->special,
-				    eflag, vflag, eflag_atom, vflag_atom,
-				    host_start, &ilist, &numneigh, cpu_time,
-				    success, atom->q, domain->boxlo,
-				    domain->prd);
+				      atom->type, domain->sublo, domain->subhi,
+				      atom->tag, atom->nspecial, atom->special,
+				      eflag, vflag, eflag_atom, vflag_atom,
+				      host_start, &ilist, &numneigh, cpu_time,
+				      success, atom->q, domain->boxlo,
+				      domain->prd);
   } else {
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     buckcl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-		     ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		     vflag_atom, host_start, cpu_time, success, atom->q,
-		     atom->nlocal, domain->boxlo, domain->prd);
+		       ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+		       vflag_atom, host_start, cpu_time, success, atom->q,
+		       atom->nlocal, domain->boxlo, domain->prd);
   }
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
   if (host_start<inum) {
     cpu_time = MPI_Wtime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
     cpu_time = MPI_Wtime() - cpu_time;
   }
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairBuckCoulLongGPU::init_style()
 {
   if (!atom->q_flag)
-    error->all(FLERR,"Pair style buck/coul/long/gpu requires atom attribute q");
+    error->all(FLERR,
+	       "Pair style buck/coul/long/gpu requires atom attribute q");
   if (force->newton_pair) 
-    error->all(FLERR,"Cannot use newton pair with buck/could/cut/gpu pair style");
+    error->all(FLERR,
+	       "Cannot use newton pair with buck/coul/long/gpu pair style");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
 
   cut_coulsq = cut_coul * cut_coul;
 
   // insure use of KSpace long-range solver, set g_ewald
 
   if (force->kspace == NULL)
     error->all(FLERR,"Pair style is incompatible with KSpace style");
   g_ewald = force->kspace->g_ewald;
 
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int success = buckcl_gpu_init(atom->ntypes+1, cutsq,  rhoinv, buck1, buck2, 
-                              a, c, offset, force->special_lj, atom->nlocal,
-                              atom->nlocal+atom->nghost, 300, maxspecial,
-                              cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq,
-                              force->special_coul, force->qqrd2e, g_ewald);
+				a, c, offset, force->special_lj, atom->nlocal,
+				atom->nlocal+atom->nghost, 300, maxspecial,
+				cell_size, gpu_mode, screen, cut_ljsq,
+				cut_coulsq, force->special_coul, force->qqrd2e,
+				g_ewald);
   GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairBuckCoulLongGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + buckcl_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulLongGPU::cpu_compute(int start, int inum, int eflag,
 				       int vflag, int *ilist, int *numneigh,
 				       int **firstneigh)
 {
   int i,j,ii,jj,jnum,itype,jtype,itable;
   double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
   double fraction,table;
   double r,rexp,r2inv,r6inv,forcecoul,forcebuck,factor_coul,factor_lj;
   double grij,expm2,prefactor,t,erfc;
   int *jlist;
   double rsq;
 
   evdwl = ecoul = 0.0;
 
   double **x = atom->x;
   double **f = atom->f;
   double *q = atom->q;
   int *type = atom->type;
   double *special_coul = force->special_coul;
   double *special_lj = force->special_lj;
   double qqrd2e = force->qqrd2e;
 
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     qtmp = q[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       factor_coul = special_coul[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
 	r2inv = 1.0/rsq;
 
 	if (rsq < cut_coulsq) {
 	  grij = g_ewald * r;
 	  expm2 = exp(-grij*grij);
 	  t = 1.0 / (1.0 + EWALD_P*grij);
 	  erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
 	  prefactor = qqrd2e * qtmp*q[j]/r;
 	  forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
 	  if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
 	} else forcecoul = 0.0;
 
 	if (rsq < cut_ljsq[itype][jtype]) {
 	  r6inv = r2inv*r2inv*r2inv;
-    rexp = exp(-r*rhoinv[itype][jtype]);
-    forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
+	  rexp = exp(-r*rhoinv[itype][jtype]);
+	  forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
 	} else forcebuck = 0.0;
 
 	fpair = (forcecoul + factor_lj*forcebuck) * r2inv;
 
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) {
 	  if (rsq < cut_coulsq) {
-      ecoul = prefactor*erfc;
+	    ecoul = prefactor*erfc;
 	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
 	  } else ecoul = 0.0;
 
 	  if (rsq < cut_ljsq[itype][jtype]) {
 	    evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv -
 	      offset[itype][jtype];
 	    evdwl *= factor_lj;
 	  } else evdwl = 0.0;
 	}
 
 	if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz);
       }
     }
   }
 }
diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp
index c277f9b02..67982202c 100644
--- a/src/GPU/pair_buck_gpu.cpp
+++ b/src/GPU/pair_buck_gpu.cpp
@@ -1,229 +1,230 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
    
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Mike Brown (SNL)
+   Contributing author: Trung Dac Nguyen (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "pair_buck_gpu.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "integrate.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "universe.h"
 #include "update.h"
 #include "domain.h"
 #include "string.h"
 #include "gpu_extra.h"
 
 // External functions from cuda library for atom decomposition
 
 int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
-                 double **host_buck1, double **host_buck2, 
-                 double **host_a, double **host_c,       
-                 double **offset, double *special_lj, const int inum,
-                 const int nall, const int max_nbors,  const int maxspecial,
-                 const double cell_size, int &gpu_mode, FILE *screen);
+		  double **host_buck1, double **host_buck2, 
+		  double **host_a, double **host_c,       
+		  double **offset, double *special_lj, const int inum,
+		  const int nall, const int max_nbors,  const int maxspecial,
+		  const double cell_size, int &gpu_mode, FILE *screen);
 void buck_gpu_clear();
-int ** buck_gpu_compute_n(const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *sublo, double *subhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        int **ilist, int **jnum, const double cpu_time,
-                        bool &success);
+int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall,
+			  double **host_x, int *host_type, double *sublo,
+			  double *subhi, int *tag, int **nspecial,
+			  int **special, const bool eflag, const bool vflag,
+			  const bool eatom, const bool vatom, int &host_start,
+			  int **ilist, int **jnum, const double cpu_time,
+			  bool &success);
 void buck_gpu_compute(const int ago, const int inum_full, const int nall,
-                     double **host_x, int *host_type, int *ilist, int *numj,
-                     int **firstneigh, const bool eflag, const bool vflag,
-                     const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success);
+		      double **host_x, int *host_type, int *ilist, int *numj,
+		      int **firstneigh, const bool eflag, const bool vflag,
+		      const bool eatom, const bool vatom, int &host_start,
+		      const double cpu_time, bool &success);
 double buck_gpu_bytes();
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairBuckGPU::PairBuckGPU(LAMMPS *lmp) : PairBuck(lmp), gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    free all arrays
 ------------------------------------------------------------------------- */
 
 PairBuckGPU::~PairBuckGPU()
 {
   buck_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckGPU::compute(int eflag, int vflag)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
   
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh;
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
     firstneigh = buck_gpu_compute_n(neighbor->ago, inum, nall,
-				   atom->x, atom->type, domain->sublo,
-				   domain->subhi, atom->tag, atom->nspecial,
-				   atom->special, eflag, vflag, eflag_atom,
-				   vflag_atom, host_start, 
-				   &ilist, &numneigh, cpu_time, success);
+				    atom->x, atom->type, domain->sublo,
+				    domain->subhi, atom->tag, atom->nspecial,
+				    atom->special, eflag, vflag, eflag_atom,
+				    vflag_atom, host_start, 
+				    &ilist, &numneigh, cpu_time, success);
   } else {
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     buck_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		    vflag_atom, host_start, cpu_time, success);
+		     ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+		     vflag_atom, host_start, cpu_time, success);
   }
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
   if (host_start<inum) {
     cpu_time = MPI_Wtime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
     cpu_time = MPI_Wtime() - cpu_time;
   }
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairBuckGPU::init_style()
 {
   if (force->newton_pair) 
     error->all(FLERR,"Cannot use newton pair with buck/gpu pair style");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
 
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int success = buck_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, 
-                             a, c, offset, force->special_lj, atom->nlocal,
-			     atom->nlocal+atom->nghost, 300, maxspecial,
-			     cell_size, gpu_mode, screen);
+			      a, c, offset, force->special_lj, atom->nlocal,
+			      atom->nlocal+atom->nghost, 300, maxspecial,
+			      cell_size, gpu_mode, screen);
   GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairBuckGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + buck_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckGPU::cpu_compute(int start, int inum, int eflag, int vflag, 
 			       int *ilist, int *numneigh, int **firstneigh) {
   int i,j,ii,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,r2inv,r6inv,forcebuck,factor_lj;
   double r,rexp;
   int *jlist;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   double *special_lj = force->special_lj;
 
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
 	r2inv = 1.0/rsq;
 	r6inv = r2inv*r2inv*r2inv;
 	r = sqrt(rsq);
 	rexp = exp(-r*rhoinv[itype][jtype]);
 	forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
 	fpair = factor_lj*forcebuck*r2inv;
 
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) {
 	  evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv -
 	    offset[itype][jtype];
 	  evdwl *= factor_lj;
 	}
 
 	if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
       }
     }
   }
 }
diff --git a/src/GPU/pair_cg_cmm_coul_msm.cpp b/src/GPU/pair_cg_cmm_coul_msm.cpp
index 5321945eb..791dfc9c4 100644
--- a/src/GPU/pair_cg_cmm_coul_msm.cpp
+++ b/src/GPU/pair_cg_cmm_coul_msm.cpp
@@ -1,290 +1,291 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    CMM coarse grained MD potentials. Coulomb with MSM version.
    Contributing author: Mike Brown <brownw@ornl.gov>
 ------------------------------------------------------------------------- */
 
 #include "string.h"
 #include "pair_cg_cmm_coul_msm.h"
 #include "memory.h"
 #include "atom.h"
 #include "force.h"
 #include "kspace.h"
 
 using namespace LAMMPS_NS;
 
 enum {C3=0,C4=1};
 
 /* ---------------------------------------------------------------------- */
 
 PairCGCMMCoulMSM::PairCGCMMCoulMSM(LAMMPS *lmp) : PairCMMCommon(lmp)
 {
   respa_enable = 0;
   single_enable = 0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairCGCMMCoulMSM::~PairCGCMMCoulMSM()
 {
   if (allocated_coul) {
     memory->destroy(cut_lj);
     memory->destroy(cut_ljsq);
     memory->destroy(cut_coul);
     memory->destroy(cut_coulsq);
     allocated_coul=0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCGCMMCoulMSM::allocate()
 {
   PairCMMCommon::allocate();
   allocated_coul = 1;
 
   int n = atom->ntypes;
 
   memory->create(cut_lj,n+1,n+1,"paircg:cut_lj");
   memory->create(cut_ljsq,n+1,n+1,"paircg:cut_ljsq");
   memory->create(cut_coul,n+1,n+1,"paircg:cut_coul");
   memory->create(cut_coulsq,n+1,n+1,"paircg:cut_coulsq");
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairCGCMMCoulMSM::settings(int narg, char **arg)
 {
   // strip off smoothing type and send args to parent
 
   if (narg < 1) error->all(FLERR,"Illegal pair_style command");
 
   if (strcmp(arg[0],"C3") == 0)
     _smooth = C3;
   else if (strcmp(arg[0],"C4") == 0)
     _smooth = C4;
   else error->all(FLERR,"Illegal pair_style command");
 
   PairCMMCommon::settings(narg-1,&arg[1]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCGCMMCoulMSM::init_style()
 {
   if (!atom->q_flag)
     error->all(FLERR,"Pair style cg/cut/coul/msm requires atom attribute q");
   
   PairCMMCommon::init_style();
   _ia=-1.0/cut_coul_global;
   _ia2=_ia*_ia;
   _ia3=_ia2*_ia;
 
   cut_respa = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairCGCMMCoulMSM::init_one(int i, int j)
 {
   double mycut = PairCMMCommon::init_one(i,j);
 
   return mycut;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCGCMMCoulMSM::compute(int eflag, int vflag)
 {
   int i,j,ii,jj,inum,jnum,itype,jtype,itable;
   double qtmp,xtmp,ytmp,ztmp,delx,dely,delz;
   double fraction,table;
   double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
   double grij,expm2,prefactor,t,erfc;
   int *ilist,*jlist,*numneigh,**firstneigh;
   double rsq;
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   double **x = atom->x;
   double **f = atom->f;
   double *q = atom->q;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int nall = nlocal + atom->nghost;
   double *special_coul = force->special_coul;
   double *special_lj = force->special_lj;
   double qqrd2e = force->qqrd2e;
   int newton_pair = force->newton_pair;
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
   
   // loop over neighbors of my atoms
 
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     qtmp = q[i];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       factor_coul = special_coul[sbmask(j)];
       j &= NEIGHMASK;
 
       const double delx = xtmp - x[j][0];
       const double dely = ytmp - x[j][1];
       const double delz = ztmp - x[j][2];
       const double rsq = delx*delx + dely*dely + delz*delz;
       const int jtype = type[j];
 
       double evdwl = 0.0;
       double ecoul = 0.0;
       double fpair = 0.0;
 
       if (rsq < cutsq[itype][jtype]) {
         const double r2inv = 1.0/rsq;
         const int cgt=cg_type[itype][jtype];
 
         double forcelj  = 0.0;
         double forcecoul = 0.0;
 
         if (rsq < cut_ljsq[itype][jtype]) {
           forcelj=factor_lj;
           if (eflag) evdwl=factor_lj;
 
           if (cgt == CG_LJ12_4) {
             const double r4inv=r2inv*r2inv;
             forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv
                        - lj2[itype][jtype]);
             if (eflag) {
               evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv
                        - lj4[itype][jtype]) - offset[itype][jtype];
             }
           } else if (cgt == CG_LJ9_6) {
             const double r3inv = r2inv*sqrt(r2inv);
             const double r6inv = r3inv*r3inv;
             forcelj *= r6inv*(lj1[itype][jtype]*r3inv
                        - lj2[itype][jtype]);
             if (eflag) {
               evdwl *= r6inv*(lj3[itype][jtype]*r3inv
                         - lj4[itype][jtype]) - offset[itype][jtype];
             }
           } else {
             const double r6inv = r2inv*r2inv*r2inv;
             forcelj *= r6inv*(lj1[itype][jtype]*r6inv
                        - lj2[itype][jtype]);
             if (eflag) {
               evdwl *= r6inv*(lj3[itype][jtype]*r6inv
                        - lj4[itype][jtype]) - offset[itype][jtype];
             }
           }
         }
 
         if (rsq < cut_coulsq_global) {
           const double ir = 1.0/sqrt(rsq);
           const double prefactor = qqrd2e * qtmp*q[j];
           const double r2_ia2 = rsq*_ia2;
           const double r4_ia4 = r2_ia2*r2_ia2;
           if (_smooth==C3) {
             forcecoul = prefactor*(_ia3*(-4.375+5.25*r2_ia2-1.875*r4_ia4)-
                                    ir/rsq);
             if (eflag)
               ecoul = prefactor*(ir+_ia*(2.1875-2.1875*r2_ia2+
                                          1.3125*r4_ia4-0.3125*r2_ia2*r4_ia4));
           } else {
             const double r6_ia6 = r2_ia2*r4_ia4;
             forcecoul = prefactor*(_ia3*(-6.5625+11.8125*r2_ia2-8.4375*r4_ia4+
                                          2.1875*r6_ia6)-ir/rsq);
             if (eflag)
               ecoul = prefactor*(ir+_ia*(2.4609375-3.28125*r2_ia2+
                                          2.953125*r4_ia4-1.40625*r6_ia6+
                                          0.2734375*r4_ia4*r4_ia4));
           }
           if (factor_coul < 1.0) {
             forcecoul -= (1.0-factor_coul)*prefactor*ir;
             if (eflag) ecoul -= (1.0-factor_coul)*prefactor*ir;
           }
         }
         fpair = forcecoul + forcelj * r2inv;
 
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 	if (newton_pair || j < nlocal) {
 	  f[j][0] -= delx*fpair;
 	  f[j][1] -= dely*fpair;
 	  f[j][2] -= delz*fpair;
 	}
 	if (evflag) ev_tally(i,j,nlocal,newton_pair,
 			     evdwl,ecoul,fpair,delx,dely,delz);
       }
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCGCMMCoulMSM::write_restart(FILE *fp) 
 {
   write_restart_settings(fp);
   PairCMMCommon::write_restart(fp);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairCGCMMCoulMSM::read_restart(FILE *fp)
 {
   read_restart_settings(fp);
   allocate();
   PairCMMCommon::read_restart(fp);
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairCGCMMCoulMSM::memory_usage()
 {
   double bytes=PairCMMCommon::memory_usage();
   
   int n = atom->ntypes;
 
   // cut_coul/cut_coulsq/cut_ljsq
   bytes += (n+1)*(n+1)*sizeof(double)*4; 
   
   return bytes;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void *PairCGCMMCoulMSM::extract(char *str)
 {
   if (strcmp(str,"cut_coul") == 0) return (void *) &cut_coul_global;
   return NULL;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp
index ffccd8516..2cb80a34d 100644
--- a/src/GPU/pair_eam_alloy_gpu.cpp
+++ b/src/GPU/pair_eam_alloy_gpu.cpp
@@ -1,323 +1,323 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
+   Contributing authors: Trung Dac Nguyen (ORNL), W. Michael Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_eam_alloy_gpu.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define MAXLINE 1024
 
 /* ---------------------------------------------------------------------- */
 
 PairEAMAlloyGPU::PairEAMAlloyGPU(LAMMPS *lmp) : PairEAMGPU(lmp)
 {
   one_coeff = 1;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
    read DYNAMO setfl file
 ------------------------------------------------------------------------- */
 
 void PairEAMAlloyGPU::coeff(int narg, char **arg)
 {
   int i,j;
 
   if (!allocated) allocate();
 
   if (narg != 3 + atom->ntypes)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // insure I,J args are * *
 
   if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // read EAM setfl file
 
   if (setfl) {
     for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i];
     delete [] setfl->elements;
     delete [] setfl->mass;
     memory->destroy(setfl->frho);
     memory->destroy(setfl->rhor);
     memory->destroy(setfl->z2r);
     delete setfl;
   }
   setfl = new Setfl();
   read_file(arg[2]);
 
   // read args that map atom types to elements in potential file
   // map[i] = which element the Ith atom type is, -1 if NULL
 
   for (i = 3; i < narg; i++) {
     if (strcmp(arg[i],"NULL") == 0) {
       map[i-2] = -1;
       continue;
     }
     for (j = 0; j < setfl->nelements; j++)
       if (strcmp(arg[i],setfl->elements[j]) == 0) break;
     if (j < setfl->nelements) map[i-2] = j;
     else error->all(FLERR,"No matching element in EAM potential file");
   }
 
   // clear setflag since coeff() called once with I,J = * *
 
   int n = atom->ntypes;
   for (i = 1; i <= n; i++)
     for (j = i; j <= n; j++)
       setflag[i][j] = 0;
 
   // set setflag i,j for type pairs where both are mapped to elements
   // set mass of atom type if i = j
 
   int count = 0;
   for (i = 1; i <= n; i++) {
     for (j = i; j <= n; j++) {
       if (map[i] >= 0 && map[j] >= 0) {
 	setflag[i][j] = 1;
 	if (i == j) atom->set_mass(i,setfl->mass[map[i]]);
 	count++;
       }
     }
   }
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    read a multi-element DYNAMO setfl file
 ------------------------------------------------------------------------- */
 
 void PairEAMAlloyGPU::read_file(char *filename)
 {
   Setfl *file = setfl;
 
   // open potential file
 
   int me = comm->me;
   FILE *fptr;
   char line[MAXLINE];
 
   if (me == 0) {
     fptr = fopen(filename,"r");
     if (fptr == NULL) {
       char str[128];
       sprintf(str,"Cannot open EAM potential file %s",filename);
       error->one(FLERR,str);
     }
   }
 
   // read and broadcast header
   // extract element names from nelements line
 
   int n;
   if (me == 0) {
     fgets(line,MAXLINE,fptr);
     fgets(line,MAXLINE,fptr);
     fgets(line,MAXLINE,fptr);
     fgets(line,MAXLINE,fptr);
     n = strlen(line) + 1;
   }
   MPI_Bcast(&n,1,MPI_INT,0,world);
   MPI_Bcast(line,n,MPI_CHAR,0,world);
 
   sscanf(line,"%d",&file->nelements);
   int nwords = atom->count_words(line);
   if (nwords != file->nelements + 1)
     error->all(FLERR,"Incorrect element names in EAM potential file");
   
   char **words = new char*[file->nelements+1];
   nwords = 0;
   strtok(line," \t\n\r\f");
   while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue;
 
   file->elements = new char*[file->nelements];
   for (int i = 0; i < file->nelements; i++) {
     n = strlen(words[i]) + 1;
     file->elements[i] = new char[n];
     strcpy(file->elements[i],words[i]);
   }
   delete [] words;
 
   if (me == 0) {
     fgets(line,MAXLINE,fptr);
     sscanf(line,"%d %lg %d %lg %lg",
 	   &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
   }
 
   MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
   MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&file->nr,1,MPI_INT,0,world);
   MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
 
   file->mass = new double[file->nelements];
   memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
   memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
   memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
 		 "pair:z2r");
 
   int i,j,tmp;
   for (i = 0; i < file->nelements; i++) {
     if (me == 0) {
       fgets(line,MAXLINE,fptr);
       sscanf(line,"%d %lg",&tmp,&file->mass[i]);
     }
     MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
 
     if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
     MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
     if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
     MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
   }
 
   for (i = 0; i < file->nelements; i++)
     for (j = 0; j <= i; j++) {
       if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
       MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
     }
 
   // close the potential file
 
   if (me == 0) fclose(fptr);
 }
 
 /* ----------------------------------------------------------------------
    copy read-in setfl potential to standard array format
 ------------------------------------------------------------------------- */
 
 void PairEAMAlloyGPU::file2array()
 {
   int i,j,m,n;
   int ntypes = atom->ntypes;
 
   // set function params directly from setfl file
 
   nrho = setfl->nrho;
   nr = setfl->nr;
   drho = setfl->drho;
   dr = setfl->dr;
 
   // ------------------------------------------------------------------
   // setup frho arrays
   // ------------------------------------------------------------------
 
   // allocate frho arrays
   // nfrho = # of setfl elements + 1 for zero array
   
   nfrho = setfl->nelements + 1;
   memory->destroy(frho);
   memory->create(frho,nfrho,nrho+1,"pair:frho");
 
   // copy each element's frho to global frho
 
   for (i = 0; i < setfl->nelements; i++)
     for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m];
 
   // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
   // this is necessary b/c fp is still computed for non-EAM atoms
 
   for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
 
   // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
   // if atom type doesn't point to element (non-EAM atom in pair hybrid)
   // then map it to last frho array of zeroes
 
   for (i = 1; i <= ntypes; i++)
     if (map[i] >= 0) type2frho[i] = map[i];
     else type2frho[i] = nfrho-1;
 
   // ------------------------------------------------------------------
   // setup rhor arrays
   // ------------------------------------------------------------------
 
   // allocate rhor arrays
   // nrhor = # of setfl elements
 
   nrhor = setfl->nelements;
   memory->destroy(rhor);
   memory->create(rhor,nrhor,nr+1,"pair:rhor");
 
   // copy each element's rhor to global rhor
 
   for (i = 0; i < setfl->nelements; i++)
     for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m];
 
   // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
   // for setfl files, I,J mapping only depends on I
   // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
 
   for (i = 1; i <= ntypes; i++)
     for (j = 1; j <= ntypes; j++)
       type2rhor[i][j] = map[i];
 
   // ------------------------------------------------------------------
   // setup z2r arrays
   // ------------------------------------------------------------------
 
   // allocate z2r arrays
   // nz2r = N*(N+1)/2 where N = # of setfl elements
 
   nz2r = setfl->nelements * (setfl->nelements+1) / 2;
   memory->destroy(z2r);
   memory->create(z2r,nz2r,nr+1,"pair:z2r");
 
   // copy each element pair z2r to global z2r, only for I >= J
 
   n = 0;
   for (i = 0; i < setfl->nelements; i++)
     for (j = 0; j <= i; j++) {
       for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m];
       n++;
     }
 
   // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
   // set of z2r arrays only fill lower triangular Nelement matrix
   // value = n = sum over rows of lower-triangular matrix until reach irow,icol
   // swap indices when irow < icol to stay lower triangular
   // if map = -1 (non-EAM atom in pair hybrid):
   //   type2z2r is not used by non-opt
   //   but set type2z2r to 0 since accessed by opt
 
   int irow,icol;
   for (i = 1; i <= ntypes; i++) {
     for (j = 1; j <= ntypes; j++) {
       irow = map[i];
       icol = map[j];
       if (irow == -1 || icol == -1) {
 	type2z2r[i][j] = 0;
 	continue;
       }
       if (irow < icol) {
 	irow = map[j];
 	icol = map[i];
       }
       n = 0;
       for (m = 0; m < irow; m++) n += m + 1;
       n += icol;
       type2z2r[i][j] = n;
     }
   }
 }
diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp
index 382302a8a..1b52920e3 100644
--- a/src/GPU/pair_eam_fs_gpu.cpp
+++ b/src/GPU/pair_eam_fs_gpu.cpp
@@ -1,332 +1,332 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
+   Contributing authors: Trung Dac Nguyen (ORNL), W. Michael Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_eam_fs_gpu.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define MAXLINE 1024
 
 /* ---------------------------------------------------------------------- */
 
 PairEAMFSGPU::PairEAMFSGPU(LAMMPS *lmp) : PairEAMGPU(lmp)
 {
   one_coeff = 1;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
    read EAM Finnis-Sinclair file
 ------------------------------------------------------------------------- */
 
 void PairEAMFSGPU::coeff(int narg, char **arg)
 {
   int i,j;
 
   if (!allocated) allocate();
 
   if (narg != 3 + atom->ntypes)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // insure I,J args are * *
 
   if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // read EAM Finnis-Sinclair file
 
   if (fs) {
     for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i];
     delete [] fs->elements;
     delete [] fs->mass;
     memory->destroy(fs->frho);
     memory->destroy(fs->rhor);
     memory->destroy(fs->z2r);
     delete fs;
   }
   fs = new Fs();
   read_file(arg[2]);
 
   // read args that map atom types to elements in potential file
   // map[i] = which element the Ith atom type is, -1 if NULL
 
   for (i = 3; i < narg; i++) {
     if (strcmp(arg[i],"NULL") == 0) {
       map[i-2] = -1;
       continue;
     }
     for (j = 0; j < fs->nelements; j++)
       if (strcmp(arg[i],fs->elements[j]) == 0) break;
     if (j < fs->nelements) map[i-2] = j;
     else error->all(FLERR,"No matching element in EAM potential file");
   }
 
   // clear setflag since coeff() called once with I,J = * *
 
   int n = atom->ntypes;
   for (i = 1; i <= n; i++)
     for (j = i; j <= n; j++)
       setflag[i][j] = 0;
 
   // set setflag i,j for type pairs where both are mapped to elements
   // set mass of atom type if i = j
 
   int count = 0;
   for (i = 1; i <= n; i++) {
     for (j = i; j <= n; j++) {
       if (map[i] >= 0 && map[j] >= 0) {
 	setflag[i][j] = 1;
 	if (i == j) atom->set_mass(i,fs->mass[map[i]]);
 	count++;
       }
     }
   }
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    read a multi-element DYNAMO setfl file
 ------------------------------------------------------------------------- */
 
 void PairEAMFSGPU::read_file(char *filename)
 {
   Fs *file = fs;
 
   // open potential file
 
   int me = comm->me;
   FILE *fptr;
   char line[MAXLINE];
 
   if (me == 0) {
     fptr = fopen(filename,"r");
     if (fptr == NULL) {
       char str[128];
       sprintf(str,"Cannot open EAM potential file %s",filename);
       error->one(FLERR,str);
     }
   }
 
   // read and broadcast header
   // extract element names from nelements line
 
   int n;
   if (me == 0) {
     fgets(line,MAXLINE,fptr);
     fgets(line,MAXLINE,fptr);
     fgets(line,MAXLINE,fptr);
     fgets(line,MAXLINE,fptr);
     n = strlen(line) + 1;
   }
   MPI_Bcast(&n,1,MPI_INT,0,world);
   MPI_Bcast(line,n,MPI_CHAR,0,world);
 
   sscanf(line,"%d",&file->nelements);
   int nwords = atom->count_words(line);
   if (nwords != file->nelements + 1)
     error->all(FLERR,"Incorrect element names in EAM potential file");
   
   char **words = new char*[file->nelements+1];
   nwords = 0;
   strtok(line," \t\n\r\f");
   while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue;
 
   file->elements = new char*[file->nelements];
   for (int i = 0; i < file->nelements; i++) {
     n = strlen(words[i]) + 1;
     file->elements[i] = new char[n];
     strcpy(file->elements[i],words[i]);
   }
   delete [] words;
 
   if (me == 0) {
     fgets(line,MAXLINE,fptr);
     sscanf(line,"%d %lg %d %lg %lg",
 	   &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
   }
 
   MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
   MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&file->nr,1,MPI_INT,0,world);
   MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
 
   file->mass = new double[file->nelements];
   memory->create(file->frho,file->nelements,file->nrho+1,
 					      "pair:frho");
   memory->create(file->rhor,file->nelements,file->nelements,
 		 file->nr+1,"pair:rhor");
   memory->create(file->z2r,file->nelements,file->nelements,
 		 file->nr+1,"pair:z2r");
 
   int i,j,tmp;
   for (i = 0; i < file->nelements; i++) {
     if (me == 0) {
       fgets(line,MAXLINE,fptr);
       sscanf(line,"%d %lg",&tmp,&file->mass[i]);
     }
     MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
 
     if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
     MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
 
     for (j = 0; j < file->nelements; j++) {
       if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
       MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
     }
   }
 
   for (i = 0; i < file->nelements; i++)
     for (j = 0; j <= i; j++) {
       if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
       MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
     }
 
   // close the potential file
 
   if (me == 0) fclose(fptr);
 }
 
 /* ----------------------------------------------------------------------
    copy read-in setfl potential to standard array format
 ------------------------------------------------------------------------- */
 
 void PairEAMFSGPU::file2array()
 {
   int i,j,m,n;
   int ntypes = atom->ntypes;
 
   // set function params directly from fs file
 
   nrho = fs->nrho;
   nr = fs->nr;
   drho = fs->drho;
   dr = fs->dr;
 
   // ------------------------------------------------------------------
   // setup frho arrays
   // ------------------------------------------------------------------
 
   // allocate frho arrays
   // nfrho = # of fs elements + 1 for zero array
   
   nfrho = fs->nelements + 1;
   memory->destroy(frho);
   memory->create(frho,nfrho,nrho+1,"pair:frho");
 
   // copy each element's frho to global frho
 
   for (i = 0; i < fs->nelements; i++)
     for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m];
 
   // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
   // this is necessary b/c fp is still computed for non-EAM atoms
 
   for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
 
   // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
   // if atom type doesn't point to element (non-EAM atom in pair hybrid)
   // then map it to last frho array of zeroes
 
   for (i = 1; i <= ntypes; i++)
     if (map[i] >= 0) type2frho[i] = map[i];
     else type2frho[i] = nfrho-1;
 
   // ------------------------------------------------------------------
   // setup rhor arrays
   // ------------------------------------------------------------------
 
   // allocate rhor arrays
   // nrhor = square of # of fs elements
 
   nrhor = fs->nelements * fs->nelements;
   memory->destroy(rhor);
   memory->create(rhor,nrhor,nr+1,"pair:rhor");
 
   // copy each element pair rhor to global rhor
 
   n = 0;
   for (i = 0; i < fs->nelements; i++)
     for (j = 0; j < fs->nelements; j++) {
       for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m];
       n++;
     }
 
   // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
   // for fs files, there is a full NxN set of rhor arrays
   // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
 
   for (i = 1; i <= ntypes; i++)
     for (j = 1; j <= ntypes; j++)
       type2rhor[i][j] = map[i] * fs->nelements + map[j];
 
   // ------------------------------------------------------------------
   // setup z2r arrays
   // ------------------------------------------------------------------
 
   // allocate z2r arrays
   // nz2r = N*(N+1)/2 where N = # of fs elements
 
   nz2r = fs->nelements * (fs->nelements+1) / 2;
   memory->destroy(z2r);
   memory->create(z2r,nz2r,nr+1,"pair:z2r");
 
   // copy each element pair z2r to global z2r, only for I >= J
 
   n = 0;
   for (i = 0; i < fs->nelements; i++)
     for (j = 0; j <= i; j++) {
       for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m];
       n++;
     }
 
   // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
   // set of z2r arrays only fill lower triangular Nelement matrix
   // value = n = sum over rows of lower-triangular matrix until reach irow,icol
   // swap indices when irow < icol to stay lower triangular
   // if map = -1 (non-EAM atom in pair hybrid):
   //   type2z2r is not used by non-opt
   //   but set type2z2r to 0 since accessed by opt
 
   int irow,icol;
   for (i = 1; i <= ntypes; i++) {
     for (j = 1; j <= ntypes; j++) {
       irow = map[i];
       icol = map[j];
       if (irow == -1 || icol == -1) {
 	type2z2r[i][j] = 0;
 	continue;
       }
       if (irow < icol) {
 	irow = map[j];
 	icol = map[i];
       }
       n = 0;
       for (m = 0; m < irow; m++) n += m + 1;
       n += icol;
       type2z2r[i][j] = n;
     }
   }
 }
diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp
index ff03a0290..68a9e9c3c 100644
--- a/src/GPU/pair_eam_gpu.cpp
+++ b/src/GPU/pair_eam_gpu.cpp
@@ -1,421 +1,243 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
+   Contributing authors: Trung Dac Nguyen (ORNL), W. Michael Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_eam_gpu.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "domain.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "gpu_extra.h"
 
 using namespace LAMMPS_NS;
 
 #define MAXLINE 1024
 
 // External functions from cuda library for atom decomposition
 
 int eam_gpu_init(const int ntypes, double host_cutforcesq,
-                 int **host_type2rhor, int **host_type2z2r,
-                 int *host_type2frho, 
-                 double ***host_rhor_spline, double ***host_z2r_spline,
-                 double ***host_frho_spline,
-                 double rdr, double rdrho, int nrhor, int nrho, 
-                 int nz2r, int nfrho, int nr,
-                 const int nlocal, const int nall, const int max_nbors, 
-                 const int maxspecial, const double cell_size, 
-                 int &gpu_mode, FILE *screen, int &fp_size);
+		 int **host_type2rhor, int **host_type2z2r,
+                 int *host_type2frho, double ***host_rhor_spline,
+		 double ***host_z2r_spline, double ***host_frho_spline,
+                 double rdr, double rdrho, int nrhor, int nrho, int nz2r,
+		 int nfrho, int nr, const int nlocal, const int nall,
+		 const int max_nbors, const int maxspecial, 
+		 const double cell_size, int &gpu_mode, FILE *screen, 
+		 int &fp_size);
 void eam_gpu_clear();
-int** eam_gpu_compute_n(const int ago, const int inum_full,
-                 const int nall, double **host_x, int *host_type,
-                 double *sublo, double *subhi, int *tag, int **nspecial, 
-                 int **special, const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 int **ilist, int **jnum,  const double cpu_time,
-                 bool &success, int &inum, void **fp_ptr);
+int** eam_gpu_compute_n(const int ago, const int inum_full, const int nall,
+			double **host_x, int *host_type, double *sublo,
+			double *subhi, int *tag, int **nspecial, int **special,
+			const bool eflag, const bool vflag, const bool eatom,
+			const bool vatom, int &host_start, int **ilist,
+			int **jnum,  const double cpu_time, bool &success,
+			int &inum, void **fp_ptr);
 void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, 
-                 const int nall,double **host_x, int *host_type, 
-                 int *ilist, int *numj, int **firstneigh, 
-                 const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 const double cpu_time, bool &success, void **fp_ptr);
+		     const int nall,double **host_x, int *host_type, 
+		     int *ilist, int *numj, int **firstneigh, 
+		     const bool eflag, const bool vflag,
+		     const bool eatom, const bool vatom, int &host_start,
+		     const double cpu_time, bool &success, void **fp_ptr);
 void eam_gpu_compute_force(int *ilist, const bool eflag, const bool vflag,
-                      const bool eatom, const bool vatom);
-
+			   const bool eatom, const bool vatom);
 double eam_gpu_bytes();
 
 /* ---------------------------------------------------------------------- */
 
 PairEAMGPU::PairEAMGPU(LAMMPS *lmp) : PairEAM(lmp), gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    check if allocated, since class can be destructed when incomplete
 ------------------------------------------------------------------------- */
 
 PairEAMGPU::~PairEAMGPU()
 {
   eam_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairEAMGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + eam_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMGPU::compute(int eflag, int vflag)
 {
   int i,j,ii,jj,m,jnum,itype,jtype;
   double evdwl,*coeff;
 
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
  
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
 
   // compute density on each atom on GPU
 
   int nall = atom->nlocal + atom->nghost;  
   int inum, host_start, inum_dev;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh; 
   if (gpu_mode != GPU_FORCE) { 
     inum = atom->nlocal;
-    firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, 
-             atom->x,atom->type, domain->sublo, domain->subhi,
-             atom->tag, atom->nspecial, atom->special,
-             eflag, vflag, eflag_atom, vflag_atom,
-             host_start, &ilist, &numneigh, cpu_time,
-             success, inum_dev, &fp_pinned);
+    firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+				   atom->type, domain->sublo, domain->subhi,
+				   atom->tag, atom->nspecial, atom->special,
+				   eflag, vflag, eflag_atom, vflag_atom,
+				   host_start, &ilist, &numneigh, cpu_time,
+				   success, inum_dev, &fp_pinned);
   } else { // gpu_mode == GPU_FORCE
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     eam_gpu_compute(neighbor->ago, inum, nlocal, nall, atom->x, atom->type,
 		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
 		    vflag_atom, host_start, cpu_time, success, &fp_pinned);
   }
     
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
-  if (host_start<inum) {
-    cpu_time = MPI_Wtime();
-    cpu_compute_energy(host_start, inum, eflag, vflag, 
-            ilist, numneigh, firstneigh);
-    cpu_time = MPI_Wtime() - cpu_time;
-  }
-  
   // communicate derivative of embedding function
 
   comm->forward_comm_pair(this);
     
   // compute forces on each atom on GPU
   if (gpu_mode != GPU_FORCE) 
     eam_gpu_compute_force(NULL, eflag, vflag, eflag_atom, vflag_atom);
   else
     eam_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom);
-
-  if (host_start<inum) {
-    double cpu_time2 = MPI_Wtime();
-    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
-    cpu_time += MPI_Wtime() - cpu_time2;
-  }
-  
-}
-
-void PairEAMGPU::cpu_compute_energy(int start, int inum, int eflag, int vflag,
-				      int *ilist, int *numneigh,
-				      int **firstneigh) 
-{
-  int i,j,ii,jj,m,jnum,itype,jtype;
-  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double rsq,r,p,rhoip,rhojp,phi;
-  double *coeff;
-  int *jlist;
-  
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  int newton_pair = force->newton_pair;
-  
-  // rho = density at each atom
-  // loop over neighbors of my atoms
-
-  for (ii = start; ii < inum; ii++) {
-    i = ilist[ii];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-    jlist = firstneigh[i];
-    jnum = numneigh[i];
-
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      j &= NEIGHMASK;
-
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-
-      if (rsq < cutforcesq) {
-	jtype = type[j];
-	p = sqrt(rsq)*rdr + 1.0;
-	m = static_cast<int> (p);
-	m = MIN(m,nr-1);
-	p -= m;
-	p = MIN(p,1.0);
-	coeff = rhor_spline[type2rhor[jtype][itype]][m];
-	rho[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
-	    }
-    }
-  }
-
-  // communicate and sum densities
-
-  if (newton_pair) comm->reverse_comm_pair(this);
-  
-  
-  // fp = derivative of embedding energy at each atom
-  // phi = embedding energy at each atom
-  
-  for (ii = start; ii < inum; ii++) {
-    i = ilist[ii];
-    p = rho[i]*rdrho + 1.0;    
-    m = static_cast<int> (p);
-    m = MAX(1,MIN(m,nrho-1));
-    p -= m;
-    p = MIN(p,1.0);
-    
-    coeff = frho_spline[type2frho[type[i]]][m];
-    fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
-    if (eflag) {
-      phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
-      if (eflag_global) eng_vdwl += phi;
-      if (eflag_atom) eatom[i] += phi;
-    
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairEAMGPU::cpu_compute(int start, int inum, int eflag, int vflag,
-				      int *ilist, int *numneigh,
-				      int **firstneigh)
-{
-  int i,j,ii,jj,m,jnum,itype,jtype;
-  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi;
-  double *coeff;
-  int *jlist;
-
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  int newton_pair = force->newton_pair;
-  
-  // compute forces on each atom
-  // loop over neighbors of my atoms
-
-  for (ii = start; ii < inum; ii++) {
-    i = ilist[ii];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    itype = type[i];
-
-    jlist = firstneigh[i];
-    jnum = numneigh[i];
-
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      j &= NEIGHMASK;
-
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-
-      if (rsq < cutforcesq) {
-	jtype = type[j];
-	r = sqrt(rsq);
-	p = r*rdr + 1.0;
-	m = static_cast<int> (p);
-	m = MIN(m,nr-1);
-	p -= m;
-	p = MIN(p,1.0);
-
-	// rhoip = derivative of (density at atom j due to atom i)
-	// rhojp = derivative of (density at atom i due to atom j)
-	// phi = pair potential energy
-	// phip = phi'
-	// z2 = phi * r
-	// z2p = (phi * r)' = (phi' r) + phi
-	// psip needs both fp[i] and fp[j] terms since r_ij appears in two
-	//   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
-	//   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
-
-	coeff = rhor_spline[type2rhor[itype][jtype]][m];
-	rhoip = (coeff[0]*p + coeff[1])*p + coeff[2];
-	coeff = rhor_spline[type2rhor[jtype][itype]][m];
-	rhojp = (coeff[0]*p + coeff[1])*p + coeff[2];
-	coeff = z2r_spline[type2z2r[itype][jtype]][m];
-	z2p = (coeff[0]*p + coeff[1])*p + coeff[2];
-	z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
-
-	recip = 1.0/r;
-	phi = z2*recip;
-	phip = z2p*recip - phi*recip;
-	psip = fp[i]*rhojp + fp[j]*rhoip + phip;
-	fpair = -psip*recip;
-
-	f[i][0] += delx*fpair;
-	f[i][1] += dely*fpair;
-	f[i][2] += delz*fpair;
-
-	if (eflag) evdwl = phi;
-	if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
-      }
-    }
-  }
-
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairEAMGPU::init_style()
 {
   if (force->newton_pair) 
     error->all(FLERR,"Cannot use newton pair with eam/gpu pair style");
   
   if (!allocated) error->all(FLERR,"Not allocate memory eam/gpu pair style");
   
   // convert read-in file(s) to arrays and spline them
 
   file2array();
   array2spline();
   
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
   
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int fp_size;
-  int success = eam_gpu_init(atom->ntypes+1, cutforcesq,
-          type2rhor, type2z2r, type2frho,
-          rhor_spline, z2r_spline, frho_spline,
-          rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, 
-          atom->nlocal+atom->nghost, 300, maxspecial,
-			     cell_size, gpu_mode, screen, fp_size);
+  int success = eam_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r,
+			     type2frho, rhor_spline, z2r_spline, frho_spline,
+			     rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr,
+			     atom->nlocal, atom->nlocal+atom->nghost, 300,
+			     maxspecial, cell_size, gpu_mode, screen, fp_size);
   GPU_EXTRA::check_flag(success,error,world);
   
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
 
   if (fp_size == sizeof(double))
     fp_single = false;
   else
     fp_single = true;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int PairEAMGPU::pack_comm(int n, int *list, double *buf, int pbc_flag, 
 			  int *pbc)
 {
   int i,j,m;
 
   m = 0;
 
   if (fp_single) {
     float *fp_ptr = (float *)fp_pinned;
     for (i = 0; i < n; i++) {
       j = list[i];
       buf[m++] = static_cast<double>(fp_ptr[j]);
     }
   } else {
     double *fp_ptr = (double *)fp_pinned;
     for (i = 0; i < n; i++) {
       j = list[i];
       buf[m++] = fp_ptr[j];
     }
   }
 
   return 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMGPU::unpack_comm(int n, int first, double *buf)
 {
   int i,m,last;
 
   m = 0;
   last = first + n;
   if (fp_single) {
     float *fp_ptr = (float *)fp_pinned;
     for (i = first; i < last; i++) fp_ptr[i] = buf[m++];
   } else {
     double *fp_ptr = (double *)fp_pinned;
     for (i = first; i < last; i++) fp_ptr[i] = buf[m++];
   }
 }
diff --git a/src/GPU/pair_eam_gpu.h b/src/GPU/pair_eam_gpu.h
index 7cdecb010..f7e41be91 100644
--- a/src/GPU/pair_eam_gpu.h
+++ b/src/GPU/pair_eam_gpu.h
@@ -1,55 +1,53 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef PAIR_CLASS
 
 PairStyle(eam/gpu,PairEAMGPU)
 
 #else
 
 #ifndef LMP_PAIR_EAM_GPU_H
 #define LMP_PAIR_EAM_GPU_H
 
 #include "stdio.h"
 #include "pair_eam.h"
 
 namespace LAMMPS_NS {
 
 class PairEAMGPU : public PairEAM {
  public:
 
   PairEAMGPU(class LAMMPS *);
   virtual ~PairEAMGPU();
-  void cpu_compute(int, int, int, int, int *, int *, int **);
-  void cpu_compute_energy(int, int, int, int, int *, int *, int **);
   void compute(int, int);
   void init_style();
   double memory_usage();
 
   int pack_comm(int, int *, double *, int, int *);
   void unpack_comm(int, int, double *);
 
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
 
- private:
+ protected:
   int gpu_mode;
   double cpu_time;
   int *gpulist;
   void *fp_pinned;
   bool fp_single;  
 };
 
 }
 
 #endif
 #endif
diff --git a/src/GPU/pair_eam_lj_gpu.cpp b/src/GPU/pair_eam_lj_gpu.cpp
index 992a0d494..02bfde93e 100755
--- a/src/GPU/pair_eam_lj_gpu.cpp
+++ b/src/GPU/pair_eam_lj_gpu.cpp
@@ -1,655 +1,656 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_eam_lj_gpu.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "domain.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "math_const.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "gpu_extra.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // External functions from cuda library for atom decomposition
 
 int eam_lj_gpu_init(const int ntypes, double host_cutforcesq, 
                  int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
                  double ***host_rhor_spline, double ***host_z2r_spline,
                  double ***host_frho_spline, double **host_cutljsq, 
                  double **host_lj1, double **host_lj2, double **host_lj3, 
                  double **host_lj4, double **offset, double *special_lj,
                  double rdr, double rdrho, int nrhor, 
                  int nrho, int nz2r, int nfrho, int nr, 
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
                  int &gpu_mode, FILE *screen, int &fp_size);
 void eam_lj_gpu_clear();
 int** eam_lj_gpu_compute_n(const int ago, const int inum_full,
                  const int nall, double **host_x, int *host_type,
                  double *sublo, double *subhi, int *tag, int **nspecial, 
                  int **special, const bool eflag, const bool vflag,
                  const bool eatom, const bool vatom, int &host_start,
                  int **ilist, int **jnum,  const double cpu_time,
                  bool &success, int &inum, void **fp_ptr);
 void eam_lj_gpu_compute(const int ago, const int inum_full, const int nlocal, 
                  const int nall,double **host_x, int *host_type, 
                  int *ilist, int *numj, int **firstneigh, 
                  const bool eflag, const bool vflag,
                  const bool eatom, const bool vatom, int &host_start,
                  const double cpu_time, bool &success, void **fp_ptr);
 void eam_lj_gpu_compute_force(int *ilist, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom);
 
 double eam_lj_gpu_bytes();
 
 /* ---------------------------------------------------------------------- */
 
 PairEAMLJGPU::PairEAMLJGPU(LAMMPS *lmp) : PairEAM(lmp), gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    check if allocated, since class can be destructed when incomplete
 ------------------------------------------------------------------------- */
 
 PairEAMLJGPU::~PairEAMLJGPU()
 {
   eam_lj_gpu_clear();
 
   memory->destroy(cut);
   memory->destroy(epsilon);
   memory->destroy(sigma);
   memory->destroy(lj1);
   memory->destroy(lj2);
   memory->destroy(lj3);
   memory->destroy(lj4);
   memory->destroy(offset);
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairEAMLJGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + eam_lj_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMLJGPU::compute(int eflag, int vflag)
 {
   int i,j,ii,jj,m,jnum,itype,jtype;
   double evdwl,*coeff;
 
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
  
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
 
   // compute density on each atom on GPU
 
   int nall = atom->nlocal + atom->nghost;  
   int inum, host_start, inum_dev;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh; 
   if (gpu_mode != GPU_FORCE) { 
     inum = atom->nlocal;
     firstneigh = eam_lj_gpu_compute_n(neighbor->ago, inum, nall, 
              atom->x,atom->type, domain->sublo, domain->subhi,
              atom->tag, atom->nspecial, atom->special,
              eflag, vflag, eflag_atom, vflag_atom,
              host_start, &ilist, &numneigh, cpu_time,
              success, inum_dev, &fp_pinned);
   } else { // gpu_mode == GPU_FORCE
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     eam_lj_gpu_compute(neighbor->ago, inum, nlocal, nall, atom->x, atom->type,
 		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
 		    vflag_atom, host_start, cpu_time, success, &fp_pinned);
   }
     
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
   if (host_start<inum) {
     cpu_time = MPI_Wtime();
     cpu_compute_energy(host_start, inum, eflag, vflag, 
             ilist, numneigh, firstneigh);
     cpu_time = MPI_Wtime() - cpu_time;
   }
   
   // communicate derivative of embedding function
 
   comm->forward_comm_pair(this);
     
   // compute forces on each atom on GPU
   if (gpu_mode != GPU_FORCE) 
     eam_lj_gpu_compute_force(NULL, eflag, vflag, eflag_atom, vflag_atom);
   else
     eam_lj_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom);
 
   if (host_start<inum) {
     double cpu_time2 = MPI_Wtime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
     cpu_time += MPI_Wtime() - cpu_time2;
   }
   
 }
 
 void PairEAMLJGPU::cpu_compute_energy(int start, int inum, int eflag, int vflag,
 				      int *ilist, int *numneigh,
 				      int **firstneigh) 
 {
   int i,j,ii,jj,m,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,r,p,rhoip,rhojp,phi;
   double *coeff;
   int *jlist;
   
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
   
   // rho = density at each atom
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
 
       if (rsq < cutforcesq) {
 	jtype = type[j];
 	p = sqrt(rsq)*rdr + 1.0;
 	m = static_cast<int> (p);
 	m = MIN(m,nr-1);
 	p -= m;
 	p = MIN(p,1.0);
 	coeff = rhor_spline[type2rhor[jtype][itype]][m];
 	rho[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
 	    }
     }
   }
 
   // communicate and sum densities
 
   if (newton_pair) comm->reverse_comm_pair(this);
   
   
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
   
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     p = rho[i]*rdrho + 1.0;    
     m = static_cast<int> (p);
     m = MAX(1,MIN(m,nrho-1));
     p -= m;
     p = MIN(p,1.0);
     
     coeff = frho_spline[type2frho[type[i]]][m];
     fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
     if (eflag) {
       phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
       if (eflag_global) eng_vdwl += phi;
       if (eflag_atom) eatom[i] += phi;
     
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMLJGPU::cpu_compute(int start, int inum, int eflag, int vflag,
 				      int *ilist, int *numneigh,
 				      int **firstneigh)
 {
   int i,j,ii,jj,m,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi;
   double r2inv,r6inv,factor_lj,force_eam,force_lj;
   double *coeff;
   int *jlist;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   double *special_lj = force->special_lj;
   
   // compute forces on each atom
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
 
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       
       if (rsq < cutsq[itype][jtype]) {
         jtype = type[j];
         if (rsq < cutforcesq  && (itype ==2 && jtype ==2)) {
           r = sqrt(rsq);
           p = r*rdr + 1.0;
           m = static_cast<int> (p);
           m = MIN(m,nr-1);
           p -= m;
           p = MIN(p,1.0);
 
           // rhoip = derivative of (density at atom j due to atom i)
           // rhojp = derivative of (density at atom i due to atom j)
           // phi = pair potential energy
           // phip = phi'
           // z2 = phi * r
           // z2p = (phi * r)' = (phi' r) + phi
           // psip needs both fp[i] and fp[j] terms since r_ij appears in two
           //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
           //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
 
           coeff = rhor_spline[type2rhor[itype][jtype]][m];
           rhoip = (coeff[0]*p + coeff[1])*p + coeff[2];
           coeff = rhor_spline[type2rhor[jtype][itype]][m];
           rhojp = (coeff[0]*p + coeff[1])*p + coeff[2];
           coeff = z2r_spline[type2z2r[itype][jtype]][m];
           z2p = (coeff[0]*p + coeff[1])*p + coeff[2];
           z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
 
           recip = 1.0/r;
           phi = z2*recip;
           phip = z2p*recip - phi*recip;
           psip = fp[i]*rhojp + fp[j]*rhoip + phip;
           force_eam = -psip*recip;
         } else {
           force_eam=0.0;
           phi = 0.0;
         }
         
       if (rsq < cutsq[itype][jtype] && (itype !=2 || jtype !=2)) {
         r2inv = 1.0/rsq;
         r6inv = r2inv*r2inv*r2inv;
         force_lj = factor_lj*r2inv*r6inv * (lj1[itype][jtype]*r6inv 
           - lj2[itype][jtype]);
       } else force_lj=0.0;
   
   fpair = force_eam + force_lj;
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) {
     evdwl = phi;
     if (rsq < cutsq[itype][jtype] && (itype !=2 || jtype !=2)) {
       double e = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
         offset[itype][jtype];
       evdwl += factor_lj*e;  
     }
     }
 	if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
       }
     }
     
   }
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairEAMLJGPU::allocate()
 {
   allocated = 1;
   int n = atom->ntypes;
 
   memory->create(setflag,n+1,n+1,"pair:setflag");
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       setflag[i][j] = 0;
 
   memory->create(cutsq,n+1,n+1,"pair:cutsq");
 
   map = new int[n+1];
   for (int i = 1; i <= n; i++) map[i] = -1;
 
   type2frho = new int[n+1];
   memory->create(type2rhor,n+1,n+1,"pair:type2rhor");
   memory->create(type2z2r,n+1,n+1,"pair:type2z2r");
   
   // LJ
   memory->create(cut,n+1,n+1,"pair:cut");
   memory->create(epsilon,n+1,n+1,"pair:epsilon");
   memory->create(sigma,n+1,n+1,"pair:sigma");
   memory->create(lj1,n+1,n+1,"pair:lj1");
   memory->create(lj2,n+1,n+1,"pair:lj2");
   memory->create(lj3,n+1,n+1,"pair:lj3");
   memory->create(lj4,n+1,n+1,"pair:lj4");
   memory->create(offset,n+1,n+1,"pair:offset");
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairEAMLJGPU::settings(int narg, char **arg)
 {
   if (narg != 1) error->all(FLERR,"Illegal pair_style eam/lj/gpu command");
 
   cut_global = force->numeric(arg[0]);
 
   // reset cutoffs that have been explicitly set
   
   if (allocated) {
     int i,j;
     for (i = 1; i <= atom->ntypes; i++)
       for (j = i+1; j <= atom->ntypes; j++)
 	if (setflag[i][j]) cut[i][j] = cut_global;
   }
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
    read DYNAMO funcfl file
 ------------------------------------------------------------------------- */
 
 void PairEAMLJGPU::coeff(int narg, char **arg)
 {
   if (!allocated) allocate();
   
   if (narg > 5) 
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   // parse pair of atom types
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
   
   int count = 0;
   if (narg == 3) { // eam
  
     // read funcfl file if hasn't already been read
     // store filename in Funcfl data struct
 
     int ifuncfl;
     for (ifuncfl = 0; ifuncfl < nfuncfl; ifuncfl++)
       if (strcmp(arg[2],funcfl[ifuncfl].file) == 0) break;
 
     if (ifuncfl == nfuncfl) {
       nfuncfl++;
       funcfl = (Funcfl *) 
         memory->srealloc(funcfl,nfuncfl*sizeof(Funcfl),"pair:funcfl");
       read_file(arg[2]);
       int n = strlen(arg[2]) + 1;
       funcfl[ifuncfl].file = new char[n];
       strcpy(funcfl[ifuncfl].file,arg[2]);
     }
 
     // set setflag and map only for i,i type pairs
     // set mass of atom type if i = j
 
     for (int i = ilo; i <= ihi; i++) {
       for (int j = MAX(jlo,i); j <= jhi; j++) {
         if (i == j) {
 	  setflag[i][i] = 1;
 	  map[i] = ifuncfl;
 	  atom->set_mass(i,funcfl[ifuncfl].mass);
 	  count++;
          }
       }
     }
   } else if (narg >= 4 || narg <= 5) { // LJ
     double epsilon_one = force->numeric(arg[2]);
     double sigma_one = force->numeric(arg[3]);
 
     double cut_one = cut_global;
     if (narg == 5) cut_one = force->numeric(arg[4]);
   
     for (int i = ilo; i <= ihi; i++) {
       for (int j = MAX(jlo,i); j <= jhi; j++) {
         epsilon[i][j] = epsilon_one;
         sigma[i][j] = sigma_one;
         cut[i][j] = cut_one;
         setflag[i][j] = 1;
         count++;
       }
     }
   }
   
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairEAMLJGPU::init_style()
 {
   if (force->newton_pair) 
     error->all(FLERR,"Cannot use newton pair with eam/lj/gpu pair style");
   
   if (!allocated) error->all(FLERR,"Not allocate memory eam/lj/gpu pair style");
   
   // convert read-in file(s) to arrays and spline them
 
   file2array();
   array2spline();
   
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
   
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int fp_size;
   int success = eam_lj_gpu_init(atom->ntypes+1, cutforcesq,
           type2rhor, type2z2r, type2frho,
           rhor_spline, z2r_spline, frho_spline, cutsq,
           lj1, lj2, lj3, lj4, offset, force->special_lj,
           rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, 
           atom->nlocal+atom->nghost, 300, maxspecial,
 			     cell_size, gpu_mode, screen, fp_size);
   GPU_EXTRA::check_flag(success,error,world);
   
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
 
   if (fp_size == sizeof(double))
     fp_single = false;
   else
     fp_single = true;
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairEAMLJGPU::init_one(int i, int j)
 {
   if (setflag[i][j] == 0) {
     epsilon[i][j] = mix_energy(epsilon[i][i],epsilon[j][j],
 			       sigma[i][i],sigma[j][j]);
     sigma[i][j] = mix_distance(sigma[i][i],sigma[j][j]);
     cut[i][j] = mix_distance(cut[i][i],cut[j][j]);
   }
 
   if (i==2 && j==2) {
     // single global cutoff = max of cut from all files read in
     // for funcfl could be multiple files
     // for setfl or fs, just one file
 
     if (funcfl) {
       cutmax = 0.0;
       for (int m = 0; m < nfuncfl; m++)
         cutmax = MAX(cutmax,funcfl[m].cut);
     } else if (setfl) cutmax = setfl->cut;
     else if (fs) cutmax = fs->cut;
 
     cutforcesq = cutmax*cutmax;
     cut[i][j] = cutforcesq;
     return cutmax;
   }
   
   lj1[i][j] = 48.0 * epsilon[i][j] * pow(sigma[i][j],12.0);
   lj2[i][j] = 24.0 * epsilon[i][j] * pow(sigma[i][j],6.0);
   lj3[i][j] = 4.0 * epsilon[i][j] * pow(sigma[i][j],12.0);
   lj4[i][j] = 4.0 * epsilon[i][j] * pow(sigma[i][j],6.0);
 
   if (offset_flag) {
     double ratio = sigma[i][j] / cut[i][j];
     offset[i][j] = 4.0 * epsilon[i][j] * (pow(ratio,12.0) - pow(ratio,6.0));
   } else offset[i][j] = 0.0;
 
   lj1[j][i] = lj1[i][j];
   lj2[j][i] = lj2[i][j];
   lj3[j][i] = lj3[i][j];
   lj4[j][i] = lj4[i][j];
   offset[j][i] = offset[i][j];
 
   // compute I,J contribution to long-range tail correction
   // count total # of atoms of type I and J via Allreduce
 
   if (tail_flag) {
     int *type = atom->type;
     int nlocal = atom->nlocal;
 
     double count[2],all[2];
     count[0] = count[1] = 0.0;
     for (int k = 0; k < nlocal; k++) {
       if (type[k] == i) count[0] += 1.0;
       if (type[k] == j) count[1] += 1.0;
     }
     MPI_Allreduce(count,all,2,MPI_DOUBLE,MPI_SUM,world);
         
     double sig2 = sigma[i][j]*sigma[i][j];
     double sig6 = sig2*sig2*sig2;
     double rc3 = cut[i][j]*cut[i][j]*cut[i][j];
     double rc6 = rc3*rc3;
     double rc9 = rc3*rc6;
     etail_ij = 8.0*MY_PI*all[0]*all[1]*epsilon[i][j] * 
       sig6 * (sig6 - 3.0*rc6) / (9.0*rc9); 
     ptail_ij = 16.0*MY_PI*all[0]*all[1]*epsilon[i][j] * 
       sig6 * (2.0*sig6 - 3.0*rc6) / (9.0*rc9); 
   } 
   
   return cut[i][j];
 }
 
 /* ---------------------------------------------------------------------- */
 
 int PairEAMLJGPU::pack_comm(int n, int *list, double *buf, int pbc_flag, 
 			  int *pbc)
 {
   int i,j,m;
 
   m = 0;
 
   if (fp_single) {
     float *fp_ptr = (float *)fp_pinned;
     for (i = 0; i < n; i++) {
       j = list[i];
       buf[m++] = static_cast<double>(fp_ptr[j]);
     }
   } else {
     double *fp_ptr = (double *)fp_pinned;
     for (i = 0; i < n; i++) {
       j = list[i];
       buf[m++] = fp_ptr[j];
     }
   }
 
   return 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMLJGPU::unpack_comm(int n, int first, double *buf)
 {
   int i,m,last;
 
   m = 0;
   last = first + n;
   if (fp_single) {
     float *fp_ptr = (float *)fp_pinned;
     for (i = first; i < last; i++) fp_ptr[i] = buf[m++];
   } else {
     double *fp_ptr = (double *)fp_pinned;
     for (i = first; i < last; i++) fp_ptr[i] = buf[m++];
   }
 }
diff --git a/src/GPU/pair_eam_lj_gpu.h b/src/GPU/pair_eam_lj_gpu.h
index 1dc682277..e3aa4e45a 100755
--- a/src/GPU/pair_eam_lj_gpu.h
+++ b/src/GPU/pair_eam_lj_gpu.h
@@ -1,68 +1,59 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef PAIR_CLASS
 
 PairStyle(eam/lj/gpu,PairEAMLJGPU)
 
 #else
 
 #ifndef LMP_PAIR_EAM_LJ_GPU_H
 #define LMP_PAIR_EAM_LJ_GPU_H
 
 #include "stdio.h"
 #include "pair_eam.h"
 
 namespace LAMMPS_NS {
 
 class PairEAMLJGPU : public PairEAM {
  public:
 
   PairEAMLJGPU(class LAMMPS *);
   virtual ~PairEAMLJGPU();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
   void init_style();
   double init_one(int, int);
   double memory_usage();
   void cpu_compute(int, int, int, int, int *, int *, int **);
   void cpu_compute_energy(int, int, int, int, int *, int *, int **);
   
   int pack_comm(int, int *, double *, int, int *);
   void unpack_comm(int, int, double *);
 
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
 
  protected:
   double cut_global;
   double **cut;
   double **epsilon,**sigma;
   double **lj1,**lj2,**lj3,**lj4,**offset;
 
   void allocate();
-
- private:
-  int gpu_mode;
-  double cpu_time;
-  int *gpulist;
-  void *fp_pinned;
-  bool fp_single;
-
-  
 };
 
 }
 
 #endif
 #endif
diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp
index 40c7e95de..dba7a9a82 100644
--- a/src/GPU/pair_table_gpu.cpp
+++ b/src/GPU/pair_table_gpu.cpp
@@ -1,345 +1,346 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
    
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "pair_table_gpu.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "integrate.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "universe.h"
 #include "update.h"
 #include "domain.h"
 #include "string.h"
 #include "gpu_extra.h"
 
 #define LOOKUP 0
 #define LINEAR 1
 #define SPLINE 2
 #define BITMAP 3
 
 // External functions from cuda library for atom decomposition
 
 int table_gpu_init(const int ntypes, double **cutsq, 
      double ***host_table_coeffs, double **host_table_data, 
 		 double *special_lj, const int nlocal, 
 		 const int nall, const int max_nbors, const int maxspecial,
 		 const double cell_size, int &gpu_mode, FILE *screen, 
      int tabstyle, int ntables, int tablength);
 void table_gpu_clear();
 int ** table_gpu_compute_n(const int ago, const int inum,
 			 const int nall, double **host_x, int *host_type, 
 			 double *sublo, double *subhi, int *tag, int **nspecial,
 			 int **special, const bool eflag, const bool vflag,
 			 const bool eatom, const bool vatom, int &host_start,
 			 int **ilist, int **jnum,
 			 const double cpu_time, bool &success);
 void table_gpu_compute(const int ago, const int inum, const int nall, 
 		     double **host_x, int *host_type, int *ilist, int *numj,
 		     int **firstneigh, const bool eflag, const bool vflag,
 		     const bool eatom, const bool vatom, int &host_start,
 		     const double cpu_time, bool &success);
 double table_gpu_bytes();
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairTableGPU::PairTableGPU(LAMMPS *lmp) : PairTable(lmp), 
   gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    free all arrays
 ------------------------------------------------------------------------- */
 
 PairTableGPU::~PairTableGPU()
 {
   table_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairTableGPU::compute(int eflag, int vflag)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
   
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh;
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
     firstneigh = table_gpu_compute_n(neighbor->ago, inum, nall,
 				   atom->x, atom->type, domain->sublo,
 				   domain->subhi, atom->tag, atom->nspecial,
 				   atom->special, eflag, vflag, eflag_atom,
 				   vflag_atom, host_start, 
 				   &ilist, &numneigh, cpu_time, success);
   } else {
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     table_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
 		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
 		    vflag_atom, host_start, cpu_time, success);
   }
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
   if (host_start<inum) {
     cpu_time = MPI_Wtime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
     cpu_time = MPI_Wtime() - cpu_time;
   }
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairTableGPU::init_style()
 {
   if (force->newton_pair) 
     error->all(FLERR,"Cannot use newton pair with table/gpu pair style");
 
   int ntypes = atom->ntypes;
   
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
   
   // pack tables and send them to device
   double ***table_coeffs = NULL;
   double **table_data = NULL;
   memory->create(table_coeffs, ntypes+1, ntypes+1, 8, "table:coeffs");
     
   Table *tb;
   for (int i = 1; i <= atom->ntypes; i++)
     for (int j = 1; j <= atom->ntypes; j++) {
       int n = tabindex[i][j];
       tb = &tables[n];
       table_coeffs[i][j][0] = n;
       table_coeffs[i][j][1] = tb->ntablebits;
       table_coeffs[i][j][2] = tb->nshiftbits;
       table_coeffs[i][j][3] = tb->nmask;
       table_coeffs[i][j][4] = tb->innersq;
       table_coeffs[i][j][5] = tb->invdelta;
       table_coeffs[i][j][6] = tb->deltasq6;
       table_coeffs[i][j][7] = 0.0;
   }
   
   if (tabstyle != BITMAP) {
     memory->create(table_data, ntables, 8*tablength, "table:data");
     for (int n = 0; n < ntables; n++) {
       tb = &tables[n];
       if (tabstyle == LOOKUP) {
         for (int k = 0; k<tablength-1; k++) {
           table_data[n][8*k+1] = tb->e[k];
           table_data[n][8*k+2] = tb->f[k];
        }
       } else if (tabstyle == LINEAR) {
         for (int k = 0; k<tablength; k++) {
           table_data[n][8*k+0] = tb->rsq[k];
           table_data[n][8*k+1] = tb->e[k];
           table_data[n][8*k+2] = tb->f[k];
           if (k<tablength-1) {
             table_data[n][8*k+3] = tb->de[k];
             table_data[n][8*k+4] = tb->df[k];
           }
        }
       } else if (tabstyle == SPLINE) {
         for (int k = 0; k<tablength; k++) {
           table_data[n][8*k+0] = tb->rsq[k];
           table_data[n][8*k+1] = tb->e[k];
           table_data[n][8*k+2] = tb->f[k];
           table_data[n][8*k+3] = tb->e2[k];
           table_data[n][8*k+4] = tb->f2[k];
         }
       } 
     }
   } else {
     int ntable = 1 << tablength;
     memory->create(table_data, ntables, 8*ntable, "table:data");
     
     for (int n = 0; n < ntables; n++) {
       tb = &tables[n];
       for (int k = 0; k<ntable; k++) {
         table_data[n][8*k+0] = tb->rsq[k];
         table_data[n][8*k+1] = tb->e[k];
         table_data[n][8*k+2] = tb->f[k];
         table_data[n][8*k+3] = tb->de[k];
         table_data[n][8*k+4] = tb->df[k];
         table_data[n][8*k+5] = tb->drsq[k];
       }
     }   
   }
  
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int success = table_gpu_init(atom->ntypes+1, cutsq, 
            table_coeffs, table_data,
 			     force->special_lj, atom->nlocal,
 			     atom->nlocal+atom->nghost, 300, maxspecial,
 			     cell_size, gpu_mode, screen, 
            tabstyle, ntables, tablength);
   GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
   
   memory->destroy(table_coeffs);
   memory->destroy(table_data);
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairTableGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + table_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairTableGPU::cpu_compute(int start, int inum, int eflag, int vflag, 
 			       int *ilist, int *numneigh, int **firstneigh) {
   int i,j,ii,jj,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,factor_lj,fraction,value,a,b;
   int *jlist;
   Table *tb;
   
   union_int_float_t rsq_lookup;
   int tlm1 = tablength - 1;
   
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   double *special_lj = force->special_lj;
 
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
 	tb = &tables[tabindex[itype][jtype]];
 	if (rsq < tb->innersq)
 	  error->one(FLERR,"Pair distance < table inner cutoff");
  
 	if (tabstyle == LOOKUP) {
 	  itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
 	  if (itable >= tlm1)
 	    error->one(FLERR,"Pair distance > table outer cutoff");
 	  fpair = factor_lj * tb->f[itable];
 	} else if (tabstyle == LINEAR) {
 	  itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
 	  if (itable >= tlm1)
 	    error->one(FLERR,"Pair distance > table outer cutoff");
 	  fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
 	  value = tb->f[itable] + fraction*tb->df[itable];
 	  fpair = factor_lj * value;
 	} else if (tabstyle == SPLINE) {
 	  itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
 	  if (itable >= tlm1)
 	    error->one(FLERR,"Pair distance > table outer cutoff");
 	  b = (rsq - tb->rsq[itable]) * tb->invdelta;
 	  a = 1.0 - b;
 	  value = a * tb->f[itable] + b * tb->f[itable+1] + 
 	    ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) * 
             tb->deltasq6;
 	  fpair = factor_lj * value;
 	} else {
 	  rsq_lookup.f = rsq;
 	  itable = rsq_lookup.i & tb->nmask;
 	  itable >>= tb->nshiftbits;
 	  fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
 	  value = tb->f[itable] + fraction*tb->df[itable];
 	  fpair = factor_lj * value;
 	}
 
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) {
 	  if (tabstyle == LOOKUP)
 	    evdwl = tb->e[itable];
 	  else if (tabstyle == LINEAR || tabstyle == BITMAP)
 	    evdwl = tb->e[itable] + fraction*tb->de[itable];
 	  else
 	    evdwl = a * tb->e[itable] + b * tb->e[itable+1] + 
 	      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * 
 	      tb->deltasq6;
 	  evdwl *= factor_lj;
 	}
 
 	if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
       }
     }
   }
 }
diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp
index 63186f1ad..f03687d02 100644
--- a/src/GPU/pair_yukawa_gpu.cpp
+++ b/src/GPU/pair_yukawa_gpu.cpp
@@ -1,229 +1,229 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
    
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Trung Dac Nguyen, W. Michael Brown (ORNL)
+   Contributing author: Trung Dac Nguyen (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "pair_yukawa_gpu.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "integrate.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "universe.h"
 #include "update.h"
 #include "domain.h"
 #include "string.h"
 #include "gpu_extra.h"
 
 // External functions from cuda library for atom decomposition
 
 int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
-                 double **host_a, double **offset, double *special_lj, 
-                 const int inum, const int nall, const int max_nbors,  
-                 const int maxspecial, const double cell_size, 
-                 int &gpu_mode, FILE *screen);
+		    double **host_a, double **offset, double *special_lj, 
+		    const int inum, const int nall, const int max_nbors,  
+		    const int maxspecial, const double cell_size, 
+		    int &gpu_mode, FILE *screen);
 void yukawa_gpu_clear();
-int ** yukawa_gpu_compute_n(const int ago, const int inum_full,
-              const int nall, double **host_x, int *host_type,
-              double *sublo, double *subhi, int *tag, int **nspecial,
-              int **special, const bool eflag, const bool vflag,
-              const bool eatom, const bool vatom, int &host_start,
-              int **ilist, int **jnum, const double cpu_time,
-              bool &success);
+int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall,
+			    double **host_x, int *host_type, double *sublo,
+			    double *subhi, int *tag, int **nspecial,
+			    int **special, const bool eflag, const bool vflag,
+			    const bool eatom, const bool vatom,
+			    int &host_start, int **ilist, int **jnum,
+			    const double cpu_time, bool &success);
 void yukawa_gpu_compute(const int ago, const int inum_full, const int nall,
-             double **host_x, int *host_type, int *ilist, int *numj,
-             int **firstneigh, const bool eflag, const bool vflag,
-             const bool eatom, const bool vatom, int &host_start,
-             const double cpu_time, bool &success);
+			double **host_x, int *host_type, int *ilist, int *numj,
+			int **firstneigh, const bool eflag, const bool vflag,
+			const bool eatom, const bool vatom, int &host_start,
+			const double cpu_time, bool &success);
 double yukawa_gpu_bytes();
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairYukawaGPU::PairYukawaGPU(LAMMPS *lmp) : PairYukawa(lmp), 
-    gpu_mode(GPU_FORCE)
+					    gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); 
 }
 
 /* ----------------------------------------------------------------------
    free all arrays
 ------------------------------------------------------------------------- */
 
 PairYukawaGPU::~PairYukawaGPU()
 {
   yukawa_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairYukawaGPU::compute(int eflag, int vflag)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
   
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
   
   bool success = true;
   int *ilist, *numneigh, **firstneigh;
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
     firstneigh = yukawa_gpu_compute_n(neighbor->ago, inum, nall,
-				   atom->x, atom->type, domain->sublo,
-				   domain->subhi, atom->tag, atom->nspecial,
-				   atom->special, eflag, vflag, eflag_atom,
-				   vflag_atom, host_start, 
-				   &ilist, &numneigh, cpu_time, success);
+				      atom->x, atom->type, domain->sublo,
+				      domain->subhi, atom->tag, atom->nspecial,
+				      atom->special, eflag, vflag, eflag_atom,
+				      vflag_atom, host_start, 
+				      &ilist, &numneigh, cpu_time, success);
   } else {
     inum = list->inum;
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
     yukawa_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		    vflag_atom, host_start, cpu_time, success);
+		       ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+		       vflag_atom, host_start, cpu_time, success);
   }
   if (!success)
     error->one(FLERR,"Out of memory on GPGPU");
 
   if (host_start<inum) {
     cpu_time = MPI_Wtime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
     cpu_time = MPI_Wtime() - cpu_time;
   }
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairYukawaGPU::init_style()
 {
- if (force->newton_pair) 
+  if (force->newton_pair) 
     error->all(FLERR,"Cannot use newton pair with yukawa/gpu pair style");
 
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
 
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int success = yukawa_gpu_init(atom->ntypes+1, cutsq, kappa, a,
-			     offset, force->special_lj, atom->nlocal,
-			     atom->nlocal+atom->nghost, 300, maxspecial,
-			     cell_size, gpu_mode, screen);
+				offset, force->special_lj, atom->nlocal,
+				atom->nlocal+atom->nghost, 300, maxspecial,
+				cell_size, gpu_mode, screen);
   GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode == GPU_FORCE) {
     int irequest = neighbor->request(this);
     neighbor->requests[irequest]->half = 0;
     neighbor->requests[irequest]->full = 1;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairYukawaGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + yukawa_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairYukawaGPU::cpu_compute(int start, int inum, int eflag, int vflag, 
 			       int *ilist, int *numneigh, int **firstneigh) {
   int i,j,ii,jj,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,r2inv,r,rinv,screening,forceyukawa,factor;
   int *jlist;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   double *special_lj = force->special_lj;
 
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor = special_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
 	r2inv = 1.0/rsq;
 	r = sqrt(rsq);
 	rinv = 1.0/r;
 	screening = exp(-kappa*r);
 	forceyukawa = a[itype][jtype] * screening * (kappa + rinv);
 
 	fpair = factor*forceyukawa * r2inv;
 
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) {
-	  evdwl = a[itype][jtype] * screening * rinv 
-      - offset[itype][jtype];
+	  evdwl = a[itype][jtype] * screening * rinv - offset[itype][jtype];
 	  evdwl *= factor;
 	}
 
 	if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
       }
     }
   }
 }