diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 0ca234508..106dc3971 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -1,577 +1,601 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
  
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #include "pair_gpu_atom.h"
 
 #define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
 
 #ifdef WINDLL
 #include <windows.h>
 typedef bool (*__win_sort_alloc)(const int max_atoms);
 typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
                            int *particle_begin);
 __win_sort_alloc _win_sort_alloc;
 __win_sort _win_sort;
 #endif
 
 template <class numtyp, class acctyp>
 PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
                               _vflag(false),_inum(0),_ilist(NULL), 
                               _newton(false) {
   #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
   sort_config.algorithm = CUDPP_SORT_RADIX;
   sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
 
   #ifdef WINDLL
   HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
   if (hinstLib == NULL) {
     printf("\nUnable to load gpu.dll\n");
     exit(1);
   }
   _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
   _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
   #endif
 
   #endif
 }
 
 template <class numtyp, class acctyp>
 int PairGPUAtomT::bytes_per_atom() const { 
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
   int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
   if (_rot)
     bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUAtomT::alloc(const int inum, const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
   if (_newton)
     _max_local=_max_atoms;
   else
     _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
 
   bool success=true;
   
   int ans_elements=4;
   if (_rot)
     ans_elements+=4;
   
  // If the device is the CPU, use host views to avoid host/device transfers
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
     cpuview=true;
     
   // Allocate storage for CUDPP sort
   #ifndef USE_OPENCL
   #ifdef WINDLL
   _win_sort_alloc(_max_atoms);
   #else
   if (_gpu_nbor) {
     CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);  
     if (CUDPP_SUCCESS != result)
       return false;
   }
   #endif
   #endif
 
   // --------------------------   Host allocations
   // Get a host write only buffer
   #ifdef GPU_CAST
   success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   success=success && (host_type_cast.alloc(_max_atoms,*dev,
                                            UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #else
   success=success && (host_x.alloc(_max_atoms*4,*dev,
                       UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #endif                      
   success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
   success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_rot)
     success=success && (host_quat.alloc(_max_atoms*4,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
 
     
   // ---------------------------  Device allocations
   _gpu_bytes=0;
   if (cpuview) {
     #ifdef GPU_CAST
     assert(0==1);
     #else
     dev_x.view(host_x);
     #endif
     dev_engv.view(host_engv);
     dev_ans.view(host_ans);
     if (_rot)
       dev_quat.view(host_quat);
     if (_charge)
       dev_q.view(host_q);
   } else {
     #ifdef GPU_CAST
     success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
     success=success && (UCL_SUCCESS==
                         dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
     success=success && (UCL_SUCCESS==
                         dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
     _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
     #else
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
     #endif
     success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
                                        UCL_WRITE_ONLY)==UCL_SUCCESS);
     success=success && (dev_ans.alloc(ans_elements*_max_local,
                                       *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
       _gpu_bytes+=dev_q.row_bytes();
     }
     if (_rot) {
       success=success && (dev_quat.alloc(_max_atoms*4,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
       _gpu_bytes+=dev_quat.row_bytes();
     }
   }
   if (_gpu_nbor) {
     success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
     if (_bonds) {
       success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       _gpu_bytes+=dev_tag.row_bytes();
     }
   }
 
   _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
   
   _allocated=true;  
   return success;
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
                         const bool rot, UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();
 
   bool success=true;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
   _rot=rot;
   _other=_charge || _rot;
   dev=&devi;
 
   _e_fields=1;
   if (_charge)
     _e_fields++;
   _ev_fields=6+_e_fields;
     
   // Initialize atom and nbor data
   int ef_inum=inum;
   if (ef_inum==0)
     ef_inum=1000;
   int ef_nall=nall;
   if (ef_nall<=ef_inum)
     ef_nall=ef_inum*2;
   
   // Initialize timers for the selected device
   time_pos.init(*dev);
   time_other.init(*dev);
   time_answer.init(*dev);
   time_pos.zero();
   time_other.zero();
   time_answer.zero();
   _time_cast=0.0;
   
   #ifdef GPU_CAST
   compile_kernels(*dev);
   #endif
   
   return success && alloc(ef_inum,ef_nall);
 }
   
+template <class numtyp, class acctyp>
+bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
+  bool realloc=false;
+  if (charge && _charge==false) {
+    _charge=true;
+    _e_fields++;
+    _ev_fields++;
+    realloc=true;
+  }
+  if (rot && _rot==false) {
+    _rot=true;
+    realloc=true;
+  }
+  if (realloc) {
+    // Reallocate host and device storage to include the new fields
+    _other=_charge || _rot;
+    int inum=_max_local;
+    int nall=_max_atoms;
+    clear_resize();
+    return alloc(inum,nall);
+  }
+  return true;
+}
+
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear_resize() {
   if (!_allocated)
     return;
   _allocated=false;
 
   dev_x.clear();
   if (_charge) { 
     dev_q.clear();
     host_q.clear();
   }
   if (_rot) {
     dev_quat.clear();
     host_quat.clear();
   }
   dev_ans.clear();
   dev_engv.clear();
   #ifndef GPU_CAST
   host_x.clear();
   #else
   host_x_cast.clear();
   host_type_cast.clear();
   #endif
   host_ans.clear();
   host_engv.clear();
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
   #ifdef GPU_CAST
   dev_x_cast.clear();
   dev_type_cast.clear();
   #endif
 
   #ifndef USE_OPENCL
   #ifndef WINDLL
   if (_gpu_nbor) cudppDestroyPlan(sort_plan);
   #endif
   #endif
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
   _gpu_bytes=0;
   if (!_allocated)
     return;
 
   time_pos.clear();
   time_other.clear();
   time_answer.clear();
   clear_resize();
   _inum=0;
   _eflag=false;
   _vflag=false;
 
   #ifdef GPU_CAST
   if (_compiled) {
     k_cast_x.clear();
     delete atom_program;
     _compiled=false;
   }
   #endif
 }
 
 template <class numtyp, class acctyp>
 double PairGPUAtomT::host_memory_usage() const {
   int atom_bytes=4;
   if (_charge) 
     atom_bytes+=1;
   if (_rot) 
     atom_bytes+=4;
   int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
          ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }
   
 template <class numtyp, class acctyp>
 void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                 const bool ef_atom, const bool vf_atom) {
   time_answer.start();
   _eflag=eflag;
   _vflag=vflag;
   _ef_atom=ef_atom;
   _vf_atom=vf_atom;
     
   int csize=_ev_fields;    
   if (!eflag)
     csize-=_e_fields;
   if (!vflag)
     csize-=6;
       
   if (csize>0)
     ucl_copy(host_engv,dev_engv,_inum*csize,true);
   if (_rot)
     ucl_copy(host_ans,dev_ans,_inum*4*2,true);
   else
     ucl_copy(host_ans,dev_ans,_inum*4,true);
   time_answer.stop();
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
                                 const bool ef_atom, const bool vf_atom,
                                 int *ilist) {
   _ilist=ilist;
   copy_answers(eflag,vflag,ef_atom,vf_atom);
 }
 
 template <class numtyp, class acctyp>
 double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
                                    double *virial) {
   if (_eflag==false && _vflag==false)
     return 0.0;
 
   double evdwl=0.0;
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[i][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   } else {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[ii][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   }
   
   evdwl*=0.5;
   return evdwl;
 }
 
 template <class numtyp, class acctyp>
 double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
                                    double *virial, double &ecoul) {
   if (_eflag==false && _vflag==false) {
     ecoul=0.0;
     return 0.0;
   }
 
   if (_charge==false)
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
   double _ecoul=0.0;
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
           _ecoul+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
           _ecoul+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[i][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   } else {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
           _ecoul+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
           _ecoul+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[ii][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   }
   
   evdwl*=0.5;
   ecoul+=_ecoul*0.5;
   return evdwl;
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::get_answers(double **f, double **tor) {
   acctyp *ap=host_ans.begin();
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
       f[i][0]+=*ap;
       ap++;
       f[i][1]+=*ap;
       ap++;
       f[i][2]+=*ap;
       ap+=2;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         tor[i][0]+=*ap;
         ap++;
         tor[i][1]+=*ap;
         ap++;
         tor[i][2]+=*ap;
         ap+=2;
       }
     }
   } else {
     for (int i=0; i<_inum; i++) {
       int ii=_ilist[i];
       f[ii][0]+=*ap;
       ap++;
       f[ii][1]+=*ap;
       ap++;
       f[ii][2]+=*ap;
       ap+=2;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         int ii=_ilist[i];
         tor[ii][0]+=*ap;
         ap++;
         tor[ii][1]+=*ap;
         ap++;
         tor[ii][2]+=*ap;
         ap+=2;
       }
     }
   }
 }
 
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
   #ifndef USE_OPENCL
   #ifdef WINDLL
   _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
             (int *)dev_particle_id.begin());
   #else
   CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), 
                                  (int *)dev_particle_id.begin(), 
                                  8*sizeof(unsigned), num_atoms);
   if (CUDPP_SUCCESS != result) {
     printf("Error in cudppSort\n");
     NVD_GERYON_EXIT;
   }
   #endif
   #endif
 }
 
 #ifdef GPU_CAST
 #ifdef USE_OPENCL
 #include "pair_gpu_atom_cl.h"
 #else
 #include "pair_gpu_atom_ptx.h"
 #endif
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
   atom_program=new UCL_Program(dev);
   atom_program->load_string(pair_gpu_atom_kernel,"");
   k_cast_x.set_function(*atom_program,"kernel_cast_x");
   _compiled=true;
 }
 
 #endif
 
 template class PairGPUAtom<PRECISION,ACC_PRECISION>;
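
The energy/virial readback in energy_virial() above assumes a field-major
layout for host_engv: all _inum values of one field are stored contiguously
before the next field begins, which is why each accumulation loop advances the
read pointer by _inum. Below is a minimal standalone sketch of the same walk
over a hypothetical toy buffer (plain arrays, not the library's UCL types):

    /* Field-major engv walk, mirroring energy_virial() for the
       no-charge case: 1 energy field followed by 6 virial fields. */
    #include <stdio.h>

    int main() {
      const int inum = 2, e_fields = 1, ev_fields = 6 + e_fields;
      double engv[ev_fields * inum];
      for (int f = 0; f < ev_fields; f++)      // engv[f*inum+i] holds
        for (int i = 0; i < inum; i++)         // field f of atom i
          engv[f * inum + i] = 100.0 * f + i;

      double evdwl = 0.0, virial[6] = {0.0};
      for (int i = 0; i < inum; i++) {
        const double *ap = engv + i;           // column for atom i
        evdwl += *ap; ap += inum;              // energy field
        for (int j = 0; j < 6; j++) {          // six virial fields
          virial[j] += *ap; ap += inum;
        }
      }
      printf("evdwl=%g v[0]=%g\n", 0.5 * evdwl, 0.5 * virial[0]);
      return 0;
    }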
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index e0a1fd9fb..2f816be03 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -1,419 +1,424 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifndef PAIR_GPU_ATOM_H
 #define PAIR_GPU_ATOM_H
 
 #include <math.h>
 #include "mpi.h"
 
 #ifdef USE_OPENCL
 
 #include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
 using namespace ucl_opencl;
 
 #else
 
 #include "cudpp.h"
 #include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
 using namespace ucl_cudadr;
 
 #endif
 
 #ifndef int2
 struct int2 { int x; int y; };
 #endif
 
 #include "pair_gpu_precision.h"
 
 template <class numtyp, class acctyp>
 class PairGPUAtom {
  public:
   PairGPUAtom();
   ~PairGPUAtom() { clear(); }
 
   /// Maximum number of atoms that can be stored with current allocation
   inline int max_atoms() const { return _max_atoms; }
   /// Current number of local+ghost atoms stored
   inline int nall() const { return _nall; }
   /// Current number of local atoms stored
   inline int inum() const { return _inum; }
 
   /// Set number of local+ghost atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
   /// Set number of local atoms for future copy operations
   inline void inum(const int n) { _inum=n; }
   
   /// Memory usage per atom in this class
   int bytes_per_atom() const; 
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
   bool init(const int inum, const int nall, const bool charge, const bool rot, 
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
   
   /// Check if we have enough device storage and realloc if not
   inline bool resize(const int inum, const int nall, bool &success) {
     _inum=inum;
     _nall=nall;
     if (inum>_max_local || nall>_max_atoms) {
       clear_resize();
       success = success && alloc(inum,nall);
       return true;
     }
     return false;
   }
+  
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param charge True if atom storage needs charges
+    * \param rot True if atom storage needs quaternions **/
+  bool add_fields(const bool charge, const bool rot);  
 
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
   
   /// Free all memory on host and device
   void clear();
  
   /// Return the total amount of host memory used by class in bytes
   double host_memory_usage() const;
 
   /// Sort arrays for neighbor list calculation on device
   void sort_neighbor(const int num_atoms);
   
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
     time_answer.add_to_total();
     if (_other)
       time_other.add_to_total();
   }
 
   /// Zero the copy timers
   inline void zero_timers() {
     time_pos.zero();
     time_answer.zero();
     if (_other)
       time_other.zero();
   }
 
   /// Return the total time for host/device data transfer
   inline double transfer_time() {
     double total=time_pos.total_seconds()+time_answer.total_seconds();
     if (_other) total+=time_other.total_seconds();
     return total;
   }
   
   /// Return the total time for data cast/pack
   inline double cast_time() { return _time_cast; }
 
   /// Pack LAMMPS atom type constants into matrix and copy to device
   template <class dev_typ, class t1>
   inline void type_pack1(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii]=static_cast<numtyp>(one[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   /// Pack LAMMPS atom type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void type_pack2(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one, t2 **two) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*2]=static_cast<numtyp>(one[i][j]);
         buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3>
   inline void type_pack4(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one, t2 **two, t3 **three) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*4]=static_cast<numtyp>(one[i][j]);
         buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
         buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
   
   /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3, class t4>
   inline void type_pack4(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one, t2 **two, t3 **three, t4 **four) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*4]=static_cast<numtyp>(one[i][j]);
         buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
         buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
         buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v, 
                          UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
     for (int i=0; i<n; i++) {
       buffer[i*2]=static_cast<numtyp>(one[i][i]);
       buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),n,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   // -------------------------COPY TO GPU ----------------------------------
 
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
     double t=MPI_Wtime();
     #ifdef GPU_CAST
     memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
     memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
     #else
     numtyp *_write_loc=host_x.begin();
     for (int i=0; i<_nall; i++) {
       *_write_loc=host_ptr[i][0];
       _write_loc++;
       *_write_loc=host_ptr[i][1];
       _write_loc++;
       *_write_loc=host_ptr[i][2];
       _write_loc++;
       *_write_loc=host_type[i];
       _write_loc++;
     }
     #endif
     _time_cast+=MPI_Wtime()-t;
   }      
 
   /// Copy positions and types to device asynchronously
   /** Copies nall() elements **/
   inline void add_x_data(double **host_ptr, int *host_type) { 
     time_pos.start();
     #ifdef GPU_CAST
     ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
     ucl_copy(dev_type_cast,host_type_cast,_nall,true);
     int block_size=64;
     int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
     k_cast_x.set_size(GX,block_size);
     k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
                  &_nall);
     #else
     ucl_copy(dev_x,host_x,_nall*4,true);
     #endif
     time_pos.stop();
   }
 
   /// Calls cast_x_data and add_x_data and times the routines
   inline void cast_copy_x(double **host_ptr, int *host_type) {
     cast_x_data(host_ptr,host_type);
     add_x_data(host_ptr,host_type);
   }
 
   /// Cast charges to write buffer
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
     double t=MPI_Wtime();
     if (dev->device_type()==UCL_CPU) {
       if (sizeof(numtyp)==sizeof(double)) {
         host_q.view((numtyp*)host_ptr,_nall,*dev);
         dev_q.view(host_q);
       } else
         for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
     } else {
       if (sizeof(numtyp)==sizeof(double))
         memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
       else
         for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
     }
     _time_cast+=MPI_Wtime()-t;
   }
 
   /// Copy charges to device asynchronously
   inline void add_q_data() {
     ucl_copy(dev_q,host_q,_nall,true);
   }
 
   /// Cast quaternions to write buffer
   template<class cpytyp>
   inline void cast_quat_data(cpytyp *host_ptr) {
     double t=MPI_Wtime();
     if (dev->device_type()==UCL_CPU) {
       if (sizeof(numtyp)==sizeof(double)) {
         host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
         dev_quat.view(host_quat);
       } else
         for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
     } else {
       if (sizeof(numtyp)==sizeof(double))
         memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
       else
         for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
     }
     _time_cast+=MPI_Wtime()-t;
   }
 
   /// Copy quaternions to device
   /** Copies nall()*4 elements **/
   inline void add_quat_data() {
     ucl_copy(dev_quat,host_quat,_nall*4,true);
   }
 
   /// Copy data other than position and type to device
   inline void add_other_data() {
     time_other.start();
     if (_charge)
       add_q_data();
     if (_rot)
       add_quat_data();
     time_other.stop();
   }
   
   /// Return number of bytes used on device
   inline double gpu_bytes() { return _gpu_bytes; } 
 
   // -------------------------COPY FROM GPU -------------------------------
 
   /// Copy answers from device into read buffer asynchronously
   void copy_answers(const bool eflag, const bool vflag,
                     const bool ef_atom, const bool vf_atom);
 
   /// Copy answers from device into read buffer asynchronously
   void copy_answers(const bool eflag, const bool vflag,
                     const bool ef_atom, const bool vf_atom, int *ilist);
   
   /// Copy energy and virial data into LAMMPS memory
   double energy_virial(double *eatom, double **vatom, double *virial);
 
   /// Copy energy and virial data into LAMMPS memory
   double energy_virial(double *eatom, double **vatom, double *virial,
                        double &ecoul);
 
   /// Add forces and torques from the GPU into a LAMMPS pointer
   void get_answers(double **f, double **tor);
 
   // ------------------------------ DATA ----------------------------------
 
   /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
   UCL_D_Vec<numtyp> dev_x;
   /// Charges
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
   /// Force and possibly torque
   UCL_D_Vec<acctyp> dev_ans;
   /// Energy and virial per-atom storage
   UCL_D_Vec<acctyp> dev_engv;
   
   #ifdef GPU_CAST
   UCL_D_Vec<double> dev_x_cast;
   UCL_D_Vec<int> dev_type_cast;
   UCL_H_Vec<double> host_x_cast;
   UCL_H_Vec<int> host_type_cast;
   #endif
 
   /// Buffer for moving positions to device
   UCL_H_Vec<numtyp> host_x;
   /// Buffer for moving charge data to GPU
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
   /// Force and possibly torque data on host
   UCL_H_Vec<acctyp> host_ans;
   /// Energy/virial data on host
   UCL_H_Vec<acctyp> host_engv;
   
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<unsigned> dev_cell_id;
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<int> dev_particle_id;
   /// Atom tag information for device nbor builds
   UCL_D_Vec<int> dev_tag;
 
   /// Device timers
   UCL_Timer time_pos, time_other, time_answer;
   
   /// Geryon device
   UCL_Device *dev;
 
  private:
   #ifdef GPU_CAST
   UCL_Program *atom_program;
   UCL_Kernel k_cast_x;
   void compile_kernels(UCL_Device &dev);
   #endif
 
   bool _compiled;
 
   bool alloc(const int inum, const int nall);
   
   bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
   int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
   bool _gpu_nbor, _bonds;
   int *_ilist;
   double _time_cast;
   
   double _gpu_bytes;
   
   bool _newton;
 
   #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
   CUDPPHandle sort_plan;
   #endif
 };
 
 #endif
 
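The add_fields() declaration above covers the case where a second pair style
initializes an already-active PairGPUAtom, e.g. a charged style following an
uncharged one under pair_style hybrid. A compile-and-run sketch of just the
field accounting (a hypothetical Fields mock, not the library class; the real
method must also reallocate the host and device buffers):

    #include <assert.h>

    // Mock of the bookkeeping in PairGPUAtom::add_fields()
    struct Fields {
      bool charge = false, rot = false;
      int e_fields = 1;        // energies: vdW, plus Coulomb if charged
      int ev_fields = 6 + 1;   // 6 virial fields + energy fields
      void add_fields(bool q, bool quat) {
        if (q && !charge) { charge = true; e_fields++; ev_fields++; }
        if (quat && !rot) rot = true;  // real code reallocates storage too
      }
    };

    int main() {
      Fields f;                   // first style had no charges
      f.add_fields(true, false);  // second style needs them
      assert(f.e_fields == 2 && f.ev_fields == 8);
      f.add_fields(true, false);  // no-op for fields already present
      assert(f.e_fields == 2);
      return 0;
    }
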
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index d02618e22..09728c77d 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -1,286 +1,288 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #include "pair_gpu_device.h"
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
                                   _last_device(0) {
 }
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::~PairGPUDevice() {
   clear_device();
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, 
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
                                  const int nthreads) {
   _nthreads=nthreads;
   #ifdef _OPENMP
   omp_set_num_threads(nthreads);
   #endif
 
   if (_device_init)
     return true;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
   _first_device=first_gpu;
   _last_device=last_gpu;
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
   MPI_Comm_size(_comm_world,&_world_size);
   // Get the rank/size within the replica
   MPI_Comm_rank(_comm_replica,&_replica_me);
   MPI_Comm_size(_comm_replica,&_replica_size);
 
   // Get the names of all nodes
   int name_length;
   char node_name[MPI_MAX_PROCESSOR_NAME];
   char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
   MPI_Get_processor_name(node_name,&name_length);
   MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                 MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
   std::string node_string=std::string(node_name);
   
   // Get the number of procs per node                
   std::map<std::string,int> name_map;
   std::map<std::string,int>::iterator np;
   for (int i=0; i<_world_size; i++) {
     std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
     np=name_map.find(i_string);
     if (np==name_map.end())
       name_map[i_string]=1;
     else
       np->second++;
   }
   int procs_per_node=name_map.begin()->second;
 
   // Assign a unique id to each node
   int split_num=0, split_id=0;
   for (np=name_map.begin(); np!=name_map.end(); ++np) {
     if (np->first==node_string)
       split_id=split_num;
     split_num++;
   }
   
   // Set up a per node communicator and find rank within
   MPI_Comm node_comm;
   MPI_Comm_split(_comm_world, split_id, 0, &node_comm);  
   int node_rank;
   MPI_Comm_rank(node_comm,&node_rank);                  
 
   // set the device ID
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
   int my_gpu=node_rank/_procs_per_gpu;
   
   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
   MPI_Comm_rank(_comm_gpu,&_gpu_rank);
 
   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
     return false;
   
   gpu->set(my_gpu);
   return true;
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, 
                           const int host_nlocal, const int nall,
                           const int maxspecial, const bool gpu_nbor, 
                           const int gpu_host, const int max_nbors, 
                           const double cell_size, const bool pre_cut) {
   if (!_device_init)
     return false;                          
   if (_init_count==0) {
     // Initialize atom and nbor data
     int ef_nlocal=nlocal;
     if (_particle_split<1.0 && _particle_split>0.0)
       ef_nlocal=static_cast<int>(_particle_split*nlocal);
     if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
                    gpu_nbor && maxspecial>0))
       return false;
     if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
                    gpu_host,pre_cut))
       return false;
     nbor.cell_size(cell_size);
   } else {
+    if (!atom.add_fields(charge,rot))
+      return false;
     if (cell_size>nbor.cell_size())
       nbor.cell_size(cell_size);
   }
 
   _init_count++;
   return true;
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::init_message(FILE *screen, const char *name,
                                   const int first_gpu, const int last_gpu) {
   #ifdef USE_OPENCL
   std::string fs="";
   #else
   std::string fs=toa(gpu->free_gigabytes())+"/";
   #endif
   
   if (_replica_me == 0 && screen) {
     fprintf(screen,"\n-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
     fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
     fprintf(screen,"-  with %d procs per device.\n",_procs_per_gpu);
     #ifdef _OPENMP
     fprintf(screen,"-  with %d threads per proc.\n",_nthreads);
     #endif
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
 
     for (int i=first_gpu; i<=last_gpu; i++) {
       std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                         toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                         " GHZ (";
       if (sizeof(PRECISION)==4) {
         if (sizeof(ACC_PRECISION)==4)
           sname+="Single Precision)";
         else
           sname+="Mixed Precision)";
       } else
         sname+="Double Precision)";
 
       fprintf(screen,"GPU %d: %s\n",i,sname.c_str());         
     }
 
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n\n");
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
                                   const double max_bytes, FILE *screen) {
   double single[5], times[5];
 
   single[0]=atom.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
   single[4]=atom.cast_time();
 
   MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes;
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
 
   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
         if (nbor.gpu_nbor())
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
       }
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
 
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
     _init_count--;
     if (_init_count==0) {
       atom.clear();
       nbor.clear();
     }
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear_device() {
   while (_init_count>0)
     clear();
   if (_device_init) {
     delete gpu;
     _device_init=false;
   }
 }
 
 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
   return atom.host_memory_usage()+
          nbor.host_memory_usage()+4*sizeof(numtyp)+
          sizeof(PairGPUDevice<numtyp,acctyp>);
 }
 
 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
 bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                      const int last_gpu, const int gpu_mode, 
                      const double particle_split, const int nthreads) {
   return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                                      particle_split,nthreads);
 }
 
 void lmp_clear_device() {
   pair_gpu_device.clear_device();
 }
 
 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
   if (pair_gpu_device.init_count()) {
     pair_gpu_device.stop_host_timer();
     pair_gpu_device.gpu->sync();
     double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
     pair_gpu_device.atom.get_answers(f,tor);
 
     return evdw;
   }
   return 0.0;
 }
 
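For reference, the rank-to-device mapping computed in init_device() above is a
block partition of the node-local ranks over the GPUs in [first_gpu,last_gpu].
A standalone sketch of the same arithmetic with made-up values (no MPI; six
ranks on a node sharing two devices):

    #include <math.h>
    #include <stdio.h>

    int main() {
      const int procs_per_node = 6, first_gpu = 0, last_gpu = 1;
      const int ngpu = last_gpu - first_gpu + 1;
      // Same ceil() split as init_device(): 6 ranks / 2 GPUs -> 3 per GPU
      const int procs_per_gpu =
          static_cast<int>(ceil(static_cast<double>(procs_per_node) / ngpu));
      for (int node_rank = 0; node_rank < procs_per_node; node_rank++)
        printf("node rank %d -> gpu %d\n", node_rank,
               node_rank / procs_per_gpu);
      return 0;
    }
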
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index 00a2c5d65..12137406e 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -1,681 +1,677 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "string.h"
 #include "ctype.h"
 #include "pair_hybrid.h"
 #include "atom.h"
 #include "force.h"
 #include "pair.h"
 #include "neighbor.h"
 #include "neigh_request.h"
 #include "update.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 /* ---------------------------------------------------------------------- */
 
 PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp)
 {
   nstyles = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairHybrid::~PairHybrid()
 {
   if (nstyles) {
     for (int m = 0; m < nstyles; m++) delete styles[m];
     delete [] styles;
     for (int m = 0; m < nstyles; m++) delete [] keywords[m];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy_2d_int_array(setflag);
     memory->destroy_2d_double_array(cutsq);
     memory->destroy_2d_int_array(nmap);
     memory->destroy_3d_int_array(map);
   }
 }
 
 /* ----------------------------------------------------------------------
   call each sub-style's compute function
   accumulate sub-style global/peratom energy/virial in hybrid
   for global vflag = 1:
     each sub-style computes own virial[6]
     sum sub-style virial[6] to hybrid's virial[6]
   for global vflag = 2:
     call sub-style with adjusted vflag to prevent it calling virial_compute()
     hybrid calls virial_compute() on final accumulated f
 ------------------------------------------------------------------------- */
 
 void PairHybrid::compute(int eflag, int vflag)
 {
   int i,j,m,n;
 
   // if no_virial_compute is set and global component of incoming vflag = 2
   // reset vflag as if global component were 1
   // necessary since one or more sub-styles cannot compute virial as F dot r
   
   if (no_virial_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4;
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = vflag_global = 
 	 eflag_atom = vflag_atom = 0;
 
   // check if global component of incoming vflag = 2
   // if so, reset vflag passed to substyle as if it were 0
   // necessary so substyle will not invoke virial_compute()
 
   int vflag_substyle;
   if (vflag % 4 == 2) vflag_substyle = vflag/4 * 4;
   else vflag_substyle = vflag;
 
   for (m = 0; m < nstyles; m++) {
     styles[m]->compute(eflag,vflag_substyle);
 
     if (eflag_global) {
       eng_vdwl += styles[m]->eng_vdwl;
       eng_coul += styles[m]->eng_coul;
     }
     if (vflag_global) {
       for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
     }
     if (eflag_atom) {
       n = atom->nlocal;
       if (force->newton_pair) n += atom->nghost;
       double *eatom_substyle = styles[m]->eatom;
       for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
     }
     if (vflag_atom) {
       n = atom->nlocal;
       if (force->newton_pair) n += atom->nghost;
       double **vatom_substyle = styles[m]->vatom;
       for (i = 0; i < n; i++)
 	for (j = 0; j < 6; j++)
 	  vatom[i][j] += vatom_substyle[i][j];
     }
   }
 
   if (vflag_fdotr) virial_compute();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_inner()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]->respa_enable) styles[m]->compute_inner();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_middle()
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]->respa_enable) styles[m]->compute_middle();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_outer(int eflag, int vflag)
 {
   for (int m = 0; m < nstyles; m++)
     if (styles[m]->respa_enable) styles[m]->compute_outer(eflag,vflag);
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays 
 ------------------------------------------------------------------------- */
 
 void PairHybrid::allocate()
 {
   allocated = 1;
   int n = atom->ntypes;
 
   setflag = memory->create_2d_int_array(n+1,n+1,"pair:setflag");
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       setflag[i][j] = 0;
 
   cutsq = memory->create_2d_double_array(n+1,n+1,"pair:cutsq");
 
   nmap = memory->create_2d_int_array(n+1,n+1,"pair:nmap");
   map = memory->create_3d_int_array(n+1,n+1,nstyles,"pair:map");
   for (int i = 1; i <= n; i++)
     for (int j = i; j <= n; j++)
       nmap[i][j] = 0;
 }
 
 /* ----------------------------------------------------------------------
    create one pair style for each arg in list
 ------------------------------------------------------------------------- */
 
 void PairHybrid::settings(int narg, char **arg)
 {
   int i,m,istyle;
 
   if (narg < 1) error->all("Illegal pair_style command");
 
   // delete old lists, since cannot just change settings
 
   if (nstyles) {
     for (m = 0; m < nstyles; m++) delete styles[m];
     delete [] styles;
     for (m = 0; m < nstyles; m++) delete [] keywords[m];
     delete [] keywords;
   }
 
   if (allocated) {
     memory->destroy_2d_int_array(setflag);
     memory->destroy_2d_double_array(cutsq);
     memory->destroy_2d_int_array(nmap);
     memory->destroy_3d_int_array(map);
   }
   allocated = 0;
 
   // count sub-styles by skipping numeric args
   // exception is 1st arg of style "table", which is non-numeric word
   // exception is 1st two args of style "lj/coul", which are non-numeric
   // exception is 1st two args of style "buck/coul", which are non-numeric
-  // exception is 1st arg of any "gpu" style, which is non-numeric
   // exception is 1st arg of reax/c style, which is non-numeric
   // need a better way to skip these exceptions
 
   nstyles = 0;
   i = 0;
   while (i < narg) {
     if (strcmp(arg[i],"table") == 0) i++;
     if (strcmp(arg[i],"lj/coul") == 0) i += 2;
     if (strcmp(arg[i],"buck/coul") == 0) i += 2;
-    if (strstr(arg[i],"gpu")) i++;
     if (strcmp(arg[i],"reax/c") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     nstyles++;
   }
 
   // allocate list of sub-styles
 
   styles = new Pair*[nstyles];
   keywords = new char*[nstyles];
 
   // allocate each sub-style and call its settings() with subset of args
   // define subset of args for a sub-style by skipping numeric args
   // exception is 1st arg of style "table", which is non-numeric
   // exception is 1st two args of style "lj/coul", which are non-numeric
   // exception is 1st two args of style "buck/coul", which are non-numeric
-  // exception is 1st arg of any "gpu" style, which is non-numeric
   // exception is 1st arg of reax/c style, which is non-numeric
   // need a better way to skip these exceptions
 
   nstyles = 0;
   i = 0;
   while (i < narg) {
     for (m = 0; m < nstyles; m++)
       if (strcmp(arg[i],keywords[m]) == 0) 
 	error->all("Pair style hybrid cannot use same pair style twice");
     if (strcmp(arg[i],"hybrid") == 0) 
       error->all("Pair style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0) 
       error->all("Pair style hybrid cannot have none as an argument");
     styles[nstyles] = force->new_pair(arg[i]);
     keywords[nstyles] = new char[strlen(arg[i])+1];
     strcpy(keywords[nstyles],arg[i]);
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     if (strcmp(arg[i],"lj/coul") == 0) i += 2;
     if (strcmp(arg[i],"buck/coul") == 0) i += 2;
-    if (strstr(arg[i],"gpu")) i++;
     if (strcmp(arg[i],"reax/c") == 0) i++;
     i++;
     while (i < narg && !isalpha(arg[i][0])) i++;
     styles[nstyles]->settings(i-istyle-1,&arg[istyle+1]);
     nstyles++;
   }
 
   // set comm_forward, comm_reverse to max of any sub-style
 
   for (m = 0; m < nstyles; m++) {
     if (styles[m]) comm_forward = MAX(comm_forward,styles[m]->comm_forward);
     if (styles[m]) comm_reverse = MAX(comm_reverse,styles[m]->comm_reverse);
   }
 
   // single_enable = 0 if any sub-style = 0
   // respa_enable = 1 if any sub-style is set
   // no_virial_compute = 1 if any sub-style is set
 
   for (m = 0; m < nstyles; m++)
     if (styles[m]->single_enable == 0) single_enable = 0;
   for (m = 0; m < nstyles; m++)
     if (styles[m]->respa_enable) respa_enable = 1;
   for (m = 0; m < nstyles; m++)
     if (styles[m]->no_virial_compute) no_virial_compute = 1;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairHybrid::coeff(int narg, char **arg)
 {
   if (narg < 3) error->all("Incorrect args for pair coefficients");
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   // 3rd arg = pair sub-style name
   // allow for "none" as valid sub-style name
 
   int m;
   for (m = 0; m < nstyles; m++)
     if (strcmp(arg[2],keywords[m]) == 0) break;
 
   int none = 0;
   if (m == nstyles) {
     if (strcmp(arg[2],"none") == 0) none = 1;
     else error->all("Pair coeff for hybrid has invalid style");
   }
 
   // move 1st/2nd args to 2nd/3rd args
   // just copy ptrs, since arg[] points into original input line
 
   arg[2] = arg[1];
   arg[1] = arg[0];
 
   // invoke sub-style coeff() starting with 1st arg
 
   if (!none) styles[m]->coeff(narg-1,&arg[1]);
 
   // if sub-style only allows one pair coeff call (with * * and type mapping)
   // then unset setflag/map assigned to that style before setting it below
   // in case pair coeff for this sub-style is being called for 2nd time
 
   if (!none && styles[m]->one_coeff)
     for (int i = 1; i <= atom->ntypes; i++)
       for (int j = i; j <= atom->ntypes; j++)
 	if (nmap[i][j] && map[i][j][0] == m) {
 	  setflag[i][j] = 0;
 	  nmap[i][j] = 0;
 	}
 
   // set setflag and which type pairs map to which sub-style
   // if sub-style is none: set hybrid setflag, wipe out map
   // else: set hybrid setflag & map only if substyle setflag is set
   //       previous mappings are wiped out
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       if (none) {
 	setflag[i][j] = 1;
 	nmap[i][j] = 0;
 	count++;
       } else if (styles[m]->setflag[i][j]) {
 	setflag[i][j] = 1;
 	nmap[i][j] = 1;
 	map[i][j][0] = m;
 	count++;
       }
     }
   }
 
   if (count == 0) error->all("Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairHybrid::init_style()
 {
   int i,m,itype,jtype,used,istyle,skip;
 
   // error if a sub-style is not used
 
   int ntypes = atom->ntypes;
 
   for (istyle = 0; istyle < nstyles; istyle++) {
     used = 0;
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = itype; jtype <= ntypes; jtype++)
 	for (m = 0; m < nmap[itype][jtype]; m++)
 	  if (map[itype][jtype][m] == istyle) used = 1;
     if (used == 0) error->all("Pair hybrid sub-style is not used");
   }
 
   // each sub-style makes its neighbor list request(s)
 
   for (istyle = 0; istyle < nstyles; istyle++) styles[istyle]->init_style();
 
   // create skip lists for each pair neigh request
   // any kind of list can have its skip flag set at this stage
 
   for (i = 0; i < neighbor->nrequest; i++) {
     if (!neighbor->requests[i]->pair) continue;
 
     // istyle = associated sub-style
 
     for (istyle = 0; istyle < nstyles; istyle++)
       if (styles[istyle] == neighbor->requests[i]->requestor) break;
 
     // allocate iskip and ijskip
     // initialize so as to skip all pair types
     // set ijskip = 0 if type pair matches any entry in sub-style map
     // set ijskip = 0 if mixing will assign type pair to this sub-style
     //   will occur if type pair is currently unassigned
     //   and both I,I and J,J are assigned to single sub-style
     //   and sub-style for both I,I and J,J match istyle
     // set iskip = 1 only if all ijskip for itype are 1
 
     int *iskip = new int[ntypes+1];
     int **ijskip = memory->create_2d_int_array(ntypes+1,ntypes+1,
 					       "pair_hybrid:ijskip");
 
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = 1; jtype <= ntypes; jtype++)
 	ijskip[itype][jtype] = 1;
 
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = itype; jtype <= ntypes; jtype++) {
 	for (m = 0; m < nmap[itype][jtype]; m++)
 	  if (map[itype][jtype][m] == istyle) 
 	    ijskip[itype][jtype] = ijskip[jtype][itype] = 0;
 	if (nmap[itype][jtype] == 0 &&
 	    nmap[itype][itype] == 1 && map[itype][itype][0] == istyle &&
 	    nmap[jtype][jtype] == 1 && map[jtype][jtype][0] == istyle)
 	  ijskip[itype][jtype] = ijskip[jtype][itype] = 0;
       }
 
     for (itype = 1; itype <= ntypes; itype++) {
       iskip[itype] = 1;
       for (jtype = 1; jtype <= ntypes; jtype++)
 	if (ijskip[itype][jtype] == 0) iskip[itype] = 0;
     }
 
     // if any skipping occurs
     // set request->skip and copy iskip and ijskip into request
     // else delete iskip and ijskip
 
     skip = 0;
     for (itype = 1; itype <= ntypes; itype++)
       for (jtype = 1; jtype <= ntypes; jtype++)
 	if (ijskip[itype][jtype] == 1) skip = 1;
 
     if (skip) {
       neighbor->requests[i]->skip = 1;
       neighbor->requests[i]->iskip = iskip;
       neighbor->requests[i]->ijskip = ijskip;
     } else {
       delete [] iskip;
       memory->destroy_2d_int_array(ijskip);
     }
   }
 
   // combine sub-style neigh list requests and create new ones if needed
 
   modify_requests();
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairHybrid::init_one(int i, int j)
 {
   // if I,J is not set explicitly:
   // perform mixing only if I,I sub-style = J,J sub-style
   // also require I,I and J,J are both assigned to single sub-style
 
   if (setflag[i][j] == 0) {
     if (nmap[i][i] != 1 || nmap[j][j] != 1 || map[i][i][0] != map[j][j][0])
       error->one("All pair coeffs are not set");
     nmap[i][j] = 1;
     map[i][j][0] = map[i][i][0];
   }
 
   // call init/mixing for all sub-styles of I,J
   // set cutsq in sub-style just as pair::init_one() does
   // sum tail corrections for I,J
   // return max cutoff of all sub-styles assigned to I,J
   // if no sub-styles assigned to I,J (pair_coeff none), cutmax = 0.0 returned
 
   double cutmax = 0.0;
   if (tail_flag) etail_ij = ptail_ij = 0.0;
 
   nmap[j][i] = nmap[i][j];
 
   for (int k = 0; k < nmap[i][j]; k++) {
     map[j][i][k] = map[i][j][k];
     double cut = styles[map[i][j][k]]->init_one(i,j);
     styles[map[i][j][k]]->cutsq[i][j] = 
       styles[map[i][j][k]]->cutsq[j][i] = cut*cut;
     if (tail_flag) {
       etail_ij += styles[map[i][j][k]]->etail_ij;
       ptail_ij += styles[map[i][j][k]]->ptail_ij;
     }
     cutmax = MAX(cutmax,cut);
   }
 
   return cutmax;
 }
 
 /* ----------------------------------------------------------------------
    combine sub-style neigh list requests and create new ones if needed
 ------------------------------------------------------------------------- */
 
 void PairHybrid::modify_requests()
 {
   int i,j;
   NeighRequest *irq,*jrq;
 
   // loop over pair requests only
   // if list is skip list and not copy, look for non-skip list of same kind
   // if one exists, point at that one
   // else make new non-skip request of same kind and point at that one
   //   don't bother to set ID for new request, since pair hybrid ignores list
   // only exception is half_from_full:
   //   ignore it, turn off skip, since it will derive from its skip parent
   // after possible new request creation, unset skip flag and otherlist
   //   for these derived lists: granhistory, rRESPA inner/middle
   //   this prevents neighbor from treating them as skip lists
   // copy list check is for pair style = hybrid/overlay
   //   which invokes this routine
 
   for (i = 0; i < neighbor->nrequest; i++) {
     if (!neighbor->requests[i]->pair) continue;
 
     irq = neighbor->requests[i];
     if (irq->skip == 0 || irq->copy) continue;
     if (irq->half_from_full) {
       irq->skip = 0;
       continue;
     }
 
     for (j = 0; j < neighbor->nrequest; j++) {
       if (!neighbor->requests[j]->pair) continue;
       jrq = neighbor->requests[j];
       if (irq->same_kind(jrq) && jrq->skip == 0) break;
     }
 
     if (j < neighbor->nrequest) irq->otherlist = j;
     else {
       int newrequest = neighbor->request(this);
       neighbor->requests[newrequest]->copy_request(irq);
       irq->otherlist = newrequest;
     }
 
     if (irq->granhistory || irq->respainner || irq->respamiddle) {
       irq->skip = 0;
       irq->otherlist = -1;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairHybrid::write_restart(FILE *fp)
 {
   fwrite(&nstyles,sizeof(int),1,fp);
 
   // each sub-style writes its settings, but no coeff info
 
   int n;
   for (int m = 0; m < nstyles; m++) {
     n = strlen(keywords[m]) + 1;
     fwrite(&n,sizeof(int),1,fp);
     fwrite(keywords[m],sizeof(char),n,fp);
     styles[m]->write_restart_settings(fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairHybrid::read_restart(FILE *fp)
 {
   int me = comm->me;
   if (me == 0) fread(&nstyles,sizeof(int),1,fp);
   MPI_Bcast(&nstyles,1,MPI_INT,0,world);
 
   styles = new Pair*[nstyles];
   keywords = new char*[nstyles];
   
   // each sub-style is created via new_pair()
   // each reads its settings, but no coeff info
 
   int n;
   for (int m = 0; m < nstyles; m++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
     styles[m] = force->new_pair(keywords[m]);
     styles[m]->read_restart_settings(fp);
   }
 }
 
 /* ----------------------------------------------------------------------
    call sub-style to compute single interaction
    error if sub-style does not support single() call
    since overlay could have multiple sub-styles, sum results explicitly
 ------------------------------------------------------------------------- */
 
 double PairHybrid::single(int i, int j, int itype, int jtype,
 			  double rsq, double factor_coul, double factor_lj,
 			  double &fforce)
 {
   if (nmap[itype][jtype] == 0)
     error->one("Invoked pair single on pair style none");
 
   double fone;
   fforce = 0.0;
   double esum = 0.0;
 
   for (int m = 0; m < nmap[itype][jtype]; m++) {
     if (rsq < styles[map[itype][jtype][m]]->cutsq[itype][jtype]) {
       if (styles[map[itype][jtype][m]]->single_enable == 0)
 	error->all("Pair hybrid sub-style does not support single call");
       esum += styles[map[itype][jtype][m]]->
 	single(i,j,itype,jtype,rsq,factor_coul,factor_lj,fone);
       fforce += fone;
     }
   }
 
   return esum;
 }
 
 /* ----------------------------------------------------------------------
    modify parameters of the pair style
    call modify_params of PairHybrid
    also pass command args to each sub-style of hybrid
 ------------------------------------------------------------------------- */
 
 void PairHybrid::modify_params(int narg, char **arg)
 {
   Pair::modify_params(narg,arg);
   for (int m = 0; m < nstyles; m++) styles[m]->modify_params(narg,arg);
 }
 
 /* ----------------------------------------------------------------------
    memory usage of each sub-style
 ------------------------------------------------------------------------- */
 
 double PairHybrid::memory_usage()
 {
   double bytes = maxeatom * sizeof(double);
   bytes += maxvatom*6 * sizeof(double);
   for (int m = 0; m < nstyles; m++) bytes += styles[m]->memory_usage();
   return bytes;
 }
 
 /* ----------------------------------------------------------------------
    extract a ptr to a particular quantity stored by pair
    pass request thru to sub-styles
    return first non-NULL result except for cut_coul request
    for cut_coul, insure all non-NULL results are equal since required by Kspace
 ------------------------------------------------------------------------- */
 
 void *PairHybrid::extract(char *str, int &dim)
 {
   void *cutptr = NULL;
   void *ptr;
   double cutvalue;
 
   for (int m = 0; m < nstyles; m++) {
     ptr = styles[m]->extract(str,dim);
     if (ptr && strcmp(str,"cut_coul") == 0) {
       double *p_newvalue = (double *) ptr;
       double newvalue = *p_newvalue;
       if (cutptr && newvalue != cutvalue)
 	error->all("Coulomb cutoffs of pair hybrid sub-styles do not match");
       cutptr = ptr;
       cutvalue = newvalue;
     } else if (ptr) return ptr;
   }
 
   if (strcmp(str,"cut_coul") == 0) return cutptr;
   return NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::reset_dt()
 {
   for (int m = 0; m < nstyles; m++) styles[m]->reset_dt();
 }
 
 /* ----------------------------------------------------------------------
    check if itype,jtype maps to sub-style
 ------------------------------------------------------------------------- */
 
 int PairHybrid::check_ijtype(int itype, int jtype, char *substyle)
 {
   for (int m = 0; m < nmap[itype][jtype]; m++)
     if (strcmp(keywords[map[itype][jtype][m]],substyle) == 0) return 1;
   return 0;
 }