diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index 0c3558e67..766131f4f 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -1,190 +1,609 @@
 /***************************************************************************
                                 lal_eam.cpp
                              -------------------
                       W. Michael Brown, Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the eam pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
  
 #ifdef USE_OPENCL
 #include "eam_cl.h"
 #else
 #include "eam_ptx.h"
 #endif
 
 #include "lal_eam.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define EAMT EAM<numtyp, acctyp>
 
-extern Device<PRECISION,ACC_PRECISION> device;
+extern Device<PRECISION,ACC_PRECISION> global_device;
 
 template <class numtyp, class acctyp>
-EAMT::EAM() : BaseCharge<numtyp,acctyp>(), 
-                      _allocated(false) {
+EAMT::EAM() : _compiled(false), _max_bytes(0), _allocated(false) {
+  device=&global_device;
+  ans=new Answer<numtyp,acctyp>();
+  nbor=new Neighbor();
 }
 
 template <class numtyp, class acctyp>
 EAMT::~EAM() {
+  delete ans;
+  delete nbor;
   clear();
 }
  
 template <class numtyp, class acctyp>
 int EAMT::bytes_per_atom(const int max_nbors) const {
-  return this->bytes_per_atom_atomic(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int EAMT::init_atomic(const int nlocal, const int nall,
+                      const int max_nbors, const int maxspecial,
+                      const double cell_size,
+                      const double gpu_split, FILE *_screen,
+                      const char *pair_program) {
+  screen=_screen;
+
+  int gpu_nbor=0;
+  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  _threads_per_atom=device->threads_per_charge();
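+  // with multiple threads per atom and host-built neighbor lists, use the
+  // packed neighbor list layout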
+  if (_threads_per_atom>1 && gpu_nbor==0) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+    
+  bool charge = true;
+  bool rot = false;
+  bool pre_cut = false;
+  int success=device->init(*ans,charge,rot,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,pre_cut,
+                           _threads_per_atom);
+  if (success!=0)
+    return success;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
+  compile_kernels(*ucl_device,pair_program);
+
+  // Initialize host-device load balancer
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  // Initialize timers for the selected GPU
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
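+  // the charge texture/array is reused to carry the embedding derivative fp
+  // during the force stage (see compute() and kernel_pair)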
+  pos_tex.bind_float(atom->dev_x,4);
+  q_tex.bind_float(atom->dev_q,1);
+
+  _max_an_bytes=ans->gpu_bytes()
+    +nbor->gpu_bytes();
+
+  return success;
 }
 
 template <class numtyp, class acctyp>
 int EAMT::init(const int ntypes, double host_cutforcesq,
-              int **host_type2rhor, int **host_type2z2r,
+              int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
               double ***host_rhor_spline, double ***host_z2r_spline,
-              double rdr, int nrhor, int nz2r, int nr,
+              double ***host_frho_spline,
+              double rdr, double rdrho, int nrhor, int nrho, 
+              int nz2r, int nfrho, int nr,
               const int nlocal, const int nall, const int max_nbors,
               const int maxspecial, const double cell_size,
               const double gpu_split, FILE *_screen) 
 {
-   
   int success;
-  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+  success=init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                             _screen,eam);
   
   if (success!=0)
     return success;
-  
+    
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
 
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   
   _ntypes=lj_types;
   _cutforcesq=host_cutforcesq;
   _rdr=rdr;
+  _rdrho = rdrho;
   _nrhor=nrhor;
+  _nrho=nrho;
   _nz2r=nz2r;
+  _nfrho=nfrho;
   _nr=nr;
   
   UCL_H_Vec<numtyp> dview_type(lj_types*lj_types*2,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
   
   for (int i=0; i<lj_types*lj_types*2; i++)
     dview_type[i]=(numtyp)0.0; 
                                 
   // pack type2rhor and type2z2r
   type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   
   this->atom->type_pack2(ntypes,lj_types,type2rhor_z2r,dview_type,
                         host_type2rhor,
                         host_type2z2r);
+  
+  // pack type2frho
+  UCL_H_Vec<numtyp> dview_type2frho(ntypes,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
 
+  type2frho.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<ntypes; i++)
+    dview_type2frho[i]=(numtyp)host_type2frho[i];
+  ucl_copy(type2frho,dview_type2frho,false);
+                        
+  // pack frho_spline
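+  // flatten the host [nfrho][nrho+1][7] table into 1-D: (i*(nrho+1)+m)*7+k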
+  UCL_H_Vec<numtyp> dview_frho_spline(nfrho*(nrho+1)*7,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+                               
+  for (int ix=0; ix<nfrho; ix++)
+    for (int iy=0; iy<nrho+1; iy++)
+      for (int iz=0; iz<7; iz++) 
+    dview_frho_spline[ix*(nrho+1)*7+iy*7+iz]=host_frho_spline[ix][iy][iz];
+  
+  frho_spline.alloc(nfrho*(nrho+1)*7,*(this->ucl_device),UCL_READ_ONLY);
+  ucl_copy(frho_spline,dview_frho_spline,false);
+  
   // pack rhor_spline
   UCL_H_Vec<numtyp> dview_rhor_spline(nrhor*(nr+1)*7,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++)
       for (int iz=0; iz<7; iz++) 
     dview_rhor_spline[ix*(nr+1)*7+iy*7+iz]=host_rhor_spline[ix][iy][iz];
   
   rhor_spline.alloc(nrhor*(nr+1)*7,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(rhor_spline,dview_rhor_spline,false);
   
   // pack z2r_spline
   UCL_H_Vec<numtyp> dview_z2r_spline(nz2r*(nr+1)*7,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++)
       for (int iz=0; iz<7; iz++) 
     dview_z2r_spline[ix*(nr+1)*7+iy*7+iz]=host_z2r_spline[ix][iy][iz];
   
   z2r_spline.alloc(nz2r*(nr+1)*7,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(z2r_spline,dview_z2r_spline,false);
 
   _allocated=true;
-  this->_max_bytes=type2rhor_z2r.row_bytes()+
-        rhor_spline.row_bytes()+z2r_spline.row_bytes();
+  this->_max_bytes=type2rhor_z2r.row_bytes()
+        + type2frho.row_bytes()
+        + rhor_spline.row_bytes()+z2r_spline.row_bytes()
+        + frho_spline.row_bytes();
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void EAMT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
 template <class numtyp, class acctyp>
 void EAMT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
   
   type2rhor_z2r.clear();
+  type2frho.clear();
   rhor_spline.clear();
   z2r_spline.clear();
-  this->clear_atomic();
+  frho_spline.clear();
+  
+  host_fp.clear();
+  dev_fp.clear();
+
+  // Output any timing information
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
+
+  if (_compiled) {
+    k_pair_fast.clear();
+    k_pair.clear();
+    k_energy.clear();
+    delete pair_program;
+    _compiled=false;
+  }
+  
+  time_pair.clear();
+  hd_balancer.clear();
+
+  nbor->clear();
+  ans->clear();
+  device->clear();
 }
 
 template <class numtyp, class acctyp>
 double EAMT::host_memory_usage() const {
-  return this->host_memory_usage_atomic()+sizeof(EAM<numtyp,acctyp>);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(EAM<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int * EAMT::reset_nbors(const int nall, const int inum, int *ilist,
+                                   int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return NULL;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+// ---------------------------------------------------------------------------
+// Build neighbor list on device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+inline void EAMT::build_nbor_list(const int inum, const int host_inum,
+                                         const int nall, double **host_x,
+                                         int *host_type, double *sublo,
+                                         double *subhi, int *tag, 
+                                         int **nspecial, int **special,
+                                         bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
+                        nspecial, special, success, mn);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate forces, virials, and energies (nbor list was copied in compute_energy)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void EAMT::compute(const int f_ago, const int inum_full,
+                   const int nall, double **host_x, int *host_type,
+                   int *ilist, int *numj, int **firstneigh,
+                   const bool eflag, const bool vflag,
+                   const bool eatom, const bool vatom,
+                   int &host_start, const double cpu_time,
+                   bool &success, double *host_q,
+                   const int nlocal, double *boxlo, double *prd) {
+  acc_timers();
+  
+  // the neighbor list was already set up by compute_energy (the density stage)
+
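+  // host_q carries the per-atom fp values computed in compute_energy; stage
+  // them through the charge buffers for the force kernel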
+  atom->cast_q_data(host_q);
+  atom->add_q_data();
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
+}
+
+// ---------------------------------------------------------------------------
+// Compute forces, virials, and energies (reneighboring was done in compute_energy)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** EAMT::compute(const int ago, const int inum_full,
+                    const int nall, double **host_x, int *host_type,
+                    double *sublo, double *subhi, int *tag,
+                    int **nspecial, int **special, const bool eflag, 
+                    const bool vflag, const bool eatom,
+                    const bool vatom, int &host_start,
+                    int **ilist, int **jnum,
+                    const double cpu_time, bool &success,
+                    double *host_q, double *boxlo, double *prd, int inum) {
+  acc_timers();
+  
+  // use the atom count set by the load balancer during compute_energy
+  ans->inum(inum);
+  host_start=inum;
+
+  atom->cast_q_data(host_q);
+  hd_balancer.start_timer();
+  atom->add_q_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+  
+  return nbor->host_jlist.begin()-host_start;
+}
+
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then compute per-atom densities and embedding energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void EAMT::compute_energy(const int f_ago, const int inum_full,
+                   const int nall, double **host_x, int *host_type,
+                   int *ilist, int *numj, int **firstneigh,
+                   const bool eflag, const bool vflag,
+                   const bool eatom, const bool vatom,
+                   int &host_start, const double cpu_time,
+                   bool &success, double *fp,
+                   const int nlocal, double *boxlo, double *prd, 
+                   double *evdwl) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if reallocated by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+  
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+
+  energy(eflag,vflag);
+  
+  // copy fp from device to host for comm
+  
+  ucl_copy(host_fp,dev_fp,false);
+  
+  numtyp *ap=host_fp.begin();
+  for (int i=0; i<inum; i++) {
+    fp[i]=*ap;
+    ap++;
+  }
+  
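+  // sum the per-atom embedding energies computed on the device into evdwl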
+  if (eflag) {
+    double e=0.0;
+    ucl_copy(ans->host_engv,ans->dev_engv,false); 
+    for (int i=0; i<inum; i++) 
+      e+=ans->host_engv[i];
+    *evdwl+=e;
+  }
+
+  hd_balancer.stop_timer();
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute per-atom densities and embedding energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** EAMT::compute_energy(const int ago, const int inum_full,
+                    const int nall, double **host_x, int *host_type,
+                    double *sublo, double *subhi, int *tag,
+                    int **nspecial, int **special, const bool eflag, 
+                    const bool vflag, const bool eatom,
+                    const bool vatom, int &host_start,
+                    int **ilist, int **jnum,
+                    const double cpu_time, bool &success,
+                    double *fp, double *boxlo, double *prd,
+                    double *evdwl, int &inum) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if reallocated by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+  
+  // load balance, returning the atom count on the device (inum)
+  hd_balancer.balance(cpu_time);
+  inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+ 
+  // Build neighbor list on GPU if necessary 
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  energy(eflag,vflag);
+  
+  // copy fp from device to host for comm
+  
+  ucl_copy(host_fp,dev_fp,false);
+  
+  numtyp *ap=host_fp.begin();
+  for (int i=0; i<inum; i++) {
+    fp[i]=*ap;
+    ap++;
+  }
+  
+  if (eflag) {
+    double e=0.0;
+    ucl_copy(ans->host_engv,ans->dev_engv,false); 
+    for (int i=0; i<inum; i++) 
+      e+=ans->host_engv[i];
+    *evdwl+=e;
+  }     
+
+  hd_balancer.stop_timer();
+  
+  return nbor->host_jlist.begin()-host_start;
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &this->atom->dev_q.begin(), 
                    &type2rhor_z2r.begin(),
                    &rhor_spline.begin(), &z2r_spline.begin(),
                    &this->nbor->dev_nbor.begin(),
                    &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                    &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                   &nbor_pitch, &_cutforcesq, &_rdr,
-                   &_nrhor, &_nz2r, &_nr,
+                   &nbor_pitch, &_cutforcesq, &_rdr, &_nr,
                    &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &this->atom->dev_q.begin(), 
                    &type2rhor_z2r.begin(),
                    &rhor_spline.begin(), &z2r_spline.begin(),
                    &this->nbor->dev_nbor.begin(),
                    &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                    &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                   &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr,
-                   &_nrhor, &_nz2r, &_nr,
+                   &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
                    &this->_threads_per_atom);
   }
 
   this->time_pair.stop();
 }
 
+// ---------------------------------------------------------------------------
+// Calculate per-atom densities and embedding energies (fp)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void EAMT::energy(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+  
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  
+  
+  this->k_energy.set_size(GX,BX);
+  this->k_energy.run(&this->atom->dev_x.begin(), 
+                 &type2rhor_z2r.begin(), &type2frho.begin(),
+                 &rhor_spline.begin(), &frho_spline.begin(),
+                 &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(),
+                 &dev_fp.begin(), 
+                 &ans->dev_engv.begin(),
+                 &eflag, &vflag, &ainum,
+                 &nbor_pitch, 
+                 &_ntypes, &_cutforcesq, 
+                 &_rdr, &_rdrho,
+                 &_nrho, &_nr,
+                 &this->_threads_per_atom);
+
+  this->time_pair.stop();
+}
+
+template <class numtyp, class acctyp>
+void EAMT::compile_kernels(UCL_Device &dev, const char *pair_str) {
+  if (_compiled)
+    return;
+
+  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
+                    std::string(OCL_PRECISION_COMPILE)+" -D"+
+                    std::string(OCL_VENDOR);
+
+  pair_program=new UCL_Program(dev);
+  pair_program->load_string(pair_str,flags.c_str());
+  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
+  k_pair.set_function(*pair_program,"kernel_pair");
+  k_energy.set_function(*pair_program,"kernel_energy");
+  pos_tex.get_texture(*pair_program,"pos_tex");
+  q_tex.get_texture(*pair_program,"q_tex");
+
+  _compiled=true;
+}
+
 template class EAM<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu
index 74539cc32..7e3f8fc42 100644
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@@ -1,258 +1,357 @@
 // **************************************************************************
 //                             lal_eam.cu
 //                             -------------------
-//                     W. Michael Brown, Trung Dac Nguyen (ORNL)
+//                     Trung Dac Nguyen, W. Michael Brown (ORNL)
 //
-//  Device code for acceleration of the lj/cut/coul/fsww/cut pair style
+//  Device code for acceleration of the eam pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
 //    email                : brownw@ornl.gov nguyentd@ornl.gov
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 #ifndef _DOUBLE_DOUBLE
 ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
   { return tex1Dfetch(pos_tex, i); }
 ucl_inline float fetch_q(const int& i, const float *q) 
   { return tex1Dfetch(q_tex, i); }
 #endif
 #endif
 
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
+#define MAX(A,B) ((A) > (B) ? (A) : (B))
+
+__kernel void kernel_energy(__global numtyp4 *x_, 
+                    __global numtyp2 *type2rhor_z2r, __global numtyp *type2frho,
+                    __global numtyp *rhor_spline, __global numtyp *frho_spline,
+                    __global int *dev_nbor, __global int *dev_packed,
+                    __global numtyp *fp_, 
+                    __global acctyp *engv, const int eflag, 
+                    const int vflag, const int inum, 
+                    const int nbor_pitch,
+                    const int ntypes, const numtyp cutforcesq, 
+                    const numtyp rdr, const numtyp rdrho,
+                    const int nrho, const int nr,
+                    const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+  
+  acctyp rho = (acctyp)0;
+  acctyp energy = (acctyp)0;
+   
+  if (ii<inum) {
+    __global int *nbor, *list_end;
+    int i, numj, n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,list_end,nbor);
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    int itype=ix.w;
+    
+    for ( ; nbor<list_end; nbor+=n_stride) {
+      int j=*nbor;
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+      
+      if (rsq<cutforcesq) {
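+        // locate the spline interval for r and accumulate the density
+        // contribution from neighbor j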
+        numtyp p = ucl_sqrt(rsq)*rdr + (numtyp)1.0;
+        int m=p;
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,(numtyp)1.0);
+        
+        int mtype = jtype*ntypes+itype;
+        int index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7;
+        numtyp coeff3 = rhor_spline[index+3];
+        numtyp coeff4 = rhor_spline[index+4];
+        numtyp coeff5 = rhor_spline[index+5];
+        numtyp coeff6 = rhor_spline[index+6];
+        rho += ((coeff3*p + coeff4)*p + coeff5)*p + coeff6;
+      }
+    } // for nbor
+    
+    // reduce to get the density at atom ii
+    
+    if (t_per_atom>1) {
+      __local acctyp red_acc[BLOCK_PAIR];                               
+      red_acc[tid]=rho;                                                 
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                      
+        if (offset < s)                                        
+          red_acc[tid] += red_acc[tid+s];
+      }                                                                    
+      rho=red_acc[tid];                                                 
+    }
+    
+    // calculate the embedded force for ii
+    if (offset==0) {
+      numtyp p = rho*rdrho + (numtyp)1.0;
+      int m=p;
+      m = MAX(1,MIN(m,nrho-1));
+      p -= m;
+      p = MIN(p,(numtyp)1.0);
+      
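+      // spline coefficients 0-2 evaluate the derivative F'(rho) (stored as
+      // fp); coefficients 3-6 evaluate the embedding energy F(rho)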
+      int index = type2frho[itype]*(nrho+1)*7+m*7;
+      numtyp coeff0 = frho_spline[index+0];
+      numtyp coeff1 = frho_spline[index+1];
+      numtyp coeff2 = frho_spline[index+2];
+      numtyp fp = (coeff0*p + coeff1)*p + coeff2;
+      fp_[ii]=fp;                       
+      
+      engv+=ii;  
+      if (eflag>0) {
+        numtyp coeff3 = frho_spline[index+3];
+        numtyp coeff4 = frho_spline[index+4];
+        numtyp coeff5 = frho_spline[index+5];
+        numtyp coeff6 = frho_spline[index+6];
+        energy = ((coeff3*p + coeff4)*p + coeff5)*p + coeff6;
+        *engv=energy;
+      }
+    }
+  } // if ii
+}
+
 
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
                           __global numtyp2 *type2rhor_z2r,
                           __global numtyp *rhor_spline, __global numtyp *z2r_spline,
                           __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
                           const int vflag, const int inum, const int nbor_pitch,
                           const int ntypes, const numtyp cutforcesq, 
-                          const numtyp rdr, const int nrhor, const int nz2r, const int nr,
+                          const numtyp rdr, const int nr,
                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0;
   f.y=(acctyp)0;
   f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp ifp=fetch_q(i,fp_);  //fp_[i];
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       numtyp jfp=fetch_q(j,fp_); //fp_[j];
       int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
       
       if (rsq<cutforcesq) {
         numtyp r = ucl_sqrt(rsq);
         numtyp p = r*rdr + (numtyp)1.0;
-        int m=__float2int_rd(p);
+        int m=p;
         m = MIN(m,nr-1);
         p -= m;
         p = MIN(p,(numtyp)1.0);
         
         int mtype,index;
         numtyp coeff0,coeff1,coeff2,coeff3,coeff4,coeff5,coeff6;
         
         mtype = itype*ntypes+jtype;
         index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7;
         coeff0 = rhor_spline[index+0];
         coeff1 = rhor_spline[index+1];
         coeff2 = rhor_spline[index+2];
         numtyp rhoip = (coeff0*p + coeff1)*p + coeff2;
         
         mtype = jtype*ntypes+itype;
         index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7;
         coeff0 = rhor_spline[index+0];
         coeff1 = rhor_spline[index+1];
         coeff2 = rhor_spline[index+2];
         numtyp rhojp = (coeff0*p + coeff1)*p + coeff2;
         
         mtype = itype*ntypes+jtype;
         index = type2rhor_z2r[mtype].y*(nr+1)*7+m*7;
         coeff0 = z2r_spline[index+0];
         coeff1 = z2r_spline[index+1];
         coeff2 = z2r_spline[index+2];
         coeff3 = z2r_spline[index+3];
         coeff4 = z2r_spline[index+4];
         coeff5 = z2r_spline[index+5];
         coeff6 = z2r_spline[index+6];
         
         numtyp z2p = (coeff0*p + coeff1)*p + coeff2;
         numtyp z2 = ((coeff3*p + coeff4)*p + coeff5)*p + coeff6;
 
         numtyp recip = (numtyp)1.0/r;
         numtyp phi = z2*recip;
         numtyp phip = z2p*recip - phi*recip;
         numtyp psip = ifp*rhojp + jfp*rhoip + phip;
         numtyp force = -psip*recip;
         
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           energy += phi;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
   
     } // for nbor
     store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 
 }
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
                           __global numtyp2 *type2rhor_z2r,
                           __global numtyp *rhor_spline, __global numtyp *z2r_spline,
                           __global int *dev_nbor, __global int *dev_packed, 
                           __global acctyp4 *ans, __global acctyp *engv, 
                           const int eflag, const int vflag, const int inum, 
                           const int nbor_pitch,
                           const numtyp cutforcesq, 
-                          const numtyp rdr, const int nrhor, const int nz2r, const int nr,
+                          const numtyp rdr, const int nr,
                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
   
   if (ii<inum) {
     __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp ifp=fetch_q(i,fp_); //fp_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       numtyp jfp=fetch_q(j,fp_); //fp_[j];
       int jtype=jx.w;
       
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
         
       if (rsq<cutforcesq) {
         numtyp r = ucl_sqrt(rsq);
         numtyp p = r*rdr + (numtyp)1.0;
-        int m=__float2int_rd(p);
+        int m=p;
         m = MIN(m,nr-1);
         p -= m;
         p = MIN(p,(numtyp)1.0);
         
         numtyp coeff0,coeff1,coeff2,coeff3,coeff4,coeff5,coeff6;
         int mtype,index;
         
         mtype = itype+jx.w;
         index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7;
         coeff0 = rhor_spline[index+0];
         coeff1 = rhor_spline[index+1];
         coeff2 = rhor_spline[index+2];
         numtyp rhoip = (coeff0*p + coeff1)*p + coeff2;
         
         mtype = jtype+ix.w;
         index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7;
         coeff0 = rhor_spline[index+0];
         coeff1 = rhor_spline[index+1];
         coeff2 = rhor_spline[index+2];
         numtyp rhojp = (coeff0*p + coeff1)*p + coeff2;
         
         mtype = itype+jx.w;
         index = type2rhor_z2r[mtype].y*(nr+1)*7+m*7;
         coeff0 = z2r_spline[index+0];
         coeff1 = z2r_spline[index+1];
         coeff2 = z2r_spline[index+2];
         coeff3 = z2r_spline[index+3];
         coeff4 = z2r_spline[index+4];
         coeff5 = z2r_spline[index+5];
         coeff6 = z2r_spline[index+6];
         
         numtyp z2p = (coeff0*p + coeff1)*p + coeff2;
         numtyp z2 = ((coeff3*p + coeff4)*p + coeff5)*p + coeff6;
 
         numtyp recip = (numtyp)1.0/r;
         numtyp phi = z2*recip;
         numtyp phip = z2p*recip - phi*recip;
         numtyp psip = ifp*rhojp + jfp*rhoip + phip;
         numtyp force = -psip*recip;
         
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
 
         if (eflag>0) {
           energy += phi;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
     store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                   ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h
index 01a159088..4da240079 100644
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@@ -1,87 +1,275 @@
 /***************************************************************************
                               lal_eam.h
                              -------------------
                       W. Michael Brown, Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the eam pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_EAM_H
 #define LAL_EAM_H
 
-#include "lal_base_charge.h"
+#include "lal_device.h"
+#include "lal_balance.h"
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
 
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
-class EAM : public BaseCharge<numtyp, acctyp> {
+class EAM {
  public:
   EAM();
   ~EAM();
   
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const char *pair_program);
+                  
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double host_cutforcesq,
-          int **host_type2rhor, int **host_type2z2r,
+          int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
           double ***host_rhor_spline, double ***host_z2r_spline,
-          double rdr, int nrhor, int nz2r, int nr,
+          double ***host_frho_spline,
+          double rdr, double rdrho, int nrhor, int nrho, 
+          int nz2r, int nfrho, int nr,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *_screen);
   
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+  
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->dev_x,4);
+      q_tex.bind_float(atom->dev_q,1);
+    }
+    
+    // resize rho
+    dev_fp.clear();
+    host_fp.clear();
+    
+    bool cpuview=false;
+    if (ucl_device->device_type()==UCL_CPU)
+      cpuview=true;
+    
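+    // allocate the per-atom fp buffers with 10% padding to limit reallocations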
+    int _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
+    host_fp.alloc(_max_atoms,*ucl_device);
+    if (cpuview)
+      dev_fp.view(host_fp);
+    else 
+      dev_fp.alloc(_max_atoms,*ucl_device,UCL_WRITE_ONLY);
+    
+    ans->resize(inum,success);
+  }
+  
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param max_nbors current maximum number of neighbors
+    * \note olist_size=total number of local particles **/
+  inline void resize_local(const int inum, const int max_nbors, bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to be copied to host
+    * \param max_nbors current maximum number of neighbors
+    * \note host_inum is 0 if the host is performing neighboring
+    * \note inum+host_inum=total number of local particles
+    * \note olist_size=0 **/
+  inline void resize_local(const int inum, const int host_inum, 
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
   
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (device->time_device()) {
+      nbor->acc_timers();
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
+    }
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
+                       int **special, bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd);
+
+  /// Pair loop with device neighboring
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
+                int **special, const bool eflag, const bool vflag, 
+                const bool eatom, const bool vatom, int &host_start, 
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd, int inum);
+  
+  /// Density/embedding-energy loop with host neighboring
+  void compute_energy(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd, double* evdwl);
+               
+  /// Density/embedding-energy loop with device neighboring
+  int** compute_energy(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
+                int **special, const bool eflag, const bool vflag, 
+                const bool eatom, const bool vatom, int &host_start, 
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd, double* evdwl, int &inum);
+
+// -------------------------- DEVICE DATA ------------------------- 
+
+  /// Device Properties and Atom and Neighbor storage
+  Device<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_pair;
+
+  /// Host device load balancer
+  Balance<numtyp,acctyp> hd_balancer;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  Atom<numtyp,acctyp> *atom;
+
+
+  // ------------------------ FORCE/ENERGY/DENSITY DATA --------------
+
+  Answer<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor data
+  Neighbor *nbor;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program;
+  UCL_Kernel k_pair_fast, k_pair, k_energy;
+  inline int block_size() { return _block_size; }
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+  UCL_Texture q_tex;
+
   // --------------------------- TYPE DATA --------------------------
     
   UCL_D_Vec<numtyp2> type2rhor_z2r;
   
-  UCL_D_Vec<numtyp> rhor_spline;
+  UCL_D_Vec<numtyp> type2frho;
   
-  UCL_D_Vec<numtyp> z2r_spline;
+  UCL_D_Vec<numtyp> frho_spline, rhor_spline, z2r_spline;
   
-  numtyp _cutforcesq,_rdr;
+  numtyp _cutforcesq,_rdr,_rdrho;
   
-  int _nrhor,_nz2r,_nr;
+  int _nfrho,_nrhor,_nrho,_nz2r,_nr;
   
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
   
   /// Number of atom types 
   int _ntypes;
+
+  /// Per-atom arrays
+  UCL_H_Vec<numtyp> host_fp;
+  UCL_D_Vec<numtyp> dev_fp;
   
-  // --------------------------- TEXTURES -----------------------------
-  UCL_Texture fp_tex;
-  
- private:
+protected:
+  bool _compiled;
+  int _block_size, _block_bio_size, _threads_per_atom;
+  double  _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
+
+  void compile_kernels(UCL_Device &dev, const char *pair_string);  
+
   bool _allocated;
   void loop(const bool _eflag, const bool _vflag);
+  void energy(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif
 
diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp
index d2b178774..521627dfc 100644
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@@ -1,132 +1,161 @@
 // **************************************************************************
-//                           lal_lj_coul_fsww_ext.cpp
+//                               lal_eam_ext.cpp
 //                             -------------------
 //                     W. Michael Brown, Trung Dac Nguyen (ORNL)
 //
-//  Class for acceleration of the lj/cut/coul/fsww/cut pair style
+//  Class for acceleration of the eam pair style
 //
 // __________________________________________________________________________
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
 //    begin                : 
 //    email                : brownw@ornl.gov nguyentd@ornl.gov
 // ***************************************************************************/
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "lal_eam.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
 
 static EAM<PRECISION,ACC_PRECISION> EAMMF;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int eam_gpu_init(const int ntypes, double host_cutforcesq, 
-                 int **host_type2rhor, int **host_type2z2r,
+                 int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
                  double ***host_rhor_spline, double ***host_z2r_spline,
-                 double rdr, int nrhor, int nz2r, int nr, 
+                 double ***host_frho_spline,
+                 double rdr, double rdrho, int nrhor, 
+                 int nrho, int nz2r, int nfrho, int nr, 
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
                  int &gpu_mode, FILE *screen) {
   EAMMF.clear();
   gpu_mode=EAMMF.device->gpu_mode();
   double gpu_split=EAMMF.device->particle_split();
   int first_gpu=EAMMF.device->first_device();
   int last_gpu=EAMMF.device->last_device();
   int world_me=EAMMF.device->world_me();
   int gpu_rank=EAMMF.device->gpu_rank();
   int procs_per_gpu=EAMMF.device->procs_per_gpu();
 
   EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu);
 
   bool message=false;
   if (EAMMF.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   int init_ok=0;
   if (world_me==0)
     init_ok=EAMMF.init(ntypes, host_cutforcesq,
-                        host_type2rhor, host_type2z2r,
-                        host_rhor_spline, host_z2r_spline,
-                        rdr, nrhor, nz2r, nr, 
-                        nlocal, nall, 300, maxspecial,
-                        cell_size, gpu_split, screen);
+                    host_type2rhor, host_type2z2r, host_type2frho,
+                    host_rhor_spline, host_z2r_spline,
+                    host_frho_spline,
+                    rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, 
+                    nlocal, nall, 300, maxspecial,
+                    cell_size, gpu_split, screen);
 
   EAMMF.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
       init_ok=EAMMF.init(ntypes, host_cutforcesq,
-                        host_type2rhor, host_type2z2r,
-                        host_rhor_spline, host_z2r_spline,
-                        rdr, nrhor, nz2r, nr, 
-                        nlocal, nall, 300, maxspecial,
-                        cell_size, gpu_split, screen);
+                    host_type2rhor, host_type2z2r, host_type2frho,
+                    host_rhor_spline, host_z2r_spline,
+                    host_frho_spline,
+                    rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, 
+                    nlocal, nall, 300, maxspecial,
+                    cell_size, gpu_split, screen);
 
     EAMMF.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
 
   if (init_ok==0)
     EAMMF.estimate_gpu_overhead();
   return init_ok;
 }
 
 void eam_gpu_clear() {
   EAMMF.clear();
 }
 
 int ** eam_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial, 
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
                          bool &success, double *host_fp, double *boxlo,
-                         double *prd) {
+                         double *prd, int inum) {
   return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
-                        host_fp, boxlo, prd);
+                        host_fp, boxlo, prd, inum);
 }  
 			
 void eam_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success, double *host_fp,
                       const int nlocal, double *boxlo, double *prd) {
   EAMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 host_fp,nlocal,boxlo,prd);
 }
 
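+// ---------------------------------------------------------------------------
+// Compute per-atom densities and embedding energies, neighboring on the device
+// ---------------------------------------------------------------------------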
+int ** eam_gpu_compute_energy_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial, 
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum,  const double cpu_time,
+                         bool &success, double *host_fp, double *boxlo,
+                         double *prd, double *eng_vdwl, int &inum) {
+  return EAMMF.compute_energy(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_fp, boxlo, prd, eng_vdwl, inum);
+}  
+
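+// ---------------------------------------------------------------------------
+// Compute per-atom densities and embedding energies with host neighbor lists
+// ---------------------------------------------------------------------------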
+void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_fp,
+                      const int nlocal, double *boxlo, double *prd, double* evdwl) {
+  EAMMF.compute_energy(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                host_fp,nlocal,boxlo,prd,evdwl);
+}
+
 double eam_gpu_bytes() {
   return EAMMF.host_memory_usage();
 }
 
 
diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp
index 62eb38e4d..cc3ecff10 100644
--- a/src/GPU/pair_eam_gpu.cpp
+++ b/src/GPU/pair_eam_gpu.cpp
@@ -1,387 +1,429 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL)
 ------------------------------------------------------------------------- */
 
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_eam_gpu.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "domain.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 #include "neigh_request.h"
 #include "gpu_extra.h"
 
 using namespace LAMMPS_NS;
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 #define MAXLINE 1024
 
 // External functions from cuda library for atom decomposition
 
 int eam_gpu_init(const int ntypes, double host_cutforcesq,
                  int **host_type2rhor, int **host_type2z2r,
+                 int *host_type2frho, 
                  double ***host_rhor_spline, double ***host_z2r_spline,
-                 double rdr, int nrhor, int nz2r, int nr,
+                 double ***host_frho_spline,
+                 double rdr, double rdrho, int nrhor, int nrho, 
+                 int nz2r, int nfrho, int nr,
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
                  int &gpu_mode, FILE *screen);
 void eam_gpu_clear();
 int** eam_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, int *tag, int **nspecial, 
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,  const double cpu_time,
                          bool &success, double *host_fp, double *boxlo,
-                         double *prd);
+                         double *prd, int inum);
 void eam_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success, double *host_fp,
                       const int nlocal, double *boxlo, double *prd);
+int** eam_gpu_compute_energy_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial, 
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum,  const double cpu_time,
+                         bool &success, double *host_fp, double *boxlo,
+                         double *prd, double *evdwl, int &inum);
+void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_fp,
+                      const int nlocal, double *boxlo, double *prd, double *evdwl);
 double eam_gpu_bytes();
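+
+// PairEAMGPU::compute() (below) evaluates EAM in two device passes with a
+// communication step in between:
+//   1) eam_gpu_compute_energy[_n]: electron density and embedding energy,
+//      filling fp, the derivative of the embedding function
+//   2) comm->forward_comm_pair:    exchange fp with neighboring procs
+//   3) eam_gpu_compute[_n]:        pair forces, which need fp[i] and fp[j]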
 
 /* ---------------------------------------------------------------------- */
 
 PairEAMGPU::PairEAMGPU(LAMMPS *lmp) : PairEAM(lmp), gpu_mode(GPU_FORCE)
 {
   respa_enable = 0;
   cpu_time = 0.0;
 }
 
 /* ----------------------------------------------------------------------
    check if allocated, since class can be destructed when incomplete
 ------------------------------------------------------------------------- */
 
 PairEAMGPU::~PairEAMGPU()
 {
   eam_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairEAMGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
   return bytes + eam_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMGPU::compute(int eflag, int vflag)
 {
-  int i,j,ii,jj,m,inum,jnum,itype,jtype;
-  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
-  double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi;
-  double *coeff;
-  int *ilist,*jlist,*numneigh,**firstneigh;
+  int i,m;
+  double evdwl;
 
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
  
   // grow energy and fp arrays if necessary
   // need to be atom->nmax in length
     
   if (atom->nmax > nmax) {
     memory->destroy(rho);
     memory->destroy(fp);
     nmax = atom->nmax;
     memory->create(rho,nmax,"pair:rho");
     memory->create(fp,nmax,"pair:fp");
   }
-  
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
+
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
-  
-  inum = list->inum;
-  ilist = list->ilist;
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
-  
+
   // zero out density
 
   if (newton_pair) {
     m = nlocal + atom->nghost;
     for (i = 0; i < m; i++) rho[i] = 0.0;
   } else for (i = 0; i < nlocal; i++) rho[i] = 0.0;
- 
+
+  // compute density on each atom on GPU
+
+  int nall = atom->nlocal + atom->nghost;  
+  int inum, host_start, inum_dev;
+  
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh; 
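+  // when the device builds neighbor lists (gpu_mode != GPU_FORCE), the
+  // compute_energy_n variant returns ilist/numneigh/firstneigh from the GPU;
+  // otherwise the host-built full list is reused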
+  if (gpu_mode != GPU_FORCE) { 
+    inum = atom->nlocal;
+    
+    firstneigh = eam_gpu_compute_energy_n(neighbor->ago, inum, nall, atom->x,
+             atom->type, domain->sublo, domain->subhi,
+             atom->tag, atom->nspecial, atom->special,
+             eflag, vflag, eflag_atom, vflag_atom,
+             host_start, &ilist, &numneigh, cpu_time,
+             success, fp, domain->boxlo, 
+             domain->prd, &eng_vdwl, inum_dev);
+  } else { // gpu_mode == GPU_FORCE
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    eam_gpu_compute_energy(neighbor->ago, inum, nall, atom->x, atom->type,
+		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+		    vflag_atom, host_start, cpu_time, success, fp,
+		    atom->nlocal, domain->boxlo, domain->prd, &eng_vdwl);
+  }
+    
+  if (!success)
+    error->one(FLERR,"Out of memory on GPGPU");
+
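+  // atoms [host_start,inum) are left to the host; finish their density and
+  // embedding energy on the CPU while the device result is pending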
+  if (host_start<inum) {
+    cpu_time = MPI_Wtime();
+    cpu_compute_energy(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time = MPI_Wtime() - cpu_time;
+  }
+  
+  // communicate derivative of embedding function
+
+  comm->forward_comm_pair(this);
+    
+  // compute forces on each atom on GPU
+
+  if (gpu_mode != GPU_FORCE) {
+    inum = atom->nlocal;
+    firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+				   atom->type, domain->sublo, domain->subhi,
+				   atom->tag, atom->nspecial, atom->special,
+				   eflag, vflag, eflag_atom, vflag_atom,
+				   host_start, &ilist, &numneigh, cpu_time,
+				   success, fp, domain->boxlo, 
+				   domain->prd, inum_dev);
+  } else { // gpu_mode == GPU_FORCE
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    eam_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+		    vflag_atom, host_start, cpu_time, success, fp,
+		    atom->nlocal, domain->boxlo, domain->prd);
+  }
+  if (!success)
+    error->one(FLERR,"Out of memory on GPGPU");
+
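+  // complete the force loop on the CPU for atoms left to the host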
+  if (host_start<inum) {
+    double cpu_time2 = MPI_Wtime();
+    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time += MPI_Wtime() - cpu_time2;
+  }
+  
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
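+/* ----------------------------------------------------------------------
+   host-side density and embedding energy for atoms start..inum-1:
+   fills fp and, when energy is requested, tallies the embedding energy
+------------------------------------------------------------------------- */
+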
+void PairEAMGPU::cpu_compute_energy(int start, int inum, int eflag, int vflag,
+				      int *ilist, int *numneigh,
+				      int **firstneigh) 
+{
+  int i,j,ii,jj,m,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz;
+  double rsq,p,phi;
+  double *coeff;
+  int *jlist;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int newton_pair = force->newton_pair;
   
   // rho = density at each atom
   // loop over neighbors of my atoms
 
-  for (ii = 0; ii < inum; ii++) {
+  for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
 
       if (rsq < cutforcesq) {
 	jtype = type[j];
 	p = sqrt(rsq)*rdr + 1.0;
 	m = static_cast<int> (p);
 	m = MIN(m,nr-1);
 	p -= m;
 	p = MIN(p,1.0);
 	coeff = rhor_spline[type2rhor[jtype][itype]][m];
 	rho[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
-	if (newton_pair || j < nlocal && gpu_mode != GPU_FORCE) {
-	  coeff = rhor_spline[type2rhor[itype][jtype]][m];
-	  rho[j] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
-	}
-      }
-      
+      }
     }
   }
 
   // communicate and sum densities
 
   if (newton_pair) comm->reverse_comm_pair(this);
   
   
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
   
-  for (ii = 0; ii < inum; ii++) {
+  for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     p = rho[i]*rdrho + 1.0;    
     m = static_cast<int> (p);
     m = MAX(1,MIN(m,nrho-1));
     p -= m;
     p = MIN(p,1.0);
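+    // frho_spline packs derivative coefficients in coeff[0..2] and value
+    // coefficients in coeff[3..6]: fp = F'(rho), phi = F(rho)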
     
     coeff = frho_spline[type2frho[type[i]]][m];
     fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
     if (eflag) {
       phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
       if (eflag_global) eng_vdwl += phi;
       if (eflag_atom) eatom[i] += phi;
     
     }
   }
-
-  // communicate derivative of embedding function
-
-  comm->forward_comm_pair(this);
-  
-//  printf("\nBefore computing force: eng_vdwl = %f\n", eng_vdwl);
-  
-  // compute forces on each atom on GPU
-  {
-  int nall = atom->nlocal + atom->nghost;  
-  int inum, host_start;
-  
-  bool success = true;
-  int *ilist, *numneigh, **firstneigh;  
-  if (gpu_mode != GPU_FORCE) {
-    inum = atom->nlocal;
-    firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
-				   atom->type, domain->sublo, domain->subhi,
-				   atom->tag, atom->nspecial, atom->special,
-				   eflag, vflag, eflag_atom, vflag_atom,
-				   host_start, &ilist, &numneigh, cpu_time,
-				   success, fp, domain->boxlo, 
-				   domain->prd);
-  } else { // gpu_mode == GPU_FORCE
-    inum = list->inum;
-    ilist = list->ilist;
-    numneigh = list->numneigh;
-    firstneigh = list->firstneigh;
-    eam_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
-		    ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
-		    vflag_atom, host_start, cpu_time, success, fp,
-		    atom->nlocal, domain->boxlo, domain->prd);
-  }
-  if (!success)
-    error->one(FLERR,"Out of memory on GPGPU");
-
-  if (host_start<inum) {
-    cpu_time = MPI_Wtime();
-    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
-    cpu_time = MPI_Wtime() - cpu_time;
-  }
-  
-  }
-
-//  printf("\nAfter computing force: eng_vdwl = %f\n", eng_vdwl);
-  
-  if (vflag_fdotr) virial_fdotr_compute();
-
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairEAMGPU::cpu_compute(int start, int inum, int eflag, int vflag,
 				      int *ilist, int *numneigh,
 				      int **firstneigh)
 {
   int i,j,ii,jj,m,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
   double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi;
   double *coeff;
   int *jlist;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
   
   // compute forces on each atom
   // loop over neighbors of my atoms
 
   for (ii = start; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
 
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
 
       if (rsq < cutforcesq) {
 	jtype = type[j];
 	r = sqrt(rsq);
 	p = r*rdr + 1.0;
 	m = static_cast<int> (p);
 	m = MIN(m,nr-1);
 	p -= m;
 	p = MIN(p,1.0);
 
 	// rhoip = derivative of (density at atom j due to atom i)
 	// rhojp = derivative of (density at atom i due to atom j)
 	// phi = pair potential energy
 	// phip = phi'
 	// z2 = phi * r
 	// z2p = (phi * r)' = (phi' r) + phi
 	// psip needs both fp[i] and fp[j] terms since r_ij appears in two
 	//   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
 	//   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
 
 	coeff = rhor_spline[type2rhor[itype][jtype]][m];
 	rhoip = (coeff[0]*p + coeff[1])*p + coeff[2];
 	coeff = rhor_spline[type2rhor[jtype][itype]][m];
 	rhojp = (coeff[0]*p + coeff[1])*p + coeff[2];
 	coeff = z2r_spline[type2z2r[itype][jtype]][m];
 	z2p = (coeff[0]*p + coeff[1])*p + coeff[2];
 	z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
 
 	recip = 1.0/r;
 	phi = z2*recip;
 	phip = z2p*recip - phi*recip;
 	psip = fp[i]*rhojp + fp[j]*rhoip + phip;
 	fpair = -psip*recip;
 
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 
 	if (eflag) evdwl = phi;
 	if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz);
       }
     }
   }
 
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairEAMGPU::init_style()
 {
   if (force->newton_pair) 
     error->all(FLERR,"Cannot use newton pair with eam/gpu pair style");
   
   if (!allocated) error->all(FLERR,"Memory not allocated for eam/gpu pair style");
   
   // convert read-in file(s) to arrays and spline them
 
   file2array();
   array2spline();
   
   // Repeat cutsq calculation because done after call to init_style
   double maxcut = -1.0;
   double cut;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cut *= cut;
         if (cut > maxcut)
           maxcut = cut;
         cutsq[i][j] = cutsq[j][i] = cut;
       } else
         cutsq[i][j] = cutsq[j][i] = 0.0;
     }
   }
   double cell_size = sqrt(maxcut) + neighbor->skin;
   
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
   int success = eam_gpu_init(atom->ntypes+1, cutforcesq,
-          type2rhor, type2z2r, rhor_spline, z2r_spline,
-          rdr, nrhor, nz2r, nr, atom->nlocal, 
+          type2rhor, type2z2r, type2frho,
+          rhor_spline, z2r_spline, frho_spline,
+          rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, 
           atom->nlocal+atom->nghost, 300, maxspecial,
           cell_size, gpu_mode, screen);
   GPU_EXTRA::check_flag(success,error,world);
   
-  if (gpu_mode == GPU_FORCE) {
-    int irequest = neighbor->request(this);
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->full = 1;
-  } else neighbor->request(this);
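+  // a full neighbor list is always required: forces are accumulated only
+  // on atom i, and newton_pair is disallowed above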
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
 }
 
 
 
 
diff --git a/src/GPU/pair_eam_gpu.h b/src/GPU/pair_eam_gpu.h
index 0c4f7b580..b7eaffcfa 100644
--- a/src/GPU/pair_eam_gpu.h
+++ b/src/GPU/pair_eam_gpu.h
@@ -1,50 +1,51 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef PAIR_CLASS
 
 PairStyle(eam/gpu,PairEAMGPU)
 
 #else
 
 #ifndef LMP_PAIR_EAM_GPU_H
 #define LMP_PAIR_EAM_GPU_H
 
 #include "stdio.h"
 #include "pair_eam.h"
 
 namespace LAMMPS_NS {
 
 class PairEAMGPU : public PairEAM {
  public:
 
   PairEAMGPU(class LAMMPS *);
   virtual ~PairEAMGPU();
   void cpu_compute(int, int, int, int, int *, int *, int **);
+  void cpu_compute_energy(int, int, int, int, int *, int *, int **);
   void compute(int, int);
   void init_style();
   double memory_usage();
 
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
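+ // gpu_mode selects neighboring: GPU_FORCE reuses the host-built neighbor
+ // list in compute(); the other modes have the device build it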
 
  private:
   int gpu_mode;
   double cpu_time;
   int *gpulist;
   
 };
 
 }
 
 #endif
 #endif