diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 798cb3ae3..4086ffc64 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -1,424 +1,497 @@
 /* ----------------------------------------------------------------------
    LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifndef PPPM_GPU_KERNEL
 #define PPPM_GPU_KERNEL
 
 #define MAX_STENCIL 8
 #define BLOCK_1D 64
 
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
 #define numtyp4 double4
 #define acctyp double
 #define acctyp4 double4
 #endif
 
 #ifdef _SINGLE_DOUBLE
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp double
 #define acctyp4 double4
 #endif
 
 #ifndef numtyp
 #define numtyp float
 #define numtyp2 float2
 #define numtyp4 float4
 #define acctyp float
 #define acctyp4 float4
 #endif
 
 #ifdef NV_KERNEL
 
 #include "geryon/ucl_nv_kernel.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 
 #ifdef _DOUBLE_DOUBLE
 __inline double4 fetch_pos(const int& i, const double4 *pos)
 {
   return pos[i];
 }
 __inline double fetch_q(const int& i, const double *q)
 {
   return q[i];
 }
 
 __device__ inline void atom_add_float(double* address, double val) {
   double old=*address, assumed;
   do { 
     assumed=old;
     old=__longlong_as_double( atomicCAS((unsigned long long int*)address, 
                                           __double_as_longlong(assumed),
                                           __double_as_longlong(val+assumed)));
   } while (assumed!=old); 
 }
 
 #else
 __inline float4 fetch_pos(const int& i, const float4 *pos)
 {
   return tex1Dfetch(pos_tex, i);
 }
 __inline float fetch_q(const int& i, const float *q)
 {
   return tex1Dfetch(q_tex, i);
 }
 
 #if (__CUDA_ARCH__ < 200)
 __device__ inline void atom_add_float(float *address, float val) {
   float old=*address, assumed;
   do { 
     assumed=old;
     old=__int_as_float( atomicCAS((int *)address, __float_as_int(assumed),
                         __float_as_int(val+assumed)));
   } while (assumed != old); 
 }
 #else
 #define atom_add_float atomicAdd
 #endif
 
 #endif
 
 #else
 
 #pragma OPENCL EXTENSION cl_khr_fp64: enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 #define GLOBAL_ID_X get_global_id(0)
 #define THREAD_ID_X get_local_id(0)
 #define BLOCK_ID_X get_group_id(0)
 #define BLOCK_SIZE_X get_local_size(0)
 #define GLOBAL_SIZE_X get_global_size(0)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
 #define __inline inline
 
 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
 
 #endif
 
 __kernel void particle_map(__global numtyp4 *x_, const int nlocal, 
                            __global int *counts, __global int *ans, 
                            const numtyp b_lo_x, const numtyp b_lo_y,
                            const numtyp b_lo_z, const numtyp delxinv,
                            const numtyp delyinv, const numtyp delzinv,
                            const int nlocal_x, const int nlocal_y,
                            const int nlocal_z, const int atom_stride,
                            const int max_atoms, __global int *error) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
 
   // Resequence the atom indices to avoid collisions during atomic ops
   int nthreads=GLOBAL_SIZE_X;
   ii=mul24(ii,BLOCK_1D);
   ii-=int(ii/nthreads)*(nthreads-1);
 
   int nx,ny,nz;
   numtyp tx,ty,tz;
 
   if (ii<nlocal) {
     numtyp4 p=fetch_pos(ii,x_);
 
     tx=(p.x-b_lo_x)*delxinv;
     nx=int(tx);
     ty=(p.y-b_lo_y)*delyinv;
     ny=int(ty);
     tz=(p.z-b_lo_z)*delzinv;
     nz=int(tz);
 
     if (tx<0 || ty<0 || tz<0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
       *error=1;
     else {
       int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
       int old=atom_add(counts+i, 1);
       if (old==max_atoms) {
         *error=2;
         atom_add(counts+i,-1);
       } else
         ans[atom_stride*old+i]=ii;
     }
   }
 }
 
 __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                        __global int *counts, __global int *atoms,
                        __global numtyp *brick, __global numtyp *_rho_coeff,
                        const int atom_stride, const int npts_x,
                        const int npts_yx, const int nlocal_x,
                        const int nlocal_y, const int nlocal_z,
                        const int x_threads, const numtyp b_lo_x,
                        const numtyp b_lo_y, const numtyp b_lo_z,
                        const numtyp delxinv, const numtyp delyinv,
                        const numtyp delzinv, const int order, const int order2,
                        const numtyp delvolinv) {
   __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
 
   int nx=THREAD_ID_X;
   int ny=THREAD_ID_Y;
   if (nx<order && ny<order) {
     int ri=mul24(nx,order)+ny;
     rho_coeff[ri]=_rho_coeff[ri];
   }
   __syncthreads();
   
   nx+=mul24(BLOCK_ID_X,BLOCK_SIZE_X);
   ny+=mul24(BLOCK_ID_Y,BLOCK_SIZE_Y);
 
   // Get the z-block we are working on
   int z_block=nx/x_threads;
   nx=nx%x_threads;
   int nz=mul24(z_block,8);
   int z_stop=nz+8;
   if (z_stop>nlocal_z)
     z_stop=nlocal_z;
   
   if (nx<nlocal_x && ny<nlocal_y) {
     int z_stride=mul24(nlocal_x,nlocal_y);
     int z_pos=mul24(nz,z_stride)+mul24(ny,nlocal_x)+nx;
     for ( ; nz<z_stop; nz++) {
       int natoms=counts[z_pos];
       for (int row=0; row<natoms; row++) {
         int atom=atoms[mul24(atom_stride,row)+z_pos];
         numtyp4 p=fetch_pos(atom,x_);
         numtyp z0=delvolinv*fetch_q(atom,q_);
         
         numtyp dx=nx-(p.x-b_lo_x)*delxinv;
         numtyp dy=ny-(p.y-b_lo_y)*delyinv;
         numtyp dz=nz-(p.z-b_lo_z)*delzinv;
 
         numtyp rho1d[2][MAX_STENCIL];
         for (int k=0; k<order; k++) {
           rho1d[0][k]=(numtyp)0.0;
           rho1d[1][k]=(numtyp)0.0;
           for (int l=order2+k; l>=k; l-=order) {
             rho1d[0][k]=rho_coeff[l]+rho1d[0][k]*dx;
             rho1d[1][k]=rho_coeff[l]+rho1d[1][k]*dy;
           }
         }
         
         int mz=mul24(nz,npts_yx)+nx;
         for (int n=0; n<order; n++) {
           numtyp rho1d_2=(numtyp)0.0;
           for (int k=order2+n; k>=n; k-=order)
             rho1d_2=rho_coeff[k]+rho1d_2*dz;
           numtyp y0=z0*rho1d_2;
           int my=mz+mul24(ny,npts_x);
           for (int m=0; m<order; m++) {
 	          numtyp x0=y0*rho1d[1][m];
 	          for (int l=0; l<order; l++)
               atom_add_float(brick+my+l,x0*rho1d[0][l]);
 	          my+=npts_x;
 	        }
 	        mz+=npts_yx;
 	      }
 	    }
 	    z_pos+=z_stride;
 	  }
 	}
 }
 
 /* --------------------------- */
 
 __kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_,
                         __global int *counts, __global int *atoms,
                         __global numtyp *brick, __global numtyp *_rho_coeff,
                         const int atom_stride, const int npts_x,
                         const int npts_yx, const int npts_z, const int nlocal_x,
                         const int nlocal_y, const int nlocal_z,
                         const int order_m_1, const numtyp b_lo_x,
                         const numtyp b_lo_y, const numtyp b_lo_z,
                         const numtyp delxinv, const numtyp delyinv,
                         const numtyp delzinv, const int order, const int order2,
                         const numtyp delvolinv) {
   __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
   __local numtyp front[BLOCK_1D+MAX_STENCIL];
   __local int nx,ny,x_start,y_start,x_stop,y_stop;
   __local int z_stride, z_local_stride;
 
   int tx=THREAD_ID_X;
   int tx_halo=BLOCK_1D+tx;
   if (tx<order2+order)
     rho_coeff[tx]=_rho_coeff[tx];
     
   if (tx==0) {
     nx=BLOCK_ID_X;
     ny=BLOCK_ID_Y;
     x_start=0;
     y_start=0;
     x_stop=order;
     y_stop=order;
     if (nx<order_m_1)
       x_start=order_m_1-nx;
     if (ny<order_m_1)
       y_start=order_m_1-ny;
     if (nx>=nlocal_x)
       x_stop-=nx-nlocal_x+1;
     if (ny>=nlocal_y)
       y_stop-=ny-nlocal_y+1;
     z_stride=mul24(npts_yx,BLOCK_1D);
     z_local_stride=mul24(mul24(nlocal_x,nlocal_y),BLOCK_1D);
   }
   
   if (tx<order) 
     front[tx_halo]=(numtyp)0.0;
     
   __syncthreads();
 
   numtyp ans[MAX_STENCIL];
   int loop_count=npts_z/BLOCK_1D+1;
   int nz=tx;
   int pt=mul24(nz,npts_yx)+mul24(ny,npts_x)+nx;
   int z_local=mul24(mul24(nz,nlocal_x),nlocal_y);
   for (int i=0 ; i<loop_count; i++) {
     for (int n=0; n<order; n++)
       ans[n]=(numtyp)0.0;
     if (nz<nlocal_z) {
       for (int m=y_start; m<y_stop; m++) {
         int y_pos=ny+m-order_m_1;
         int y_local=mul24(y_pos,nlocal_x);
         for (int l=x_start; l<x_stop; l++) {
           int x_pos=nx+l-order_m_1;
           int pos=z_local+y_local+x_pos;
           int natoms=mul24(counts[pos],atom_stride);
           for (int row=pos; row<natoms; row+=atom_stride) {
             int atom=atoms[row];
             numtyp4 p=fetch_pos(atom,x_);
             numtyp z0=delvolinv*fetch_q(atom,q_);
       
             numtyp dx=x_pos-(p.x-b_lo_x)*delxinv;
             numtyp dy=y_pos-(p.y-b_lo_y)*delyinv;
             numtyp dz=nz-(p.z-b_lo_z)*delzinv;
             
             numtyp rho1d_1=(numtyp)0.0;
             numtyp rho1d_0=(numtyp)0.0;
             for (int k=order2+order-1; k > -1; k-=order) {
               rho1d_1=rho_coeff[k-m]+rho1d_1*dy;
               rho1d_0=rho_coeff[k-l]+rho1d_0*dx;
             }
             z0*=rho1d_1*rho1d_0;
 
             for (int n=0; n<order; n++) {
               numtyp rho1d_2=(numtyp)0.0;
               for (int k=order2+n; k>=n; k-=order)
                 rho1d_2=rho_coeff[k]+rho1d_2*dz;
               ans[n]+=z0*rho1d_2;
             }
           }
         }
       }
     }
     
     __syncthreads();
     if (tx<order) {
       front[tx]=front[tx_halo];
       front[tx_halo]=(numtyp)0.0;
     } else 
       front[tx]=(numtyp)0.0;
     
     for (int n=0; n<order; n++) {
       front[tx+n]+=ans[n];
       __syncthreads();
     }
 
     if (nz<npts_z)
       brick[pt]=front[tx];
     nz+=BLOCK_1D;
     pt+=z_stride;
     z_local+=z_local_stride;
   }
 }
 
 /* --------------------------- */
 
 __kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_,
                         const int nlocal, __global numtyp *brick,
                         __global numtyp *_rho_coeff, const int npts_x,
                         const int npts_yx, const int nlocal_x,
                         const int nlocal_y, const int nlocal_z,
                         const numtyp b_lo_x, const numtyp b_lo_y,
                         const numtyp b_lo_z, const numtyp delxinv,
                         const numtyp delyinv, const numtyp delzinv,
                         const int order, const int order2,
                         const numtyp delvolinv, __global int *error) {
   __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
   int ii=THREAD_ID_X;
   if (ii<order2+order)
     rho_coeff[ii]=_rho_coeff[ii];
   __syncthreads();
   
   ii+=BLOCK_ID_X*BLOCK_SIZE_X;
   
   // Resequence the atom indices to avoid collisions during atomic ops
   int nthreads=GLOBAL_SIZE_X;
   ii=mul24(ii,BLOCK_1D);
   ii-=int(ii/nthreads)*(nthreads-1);
 
   int nx,ny,nz;
   numtyp tx,ty,tz;
 
   if (ii<nlocal) {
     numtyp4 p=fetch_pos(ii,x_);
 
     tx=(p.x-b_lo_x)*delxinv;
     nx=int(tx);
     ty=(p.y-b_lo_y)*delyinv;
     ny=int(ty);
     tz=(p.z-b_lo_z)*delzinv;
     nz=int(tz);
 
     if (tx<0 || ty<0 || tz<0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
       *error=1;
     else {
       numtyp z0=delvolinv*fetch_q(ii,q_);
         
       numtyp dx=nx+(numtyp)0.5-tx;
       numtyp dy=ny+(numtyp)0.5-ty;
       numtyp dz=nz+(numtyp)0.5-tz;
 
       numtyp rho1d[2][MAX_STENCIL];
       for (int k=0; k<order; k++) {
         rho1d[0][k]=(numtyp)0.0;
         rho1d[1][k]=(numtyp)0.0;
         for (int l=order2+k; l>=k; l-=order) {
           rho1d[0][k]=rho_coeff[l]+rho1d[0][k]*dx;
           rho1d[1][k]=rho_coeff[l]+rho1d[1][k]*dy;
         }
       }
         
       int mz=mul24(nz,npts_yx)+nx;
       for (int n=0; n<order; n++) {
         numtyp rho1d_2=(numtyp)0.0;
         for (int k=order2+n; k>=n; k-=order)
           rho1d_2=rho_coeff[k]+rho1d_2*dz;
         numtyp y0=z0*rho1d_2;
         int my=mz+mul24(ny,npts_x);
         for (int m=0; m<order; m++) {
           numtyp x0=y0*rho1d[1][m];
 	        for (int l=0; l<order; l++)
             atom_add_float(brick+my+l,x0*rho1d[0][l]);
           my+=npts_x;
         }
         mz+=npts_yx;
 	    }
 	  }
 	}
 }
 
+__kernel void field_force(__global numtyp4 *x_, __global numtyp *q_,
+                          const int nlocal, __global numtyp *x_brick,
+                          __global numtyp *y_brick, __global numtyp *z_brick,
+                          __global numtyp *_rho_coeff, const int npts_x,
+                          const int npts_yx, const numtyp b_lo_x,
+                          const numtyp b_lo_y, const numtyp b_lo_z,
+                          const numtyp delxinv,  const numtyp delyinv,
+                          const numtyp delzinv, const int order,
+                          const int order2, const numtyp qqrd2e, 
+                          const numtyp scale, __global acctyp4 *ans) {
+  __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
+  int ii=THREAD_ID_X;
+  if (ii<order2+order)
+    rho_coeff[ii]=_rho_coeff[ii];
+  __syncthreads();
+  
+  ii+=BLOCK_ID_X*BLOCK_SIZE_X;
+  
+  int nx,ny,nz;
+  numtyp tx,ty,tz;
+
+  if (ii<nlocal) {
+    numtyp4 p=fetch_pos(ii,x_);
+    numtyp qs=qqrd2e*scale*fetch_q(ii,q_);
+
+    tx=(p.x-b_lo_x)*delxinv;
+    nx=int(tx);
+    ty=(p.y-b_lo_y)*delyinv;
+    ny=int(ty);
+    tz=(p.z-b_lo_z)*delzinv;
+    nz=int(tz);
+
+    numtyp dx=nx+(numtyp)0.5-tx;
+    numtyp dy=ny+(numtyp)0.5-ty;
+    numtyp dz=nz+(numtyp)0.5-tz;
+
+    numtyp rho1d[2][MAX_STENCIL];
+    for (int k=0; k<order; k++) {
+      rho1d[0][k]=(numtyp)0.0;
+      rho1d[1][k]=(numtyp)0.0;
+      for (int l=order2+k; l>=k; l-=order) {
+        rho1d[0][k]=rho_coeff[l]+rho1d[0][k]*dx;
+        rho1d[1][k]=rho_coeff[l]+rho1d[1][k]*dy;
+      }
+    }
+        
+    numtyp4 ek;
+    ek.x=(acctyp)0.0;
+    ek.y=(acctyp)0.0;
+    ek.z=(acctyp)0.0;
+    int mz=mul24(nz,npts_yx)+nx;
+    for (int n=0; n<order; n++) {
+      numtyp rho1d_2=(numtyp)0.0;
+      for (int k=order2+n; k>=n; k-=order)
+        rho1d_2=rho_coeff[k]+rho1d_2*dz;
+      numtyp z0=qs*rho1d_2;
+      int my=mz+mul24(ny,npts_x);
+      for (int m=0; m<order; m++) {
+        numtyp y0=z0*rho1d[1][m];
+	      for (int l=0; l<order; l++) {
+	        numtyp x0=y0*rho1d[0][l];
+	        ek.x-=x0*x_brick[my+l];
+	        ek.y-=x0*y_brick[my+l];
+	        ek.z-=x0*z_brick[my+l];
+	      }
+        my+=npts_x;
+      }
+      mz+=npts_yx;
+	  }
+    ans[ii]=ek;
+	}
+}
+
 #endif