diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index f4334f9df..d6f8a6a81 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -1,311 +1,311 @@
 # /* ----------------------------------------------------------------------   
 #    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator       
 #    http://lammps.sandia.gov, Sandia National Laboratories                   
 #    Steve Plimpton, sjplimp@sandia.gov                                       
 #                                                                             
 #    Copyright (2003) Sandia Corporation.  Under the terms of Contract        
 #    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains   
 #    certain rights in this software.  This software is distributed under      
 #    the GNU General Public License.                                          
 #                                                                             
 #    See the README file in the top-level LAMMPS directory.                   
 # ------------------------------------------------------------------------- */
 #                                                                             
 # /* ----------------------------------------------------------------------   
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov               
 #                          Peng Wang (Nvidia), penwang@nvidia.com
 #                          Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov             
 # ------------------------------------------------------------------------- */
 
 CUDA  = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
              $(CUDA_PRECISION)
 CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
         -Icudpp_mini
 CUDA_LINK = $(CUDA_LIB) -lcudart
 
 GPU_LIB = $(LIB_DIR)/libgpu.a
 
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
 NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H) 
 # Headers for Pair Stuff
 PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
           pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
-          pair_gpu_balance.h
+          pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/nvc_get_devices
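+# cudpp_mini supplies the radix-sort and scan primitives (presumably used by
+# the GPU neighbor-list build); they are linked into libgpu.a below.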
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
 OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
        $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
        $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
        $(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
        $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
        $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
        $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
        $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
        $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
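+
+# Kernel build pipeline (sketch): each *_kernel.cu is compiled to PTX with
+# nvcc (--ptx), then ./geryon/file_to_cstr.sh wraps the PTX in a C string
+# header (*_ptx.h) that the host .cpp includes and loads at run time. For a
+# hypothetical pair style "foo" the pattern would be:
+#   $(OBJ_DIR)/foo_gpu_kernel.ptx: foo_gpu_kernel.cu
+#           $(CUDA) --ptx -DNV_KERNEL -o $@ foo_gpu_kernel.cu
+#   $(OBJ_DIR)/foo_gpu_ptx.h: $(OBJ_DIR)/foo_gpu_kernel.ptx
+#           $(BSH) ./geryon/file_to_cstr.sh $< $@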
 
 all: $(GPU_LIB) $(EXECS)
 
 $(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini
 
 $(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini
 
 $(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini
 
 $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
 
 $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
 
 $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
 
 $(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h
 
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h
 
 $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
 
 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
 
 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp
 
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp
 
 $(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
 
 $(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
 
 $(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
 
 $(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
 
 $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
 	$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
 
 $(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu
 
 $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu
 
 $(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h
 
 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
 	$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h
 
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h
 
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h
 
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
 
 $(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
 
 $(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp
 	$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
 
 $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h
 
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
 
 $(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
 
 $(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp
 	$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu
 
 $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
 
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx: cmmc_msm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_msm_gpu_kernel.cu
 
 $(OBJ_DIR)/cmmc_msm_gpu_ptx.h: $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
 
 $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_memory.cpp $(OBJ_DIR)/cmmc_msm_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_msm_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK) 
 
 $(GPU_LIB): $(OBJS)
 	$(AR) -crusv $(GPU_LIB) $(OBJS)
 
 clean:
 	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 3e758bb75..9c601bc71 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -1,207 +1,207 @@
 # /* ----------------------------------------------------------------------   
 #    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator       
 #    http://lammps.sandia.gov, Sandia National Laboratories                   
 #    Steve Plimpton, sjplimp@sandia.gov                                       
 #                                                                             
 #    Copyright (2003) Sandia Corporation.  Under the terms of Contract        
 #    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains   
 #    certain rights in this software.  This software is distributed under      
 #    the GNU General Public License.                                          
 #                                                                             
 #    See the README file in the top-level LAMMPS directory.                   
 # ------------------------------------------------------------------------- */
 #                                                                             
 # /* ----------------------------------------------------------------------   
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov               
 #                          Peng Wang (Nvidia), penwang@nvidia.com             
 #                          Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov             
 # ------------------------------------------------------------------------- */
 
 OCL  = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
 OCL_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
 PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
           pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
-          pair_gpu_balance.h
+          pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
 OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
        $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
        $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o 
 KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
        $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
        $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
        $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
        $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
        $(OBJ_DIR)/cmmc_long_gpu_cl.h 
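+
+# Unlike the CUDA build, there is no PTX step here: file_to_cstr.sh embeds
+# the raw kernel source as a C string header (*_cl.h) and the OpenCL runtime
+# compiles it when the program is loaded.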
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
 all: $(OCL_LIB) $(EXECS)
 
 $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h
 
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
 
 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
 	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
 
 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
 	$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp
 
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp
 
 $(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
 
 $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h
 	$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
 
 $(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
 	cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
 	cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
 	rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar
 
 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
 	$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;
 
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;
 
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;
 
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
 
 $(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp
 	$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;
 
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
 
 $(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp
 	$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;
 
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
 $(OCL_LIB): $(OBJS)
 	$(AR) -crusv $(OCL_LIB) $(OBJS)
 
 opencl: $(OCL_EXECS)
 
 clean:
 	rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
 
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 837e5f52c..83d20a51c 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -1,529 +1,546 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #include "pair_gpu_device.h"
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
 #ifdef USE_OPENCL
 #include "pair_gpu_dev_cl.h"
 #else
 #include "pair_gpu_dev_ptx.h"
 #endif
 
 #define BLOCK_1D 64
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
                                   _last_device(0), _compiled(false) {
 }
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::~PairGPUDevice() {
   clear_device();
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, 
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
                                  const int nthreads) {
   _nthreads=nthreads;
   #ifdef _OPENMP
   omp_set_num_threads(nthreads);
   #endif
 
   if (_device_init)
     return true;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
   _first_device=first_gpu;
   _last_device=last_gpu;
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
   MPI_Comm_size(_comm_world,&_world_size);
   // Get the rank/size within the replica
   MPI_Comm_rank(_comm_replica,&_replica_me);
   MPI_Comm_size(_comm_replica,&_replica_size);
 
   // Get the names of all nodes
   int name_length;
   char node_name[MPI_MAX_PROCESSOR_NAME];
   char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
   MPI_Get_processor_name(node_name,&name_length);
   MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                 MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
   std::string node_string=std::string(node_name);
   
   // Get the number of procs per node                
   std::map<std::string,int> name_map;
   std::map<std::string,int>::iterator np;
   for (int i=0; i<_world_size; i++) {
     std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
     np=name_map.find(i_string);
     if (np==name_map.end())
       name_map[i_string]=1;
     else
       np->second++;
   }
   int procs_per_node=name_map.begin()->second;
 
   // Assign a unique id to each node
   int split_num=0, split_id=0;
   for (np=name_map.begin(); np!=name_map.end(); ++np) {
     if (np->first==node_string)
       split_id=split_num;
     split_num++;
   }
   
   // Set up a per node communicator and find rank within
   MPI_Comm node_comm;
   MPI_Comm_split(_comm_world, split_id, 0, &node_comm);  
   int node_rank;
   MPI_Comm_rank(node_comm,&node_rank);                  
 
   // set the device ID
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
   int my_gpu=node_rank/_procs_per_gpu+first_gpu;
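+  // Example: 8 procs on a node with GPUs 0-1 gives _procs_per_gpu=4, so node
+  // ranks 0-3 share device 0 and ranks 4-7 share device 1.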
   
   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
   MPI_Comm_rank(_comm_gpu,&_gpu_rank);
 
   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
     return false;
   
   gpu->set(my_gpu);
 
   _block_size=BLOCK_1D;
   if (static_cast<size_t>(_block_size)>gpu->group_size())
     _block_size=gpu->group_size();
 
+  _long_range_precompute=0;
+  
   return true;
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
                           const bool rot, const int nlocal, 
                           const int host_nlocal, const int nall,
                           PairGPUNbor *nbor, const int maxspecial,
                           const int gpu_host, const int max_nbors, 
                           const double cell_size, const bool pre_cut) {
   if (!_device_init)
     return false;                          
 
   // Counts of data transfers for timing overhead estimates
   _data_in_estimate=0;
   _data_out_estimate=1;
 
   // Initial number of local particles
   int ef_nlocal=nlocal;
   if (_particle_split<1.0 && _particle_split>0.0)
     ef_nlocal=static_cast<int>(_particle_split*nlocal);
 
   bool gpu_nbor=false;
   if (_gpu_mode==GPU_NEIGH)
     gpu_nbor=true;
     
   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
       return false;
     compile_kernels();
     _data_in_estimate++;
     if (charge)
       _data_in_estimate++;
     if (rot)
       _data_in_estimate++;
   } else {
     if (atom.charge()==false && charge)
       _data_in_estimate++;
     if (atom.quat()==false && rot)
       _data_in_estimate++;
     atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial);
   }
   
   if (!ans.init(ef_nlocal,charge,rot,*gpu))
     return false;
 
   if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut))
     return false;
   nbor->cell_size(cell_size);
 
   _init_count++;
   return true;
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
                           const int nall) {
   if (!_device_init)
     return false;                          
 
   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(nall,true,false,*gpu,false,false))
       return false;
     compile_kernels();
   } else
     atom.add_fields(true,false,false,false);
 
   if (!ans.init(nlocal,true,false,*gpu))
     return false;
 
   _init_count++;
   return true;
 }
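 
+// _long_range_precompute chooses which PPPM object, if any, runs its
+// asynchronous charge-assignment precompute: 1 selects the single (float)
+// grid, 2 the double grid; init_device() and clear() reset it to 0.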
 
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_single_precompute
+                     (PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
+  _long_range_precompute=1;
+  pppm_single=pppm;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_double_precompute
+                     (PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
+  _long_range_precompute=2;
+  pppm_double=pppm;
+}
+
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::init_message(FILE *screen, const char *name,
                                   const int first_gpu, const int last_gpu) {
   #ifdef USE_OPENCL
   std::string fs="";
   #else
   std::string fs=toa(gpu->free_gigabytes())+"/";
   #endif
   
   if (_replica_me == 0 && screen) {
     fprintf(screen,"\n-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
     fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
     fprintf(screen,"-  with %d procs per device.\n",_procs_per_gpu);
     #ifdef _OPENMP
     fprintf(screen,"-  with %d threads per proc.\n",_nthreads);
     #endif
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
 
     for (int i=first_gpu; i<=last_gpu; i++) {
       std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                         toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                         " GHZ (";
       if (sizeof(PRECISION)==4) {
         if (sizeof(ACC_PRECISION)==4)
           sname+="Single Precision)";
         else
           sname+="Mixed Precision)";
       } else
         sname+="Double Precision)";
 
       fprintf(screen,"GPU %d: %s\n",i,sname.c_str());         
     }
 
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n\n");
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls, 
                                            double &gpu_overhead,
                                            double &gpu_driver_overhead) {
   UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
   UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
   UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
   UCL_Timer over_timer(*gpu);
 
   if (_data_in_estimate>0) {
     host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
     dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
     timers_in=new UCL_Timer[_data_in_estimate];
   }
   
   if (_data_out_estimate>0) {
     host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
     dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
     timers_out=new UCL_Timer[_data_out_estimate];
   }
   
   if (kernel_calls>0) {
     kernel_data=new UCL_D_Vec<int>[kernel_calls];
     timers_kernel=new UCL_Timer[kernel_calls];
   }
   
   for (int i=0; i<_data_in_estimate; i++) {
     host_data_in[i].alloc(1,*gpu);
     dev_data_in[i].alloc(1,*gpu);
     timers_in[i].init(*gpu);
   }  
   
   for (int i=0; i<_data_out_estimate; i++) {
     host_data_out[i].alloc(1,*gpu);
     dev_data_out[i].alloc(1,*gpu);
     timers_out[i].init(*gpu);
   }  
   
   for (int i=0; i<kernel_calls; i++) {
     kernel_data[i].alloc(1,*gpu);
     timers_kernel[i].init(*gpu);
   }  
   
   gpu_overhead=0.0;
   gpu_driver_overhead=0.0;
   
   for (int i=0; i<10; i++) {
     gpu->sync();
     gpu_barrier();
     over_timer.start();
     gpu->sync();
     gpu_barrier();
 
     double driver_time=MPI_Wtime();
     for (int i=0; i<_data_in_estimate; i++) {
       timers_in[i].start();
       ucl_copy(dev_data_in[i],host_data_in[i],true);
       timers_in[i].stop();
     }
     
     for (int i=0; i<kernel_calls; i++) {
       timers_kernel[i].start();
       zero(kernel_data[i],1);
       timers_kernel[i].stop();
     }
 
     for (int i=0; i<_data_out_estimate; i++) {
       timers_out[i].start();
       ucl_copy(host_data_out[i],dev_data_out[i],true);
       timers_out[i].stop();
     }
     over_timer.stop();
 
     double time=over_timer.seconds();
     driver_time=MPI_Wtime()-driver_time;
      
     for (int i=0; i<_data_in_estimate; i++)
       timers_in[i].add_to_total();
     for (int i=0; i<kernel_calls; i++)
       timers_kernel[i].add_to_total();
     for (int i=0; i<_data_out_estimate; i++)
       timers_out[i].add_to_total();
 
     double mpi_time, mpi_driver_time;
     MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
     MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
     gpu_overhead+=mpi_time;
     gpu_driver_overhead+=mpi_driver_time;
   }
   gpu_overhead/=10.0;
   gpu_driver_overhead/=10.0;
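+  // The ten timed iterations are averaged; the MPI_MAX reductions over the
+  // per-device communicator make the estimate track the slowest proc sharing
+  // each GPU.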
 
   if (_data_in_estimate>0) {
     delete [] host_data_in;
     delete [] dev_data_in;
     delete [] timers_in;
   }
   
   if (_data_out_estimate>0) {
     delete [] host_data_out;
     delete [] dev_data_out;
     delete [] timers_out;
   }
   
   if (kernel_calls>0) {
     delete [] kernel_data;
     delete [] timers_kernel;
   }
 }              
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_times(UCL_Timer &time_pair, 
                                   PairGPUAns<numtyp,acctyp> &ans, 
                                   PairGPUNbor &nbor, const double avg_split, 
                                   const double max_bytes, 
                                   const double gpu_overhead,
                                   const double driver_overhead, FILE *screen) {
   double single[8], times[8];
 
   single[0]=atom.transfer_time()+ans.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
   single[4]=atom.cast_time()+ans.cast_time();
   single[5]=gpu_overhead;
   single[6]=driver_overhead;
   single[7]=ans.cpu_idle_time();
 
   MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
 
   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
         if (nbor.gpu_nbor())
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
       }
       fprintf(screen,"GPU Overhead:    %.4f s.\n",times[5]/_replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
       fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
       fprintf(screen,"CPU Idle_Time:   %.4f s.\n",times[7]/_replica_size);
 
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, 
                                          UCL_Timer &time_out,
                                          UCL_Timer &time_map,
                                          UCL_Timer &time_rho,
                                          UCL_Timer &time_interp,
                                          PairGPUAns<numtyp,acctyp> &ans, 
                                          const double max_bytes, 
                                          const double cpu_time, FILE *screen) {
   double single[7], times[7];
 
 //  single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+
 //            time_out.total_seconds();
 //  single[1]=atom.cast_time()+ans.cast_time();
 
   single[0]=time_out.total_seconds();
   single[1]=time_in.total_seconds();
   single[2]=time_map.total_seconds();
   single[3]=time_rho.total_seconds();
   single[4]=time_interp.total_seconds();
   single[5]=ans.transfer_time()+ans.cast_time();
   single[6]=cpu_time;
 
   MPI_Reduce(single,times,7,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
 
   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Out:        %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data In:         %.4f s.\n",times[1]/_replica_size);
         fprintf(screen,"Kernel (map):    %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Kernel (rho):    %.4f s.\n",times[3]/_replica_size);
         fprintf(screen,"Force interp:    %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Total rho:       %.4f s.\n",(times[0]+times[2]+times[3])/_replica_size);
         fprintf(screen,"Total interp:    %.4f s.\n",(times[1]+times[4])/_replica_size);
         fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
         fprintf(screen,"Total:           %.4f s.\n",
                 (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
                 _replica_size);
         fprintf(screen,"CPU Poisson:     %.4f s.\n",times[6]/_replica_size);
       }
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
 
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
+    _long_range_precompute=0;
     _init_count--;
     if (_init_count==0) {
       atom.clear();
       _nbor_shared.clear();
       if (_compiled) {
         k_zero.clear();
         delete dev_program;
         _compiled=false;
       }
     }
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear_device() {
   while (_init_count>0)
     clear();
   if (_device_init) {
     delete gpu;
     _device_init=false;
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::compile_kernels() {
   if (_compiled)
     return;
   	
   std::string flags="-cl-mad-enable";
   dev_program=new UCL_Program(*gpu);
   dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
   k_zero.set_function(*dev_program,"kernel_zero");
   _compiled=true;
 }
 
 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
   return atom.host_memory_usage()+4*sizeof(numtyp)+
          sizeof(PairGPUDevice<numtyp,acctyp>);
 }
 
 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
 bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                      const int last_gpu, const int gpu_mode, 
                      const double particle_split, const int nthreads) {
   return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                                      particle_split,nthreads);
 }
 
 void lmp_clear_device() {
   pair_gpu_device.clear_device();
 }
 
 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
   return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index ccb9d31c8..c0769bff9 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -1,227 +1,253 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifndef PAIR_GPU_DEVICE_H
 #define PAIR_GPU_DEVICE_H
 
 #include "pair_gpu_atom.h"
 #include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
+#include "pppm_gpu_memory.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
 #include <queue>
 
+template <class numtyp, class acctyp, 
+          class grdtyp, class grdtyp4> class PPPMGPUMemory;
+
 template <class numtyp, class acctyp>
 class PairGPUDevice {
  public:
   PairGPUDevice();
   ~PairGPUDevice(); 
  
   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
     * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
   bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, 
                    const int last_gpu, const int gpu_mode, 
                    const double particle_split, const int nthreads);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
     * \param nlocal Total number of local particles to allocate memory for
     * \param host_nlocal Initial number of host particles to allocate memory for
     * \param nall Total number of local+ghost particles
     * \param gpu_nbor True if neighboring is performed on device
     * \param gpu_host 0 if host will not perform force calculations,
     *                 1 if gpu_nbor is true, and host needs a half nbor list,
     *                 2 if gpu_nbor is true, and host needs a full nbor list
     * \param max_nbors Initial number of rows in the neighbor matrix
     * \param cell_size cutoff+skin 
     * \param pre_cut True if cutoff test will be performed in separate kernel
     *                than the force kernel **/
   bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
             const int nlocal, const int host_nlocal, const int nall,
             PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
             const int max_nbors, const double cell_size, const bool pre_cut);
 
   /// Initialize the device for Atom storage only
   /** \param nlocal Total number of local particles to allocate memory for
     * \param nall Total number of local+ghost particles **/
   bool init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
 
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
                     const int first_gpu, const int last_gpu);
 
+  /// Register the single precision PPPM object for asynchronous
+  /// charge assignment precomputation
+  void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
+                                           float,_lgpu_float4> *pppm);
+
+  /// Register the double precision PPPM object for asynchronous
+  /// charge assignment precomputation
+  void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
+                                           double,_lgpu_double4> *pppm);
+
   /// Estimate the overhead from GPU calls from multiple procs
   /** \param kernel_calls Number of kernel calls per timestep used to time the
     *                     estimated overhead
     * \param gpu_overhead Estimated gpu overhead per timestep (sec)
     * \param driver_overhead Estimated overhead from driver per timestep (s) **/
   void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
                              double &gpu_driver_overhead);
 
   /// Returns true if double precision is supported on card
   inline bool double_precision() { return gpu->double_precision(); }
   
   /// Output a message with timing information
   void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans, 
                     PairGPUNbor &nbor, const double avg_split, 
                     const double max_bytes, const double gpu_overhead,
                     const double driver_overhead, FILE *screen);
 
   /// Output a message with timing information
   void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
                            UCL_Timer & time_map, UCL_Timer & time_rho,
                            UCL_Timer &time_interp, 
                            PairGPUAns<numtyp,acctyp> &ans, 
                            const double max_bytes, const double cpu_time,
                            FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
   
   /// Clear all memory on host and device
   void clear_device();
 
   /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
   inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
     { ans_queue.push(ans); }
 
   /// Add "answers" (force,energies,etc.) into LAMMPS structures
   inline double fix_gpu(double **f, double **tor, double *eatom,
                         double **vatom, double *virial, double &ecoul) {
     atom.data_unavail();
     if (ans_queue.empty()==false) {
       stop_host_timer();
       double evdw=0.0;
       while (ans_queue.empty()==false) {
         evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
         ans_queue.pop();
       }                                                 
       return evdw;
     }
     return 0.0;
   }
 
   /// Start timer on host
   inline void start_host_timer() 
     { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
   
   /// Stop timer on host
   inline void stop_host_timer() { 
     if (_host_timer_started) {
       _cpu_full=MPI_Wtime()-_cpu_full; 
       _host_timer_started=false;
     }
   }
   
   /// Return host time
   inline double host_time() { return _cpu_full; }
 
   /// Return host memory usage in bytes
   double host_memory_usage() const;
 
   /// Return the number of procs sharing a device (size of device communicator)
   inline int procs_per_gpu() const { return _procs_per_gpu; }
   /// Return the number of threads per proc
   inline int num_threads() const { return _nthreads; }
   /// My rank within all processes
   inline int world_me() const { return _world_me; }
   /// Total number of processes
   inline int world_size() const { return _world_size; }
   /// MPI Barrier for world
   inline void world_barrier() { MPI_Barrier(_comm_world); }
   /// Return the replica MPI communicator
   inline MPI_Comm & replica() { return _comm_replica; }
   /// My rank within replica communicator
   inline int replica_me() const { return _replica_me; }
   /// Number of procs in replica communicator
   inline int replica_size() const { return _replica_size; }
   /// Return the per-GPU MPI communicator
   inline MPI_Comm & gpu_comm() { return _comm_gpu; }
   /// Return my rank in the device communicator
   inline int gpu_rank() const { return _gpu_rank; }
   /// MPI Barrier for gpu
   inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
   /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
   inline int gpu_mode() const { return _gpu_mode; }
   /// Index of first device used by a node
   inline int first_device() const { return _first_device; }
   /// Index of last device used by a node
   inline int last_device() const { return _last_device; }
   /// Particle split defined in fix
   inline double particle_split() const { return _particle_split; }
   /// Return the initialization count for the device
   inline int init_count() const { return _init_count; }
 
   // -------------------- SHARED DEVICE ROUTINES -------------------- 
   // Perform asynchronous zero of integer array 
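+  // (e.g. numel=1000 with _block_size=64 launches ceil(1000/64)=16 blocks)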
   void zero(UCL_D_Vec<int> &mem, const int numel) {
     int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
                                     _block_size));
     k_zero.set_size(num_blocks,_block_size);
     k_zero.run(&mem.begin(),&numel);
   }
 
   // -------------------------- DEVICE DATA ------------------------- 
 
   /// Geryon Device
   UCL_Device *gpu;
 
   enum{GPU_FORCE, GPU_NEIGH};
 
   // --------------------------- ATOM DATA -------------------------- 
 
   /// Atom Data
   PairGPUAtom<numtyp,acctyp> atom;
 
   // --------------------------- NBOR DATA ----------------------------
   
   /// Neighbor Data
   PairGPUNborShared _nbor_shared;
 
+  // ------------------------ LONG RANGE DATA -------------------------
+  
+  /// Precompute flag: 0 for none, 1 for single precision, 2 for double
+  int _long_range_precompute;
+  PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
+  PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
+
+  /// Precompute long range charge assignment asynchronously, if registered
+  /** Sketch of the intended dispatch; assumes PPPMGPUMemory provides a
+    * precompute() with this signature, which is not shown in this patch. **/
+  inline void precompute(const int ago, const int nlocal, const int nall,
+                         double **host_x, int *host_type, bool &success,
+                         double *charge, double *boxlo, const double delxinv,
+                         const double delyinv, const double delzinv) {
+    if (_long_range_precompute==1)
+      pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,
+                              charge,boxlo,delxinv,delyinv,delzinv);
+    else if (_long_range_precompute==2)
+      pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,
+                              charge,boxlo,delxinv,delyinv,delzinv);
+  }
+
  private:
   std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
   bool _device_init, _host_timer_started;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, 
       _replica_size;
   int _gpu_mode, _first_device, _last_device, _nthreads;
   double _particle_split;
   double _cpu_full;
 
   int _block_size;
   UCL_Program *dev_program;
   UCL_Kernel k_zero;
   bool _compiled;
   void compile_kernels();
 
   int _data_in_estimate, _data_out_estimate;
   
   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
     o.precision(2);
     o << in;
     return o.str();
   }
 
 };
 
 #endif
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 6f1bbf0d2..c52d650ca 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -1,371 +1,404 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
  
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifdef USE_OPENCL
 #include "pppm_gpu_cl.h"
 #define MEM_THREADS 16
 #else
 #include "pppm_f_gpu_ptx.h"
 #include "pppm_d_gpu_ptx.h"
 #endif
 #include "pppm_gpu_memory.h"
 #include <cassert>
 
 // Maximum order for spline
 #define MAX_SPLINE 8
 // Thread block size for all kernels (Must be >=MAX_SPLINE^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
 //#define PENCIL_SIZE MEM_THREADS
 #define PENCIL_SIZE 32
 // Number of pencils per block for charge spread
 //#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
 #define BLOCK_PENCILS 2
 
 #define PPPMGPUMemoryT PPPMGPUMemory<numtyp, acctyp, grdtyp, grdtyp4>
 
 extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 PPPMGPUMemoryT::PPPMGPUMemory() : _allocated(false), _compiled(false),
                                   _max_bytes(0) {
   device=&pair_gpu_device;
   ans=new PairGPUAns<numtyp,acctyp>();
 }
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 PPPMGPUMemoryT::~PPPMGPUMemory() {
   clear(0.0);
   delete ans;
 }
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 int PPPMGPUMemoryT::bytes_per_atom() const {
   return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
 }
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
                               const int order, const int nxlo_out,
                               const int nylo_out, const int nzlo_out,
                               const int nxhi_out, const int nyhi_out,
                               const int nzhi_out, double **rho_coeff,
-                              grdtyp **vd_brick, int &flag) {
+                              grdtyp **vd_brick, const double slab_volfactor, 
+                              const int nx_pppm, const int ny_pppm,
+                              const int nz_pppm, int &flag) {
   _max_bytes=10;
   screen=_screen;
   bool success=true;
 
   if (!device->init(*ans,nlocal,nall)) {
     flag=-2;
     return 0;
   }
   if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
     flag=-5;
     return 0;
   }
-
+  
   ucl_device=device->gpu;
   atom=&device->atom;
 
   _block_size=BLOCK_1D;
   assert(BLOCK_PENCILS*PENCIL_SIZE==BLOCK_1D);
   assert(MAX_SPLINE*MAX_SPLINE<=BLOCK_1D);
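+  // a thread block must split evenly into pencils and must provide at
+  // least MAX_SPLINE*MAX_SPLINE threads for the spline kernels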
   if (static_cast<size_t>(_block_size)>ucl_device->group_size())
     _block_size=ucl_device->group_size();
   compile_kernels(*ucl_device);
 
   // Initialize timers for the selected GPU
   time_in.init(*ucl_device);
   time_in.zero();
   time_out.init(*ucl_device);
   time_out.zero();
   time_map.init(*ucl_device);
   time_map.zero();
   time_rho.init(*ucl_device);
   time_rho.zero();
   time_interp.init(*ucl_device);
   time_interp.zero();
 
   pos_tex.bind_float(atom->dev_x,4);
   q_tex.bind_float(atom->dev_q,1);
 
   _allocated=true;
   _max_bytes=0;
   _max_an_bytes=ans->gpu_bytes();
   
   _order=order;
   _order_m_1=order-1;
   _order2=_order_m_1*_order;
   _nlower=-(_order-1)/2;
   _nupper=order/2;
   _nxlo_out=nxlo_out;
   _nylo_out=nylo_out;
   _nzlo_out=nzlo_out;
   _nxhi_out=nxhi_out;
   _nyhi_out=nyhi_out;
   _nzhi_out=nzhi_out;
+
+  _slab_volfactor=slab_volfactor;
+  _nx_pppm=nx_pppm;
+  _ny_pppm=ny_pppm;
+  _nz_pppm=nz_pppm;
+
   _max_brick_atoms=10;
 
   // Get rho_coeff on device
   int n2lo=(1-order)/2;
   int numel=order*( order/2 - n2lo + 1 );
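+  // e.g. order=5: n2lo=(1-5)/2=-2 and numel=5*(2-(-2)+1)=25 coefficients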
   success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
                       UCL_SUCCESS);
   UCL_H_Vec<double> view;
   view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
   ucl_copy(d_rho_coeff,view,true);
   _max_bytes+=d_rho_coeff.row_bytes();
   
   // Allocate storage for grid
   _npts_x=nxhi_out-nxlo_out+1;
   _npts_y=nyhi_out-nylo_out+1;
   _npts_z=nzhi_out-nzlo_out+1;
   _npts_yx=_npts_x*_npts_y;
   success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
                       UCL_SUCCESS);
   success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
                       UCL_SUCCESS);
   success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
                       UCL_SUCCESS);
   *vd_brick=h_vd_brick.begin();
   _max_bytes+=d_brick.row_bytes();
 
   // Allocate vector with count of atoms assigned to each grid point
   _nlocal_x=_npts_x+_nlower-_nupper;
   _nlocal_y=_npts_y+_nlower-_nupper;
   _nlocal_z=_npts_z+_nlower-_nupper;
   _nlocal_yx=_nlocal_x*_nlocal_y;
   _atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
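+  // _nlower-_nupper equals -(order-1), so each local dimension trims the
+  // (order-1) ghost points from the padded brick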
   success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
                       UCL_SUCCESS);
   _max_bytes+=d_brick_counts.row_bytes();
 
   // Allocate storage for atoms assigned to each grid point
   success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
                                           *ucl_device)==UCL_SUCCESS);
   _max_bytes+=d_brick_atoms.row_bytes();
 
   // Allocate error flags for checking out of bounds atoms
   success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
   success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
                                          UCL_SUCCESS);
   if (!success) {
     flag=-3;
     return 0;
   }
   
   d_error_flag.zero();
   _max_bytes+=1;
 
   return h_brick.begin();
 }
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 void PPPMGPUMemoryT::clear(const double cpu_time) {
   if (!_allocated)
     return;
   _allocated=false;
+  _precompute_done=false;
   
   d_brick.clear();
   h_brick.clear();
   h_vd_brick.clear();
   d_brick_counts.clear();
   h_error_flag.clear();
   d_error_flag.clear();
   d_brick_atoms.clear();
   
   acc_timers();
   device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
                               *ans,_max_bytes+_max_an_bytes,cpu_time,screen);
 
   if (_compiled) {
     k_particle_map.clear();
     k_make_rho.clear();
     k_interp.clear();
     delete pppm_program;
     _compiled=false;
   }
 
   time_in.clear();
   time_out.clear();
   time_map.clear();
   time_rho.clear();
   time_interp.clear();
 
   device->clear();
 }
 
 // ---------------------------------------------------------------------------
-// Charge spreading stuff
+// Charge assignment that can be performed asynchronously
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
-int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
-                           double **host_x, int *host_type, bool &success,
-                           double *host_q, double *boxlo, 
-                           const double delxinv, const double delyinv,
-                           const double delzinv) {
+void PPPMGPUMemoryT::precompute(const int ago, const int nlocal, const int nall,
+                                double **host_x, int *host_type, bool &success,
+                                double *host_q, double *boxlo, 
+                                const double delxinv, const double delyinv,
+                                const double delzinv) {
+  if (_precompute_done)
+    return;
+    
   device->stop_host_timer();
   
   acc_timers();
   if (nlocal==0) {
     zero_timers();
-    return 0;
+    return;
   }
   
   ans->inum(nlocal);
 
   if (ago==0) {
     resize_atom(nlocal,nall,success);
     resize_local(nlocal,success);
     if (!success)
-      return 0;
+      return;
 
     double bytes=ans->gpu_bytes();
     if (bytes>_max_an_bytes)
       _max_an_bytes=bytes;
   }
 
   atom->cast_x_data(host_x,host_type);
   atom->cast_q_data(host_q);
   atom->add_x_data(host_x,host_type);
   atom->add_q_data();
 
   time_map.start();
 
   // Compute the block size and grid size to keep all cores busy
   int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
-  int _max_atoms=10;
   int ainum=this->ans->inum();
   
   // Boxlo adjusted to be upper left brick and shift for even spline order
   double shift=0.0;
   if (_order % 2)
     shift=0.5;
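+  // odd order: charges are assigned relative to the nearest grid point;
+  // even order: relative to the grid point below the particle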
   _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
   _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
   _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
   
   _delxinv=delxinv;
   _delyinv=delyinv;
   _delzinv=delzinv;
   double delvolinv = delxinv*delyinv*delzinv;
   grdtyp f_delvolinv = delvolinv;
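+  // inverse grid-cell volume converts point charges to grid charge density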
 
   d_brick_counts.zero();
   k_particle_map.set_size(GX,BX);
   k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
                      &ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
                      &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, 
                      &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, 
                      &_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
   time_map.stop();
 
   time_rho.start();
   BX=block_size();
   GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/BLOCK_PENCILS));
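+  // PENCIL_SIZE threads cooperate on each x-pencil; every block covers
+  // BLOCK_PENCILS of the _npts_y*_npts_z pencils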
   k_make_rho.set_size(GX,BX);
   k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
                  &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x,
                  &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y, &_nlocal_z,
                  &_order_m_1, &_order, &_order2);
   time_rho.stop();
 
   time_out.start();
   ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
-  ucl_copy(h_error_flag,d_error_flag,false);
+  ucl_copy(h_error_flag,d_error_flag,true);
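+  // both copies are asynchronous; spread() calls time_out.sync_stop()
+  // before h_error_flag is read on the host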
   time_out.stop();
   
+  _precompute_done=true;
+}
+
+// ---------------------------------------------------------------------------
+// Charge spreading stuff
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
+                           double **host_x, int *host_type, bool &success,
+                           double *host_q, double *boxlo, 
+                           const double delxinv, const double delyinv,
+                           const double delzinv) {
+  precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
+             delyinv,delzinv);
+  if (!success || nlocal==0)
+    return 0;
+    
+  time_out.sync_stop();
+
+  _precompute_done=false;
+
   if (h_error_flag[0]==2) {
     // Not enough storage for atoms on the brick
     _max_brick_atoms*=2;
     d_error_flag.zero();
     d_brick_atoms.clear();
-    d_brick_atoms.alloc(_atom_stride*_max_atoms,*ucl_device);
+    d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
     _max_bytes+=d_brick_atoms.row_bytes();
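+    // retry with the doubled capacity; precompute() reruns because
+    // _precompute_done was reset above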
     return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, 
                   delxinv,delyinv,delzinv);
   }
   
   return h_error_flag[0];
 }
 
+
 // ---------------------------------------------------------------------------
 // Interpolate the grid field to the particles (force interpolation)
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 void PPPMGPUMemoryT::interp(const grdtyp qqrd2e_scale) {
   time_in.start();
   ucl_copy(d_brick,h_vd_brick,true);
   time_in.stop();
   
   time_interp.start();
   // Compute the block size and grid size to keep all cores busy
   int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
   int ainum=this->ans->inum();
   
   k_interp.set_size(GX,BX);
   k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, 
                &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
                &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
                &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
   time_interp.stop();
 
   ans->copy_answers(false,false,false,false);
   device->add_ans_object(ans);
 }
 
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 double PPPMGPUMemoryT::host_memory_usage() const {
   return device->atom.host_memory_usage()+
          sizeof(PPPMGPUMemory<numtyp,acctyp,grdtyp,grdtyp4>);
 }
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
   if (_compiled)
     return;
 
   std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                     std::string(OCL_PRECISION_COMPILE);
   #ifdef USE_OPENCL
   flags+=std::string("-D grdtyp=")+ucl_template_name<grdtyp>()+" -D grdtyp4="+
          ucl_template_name<grdtyp>()+"4";
   #endif
 
   pppm_program=new UCL_Program(dev);
   if (sizeof(grdtyp)==sizeof(float))
     pppm_program->load_string(pppm_f_gpu_kernel,flags.c_str());
   else
     pppm_program->load_string(pppm_d_gpu_kernel,flags.c_str());
 
   k_particle_map.set_function(*pppm_program,"particle_map");
   k_make_rho.set_function(*pppm_program,"make_rho");
   k_interp.set_function(*pppm_program,"interp");
   pos_tex.get_texture(*pppm_program,"pos_tex");
   q_tex.get_texture(*pppm_program,"q_tex");
 
   _compiled=true;
 }
 
 template class PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
 template class PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
 
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index f996cdb1a..7fa666675 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -1,174 +1,186 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
  
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifndef PPPM_GPU_MEMORY_H
 #define PPPM_GPU_MEMORY_H
 
-#include "pair_gpu_device.h"
-#include "pair_gpu_balance.h"
 #include "mpi.h"
+#include "pair_gpu_device.h"
 
 #ifdef USE_OPENCL
 #include "geryon/ocl_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
 
+template <class numtyp, class acctyp> class PairGPUDevice;
+
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 class PPPMGPUMemory {
  public:
   PPPMGPUMemory();
   virtual ~PPPMGPUMemory();
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** Success will be:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -2 if GPU could not be found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for this GPU
    * - -5 if double precision is not supported on the card **/
   grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
                 const int nxlo_out, const int nylo_out, const int nzlo_out,
                 const int nxhi_out, const int nyhi_out, const int nzhi_out,
-                double **rho_coeff, grdtyp **vd_brick, int &success);
+                double **rho_coeff, grdtyp **vd_brick, 
+                const double slab_volfactor, const int nx_pppm, 
+                const int ny_pppm, const int nz_pppm, int &success);
 
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
     if (atom->resize(nall, success)) {
       pos_tex.bind_float(atom->dev_x,4);
       q_tex.bind_float(atom->dev_q,1);
     }
     ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for local atoms and realloc if not
   inline void resize_local(const int inum, bool &success) {
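+    // currently a no-op: PPPM allocates no extra per-local-atom arrays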
   }
   
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear(const double cpu_time);
 
   /// Returns memory usage on device per atom
   int bytes_per_atom() const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
 
   /// Accumulate timers
   inline void acc_timers() {
     atom->acc_timers();
     ans->acc_timers();
     time_in.add_to_total();
     time_out.add_to_total();
     time_map.add_to_total();
     time_rho.add_to_total();
     time_interp.add_to_total();
   }
 
   /// Zero timers
   inline void zero_timers() {
     atom->zero_timers();
     ans->zero_timers();
     time_in.zero();
     time_out.zero();
     time_map.zero();
     time_rho.zero();
     time_interp.zero();
   }
 
+  /// Precomputations for charge assignment that can be done asynchronously
+  void precompute(const int ago, const int nlocal, const int nall,
+                  double **host_x, int *host_type, bool &success,
+                  double *charge, double *boxlo, const double delxinv,
+                  const double delyinv, const double delzinv);
+
   /// Returns non-zero if out of bounds atoms
-  int spread(const int ago,const int nlocal,const int nall,double **host_x,
-             int *host_type,bool &success,double *charge,double *boxlo,
-             const double delxinv,const double delyinv,const double delzinv);
+  int spread(const int ago, const int nlocal, const int nall, double **host_x,
+             int *host_type, bool &success, double *charge, double *boxlo,
+             const double delxinv, const double delyinv, const double delzinv);
 
   void interp(const grdtyp qqrd2e_scale);
 
   // -------------------------- DEVICE DATA ------------------------- 
 
   /// Device Properties and Atom and Neighbor storage
   PairGPUDevice<numtyp,acctyp> *device;
 
   /// Geryon device
   UCL_Device *ucl_device;
 
   /// Device Timers
   UCL_Timer time_in, time_out, time_map, time_rho, time_interp;
 
   /// LAMMPS pointer for screen output
   FILE *screen;
 
   // --------------------------- ATOM DATA --------------------------
 
   /// Atom Data
   PairGPUAtom<numtyp,acctyp> *atom;
 
 
   // --------------------------- GRID DATA --------------------------
 
   UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
   UCL_D_Vec<grdtyp> d_brick;
   
   // Count of number of atoms assigned to each grid point
   UCL_D_Vec<int> d_brick_counts;
   // Atoms assigned to each grid point
   UCL_D_Vec<grdtyp4> d_brick_atoms;
   
   // Error checking for out of bounds atoms
   UCL_D_Vec<int> d_error_flag;
   UCL_H_Vec<int> h_error_flag;
   
   // Number of grid points in brick (including ghost)
   int _npts_x, _npts_y, _npts_z, _npts_yx;
   
   // Number of local grid points in brick
   int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
   
   // -------------------------- SPLINE DATA -------------------------
   UCL_D_Vec<grdtyp> d_rho_coeff;
   int _order, _nlower, _nupper, _order_m_1, _order2;
   int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;
 
   // ------------------------ FORCE/ENERGY DATA -----------------------
 
   PairGPUAns<numtyp,acctyp> *ans;
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pppm_program;
   UCL_Kernel k_particle_map, k_make_rho, k_interp;
   inline int block_size() { return _block_size; }
   inline int block_x_size() { return _block_x_size; }
   inline int block_y_size() { return _block_y_size; }
 
   // --------------------------- TEXTURES -----------------------------
   UCL_Texture pos_tex;
   UCL_Texture q_tex;
 
  protected:
-  bool _allocated, _compiled;
-  int _block_size, _block_x_size, _block_y_size, _max_brick_atoms;
+  bool _allocated, _compiled, _precompute_done;
+  int _block_size, _block_x_size, _block_y_size, _max_brick_atoms;
   double  _max_bytes, _max_an_bytes;
   
   grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; 
 
+  double _slab_volfactor;
+  int _nx_pppm, _ny_pppm, _nz_pppm;
+  
   void compile_kernels(UCL_Device &dev);
 };
 
 #endif
 
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index 6f207113f..1e434d66a 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -1,152 +1,165 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
  
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #include <iostream>
 #include <cassert>
 #include <math.h>
 
 #include "pppm_gpu_memory.h"
 
 using namespace std;
 
 static PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF;
 static PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD;
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 template <class grdtyp, class memtyp>
 grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
                        FILE *screen, const int order, const int nxlo_out, 
                        const int nylo_out, const int nzlo_out,
                        const int nxhi_out, const int nyhi_out,
                        const int nzhi_out, double **rho_coeff,
-                       grdtyp **vd_brick, int &success) {
+                       grdtyp **vd_brick, const double slab_volfactor,
+                       const int nx_pppm, const int ny_pppm, const int nz_pppm,
+                       int &success) {
   pppm.clear(0.0);
   int first_gpu=pppm.device->first_device();
   int last_gpu=pppm.device->last_device();
   int world_me=pppm.device->world_me();
   int gpu_rank=pppm.device->gpu_rank();
   int procs_per_gpu=pppm.device->procs_per_gpu();
 
   pppm.device->init_message(screen,"pppm",first_gpu,last_gpu);
 
   bool message=false;
   if (pppm.device->replica_me()==0 && screen)
     message=true;
 
   if (message) {
     fprintf(screen,"Initializing GPU and compiling on process 0...");
     fflush(screen);
   }
 
   success=0;
   grdtyp * host_brick=NULL;
   if (world_me==0) {
     host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out,
-                         nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,success);
+                         nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
+                         slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
   }
 
   pppm.device->world_barrier();
   if (message)
     fprintf(screen,"Done.\n");
 
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
         fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
       else
         fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0) {
       host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,
                            nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
-                           vd_brick, success);
+                           vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
+                           success);
     }
     pppm.device->gpu_barrier();
     if (message) 
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
   return host_brick;
 }
 
 float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
                         const int order, const int nxlo_out, 
                         const int nylo_out, const int nzlo_out,
                         const int nxhi_out, const int nyhi_out,
                         const int nzhi_out, double **rho_coeff,
-                        float **vd_brick, int &success) {
-  return pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
-                       nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
-                       success);                        
+                        float **vd_brick, const double slab_volfactor,
+                        const int nx_pppm, const int ny_pppm, const int nz_pppm,
+                        int &success) {
+  float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
+                         nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
+                         slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
+  PPPMF.device->set_single_precompute(&PPPMF);                         
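+  // register with the device so charge assignment can be launched
+  // asynchronously through PairGPUDevice::precompute()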
+  return b;
 }
 
 void pppm_gpu_clear_f(const double cpu_time) {
   PPPMF.clear(cpu_time);
 }
 
 int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
                      double **host_x, int *host_type, bool &success,
                      double *host_q, double *boxlo, const double delxinv,
                      const double delyinv, const double delzinv) {
   return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
                       delxinv,delyinv,delzinv);
 }
 
 void pppm_gpu_interp_f(const float qqrd2e_scale) {
   return PPPMF.interp(qqrd2e_scale);
 }
 
 double pppm_gpu_bytes_f() {
   return PPPMF.host_memory_usage();
 }
 
 double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
                          const int order, const int nxlo_out, 
                          const int nylo_out, const int nzlo_out,
                          const int nxhi_out, const int nyhi_out,
                          const int nzhi_out, double **rho_coeff,
-                         double **vd_brick, int &success) {
-  return pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
-                       nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
-                       success);                        
+                         double **vd_brick, const double slab_volfactor,
+                         const int nx_pppm, const int ny_pppm,
+                         const int nz_pppm, int &success) {
+  double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
+                          nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
+                          vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
+                          success);                        
+  PPPMD.device->set_double_precompute(&PPPMD);
+  return b;
 }
 
 void pppm_gpu_clear_d(const double cpu_time) {
   PPPMD.clear(cpu_time);
 }
 
 int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall,
                       double **host_x, int *host_type, bool &success,
                       double *host_q, double *boxlo, const double delxinv,
                       const double delyinv, const double delzinv) {
   return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
                       delxinv,delyinv,delzinv);
 }
 
 void pppm_gpu_interp_d(const double qqrd2e_scale) {
   return PPPMD.interp(qqrd2e_scale);
 }
 
 double pppm_gpu_bytes_d() {
   return PPPMD.host_memory_usage();
 }
 
diff --git a/src/GPU/pppm_gpu_double.cpp b/src/GPU/pppm_gpu_double.cpp
index 4deda350b..33a6bf8d0 100644
--- a/src/GPU/pppm_gpu_double.cpp
+++ b/src/GPU/pppm_gpu_double.cpp
@@ -1,227 +1,231 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "string.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "math.h"
 #include "pppm_gpu_double.h"
 #include "lmptype.h"
 #include "atom.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "force.h"
 #include "pair.h"
 #include "bond.h"
 #include "angle.h"
 #include "domain.h"
 #include "fft3d_wrap.h"
 #include "remap_wrap.h"
 #include "memory.h"
 #include "error.h"
 
 #define grdtyp double
 
 // External functions from cuda library for atom decomposition
 
 grdtyp* pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
 		        const int order, const int nxlo_out, 
 			const int nylo_out, const int nzlo_out,
 			const int nxhi_out, const int nyhi_out,
 			const int nzhi_out, double **rho_coeff,
-			grdtyp **_vd_brick, int &success);
+			grdtyp **_vd_brick, const double slab_volfactor,
+			const int nx_pppm, const int ny_pppm,
+			const int nz_pppm, int &success);
 void pppm_gpu_clear_d(const double poisson_time);
 int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall,
                       double **host_x, int *host_type, bool &success,
                       double *host_q, double *boxlo, const double delxinv,
                       const double delyinv, const double delzinv);
 void pppm_gpu_interp_d(const grdtyp qqrd2e_scale);
 double pppm_gpu_bytes_d();
 
 using namespace LAMMPS_NS;
 
 #define MAXORDER 7
 #define OFFSET 16384
 #define SMALL 0.00001
 #define LARGE 10000.0
 #define EPS_HOC 1.0e-7
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 /* ---------------------------------------------------------------------- */
 
 PPPMGPUDouble::PPPMGPUDouble(LAMMPS *lmp, int narg, char **arg) :
   PPPMGPU<grdtyp>(lmp, narg, arg)
 {
 }
 
 /* ----------------------------------------------------------------------
    free all memory 
 ------------------------------------------------------------------------- */
 
 PPPMGPUDouble::~PPPMGPUDouble()
 {
   pppm_gpu_clear_d(poisson_time);
 }
 
 /* ----------------------------------------------------------------------
    called once before run 
 ------------------------------------------------------------------------- */
 
 void PPPMGPUDouble::init()
 {
   base_init();
 
   if (order>8)
     error->all("Cannot use order greater than 8 with pppm/gpu.");
   pppm_gpu_clear_d(poisson_time);
 
   int success;
   grdtyp *data, *h_brick;
   h_brick = pppm_gpu_init_d(atom->nlocal, atom->nlocal+atom->nghost, screen,
 			    order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
-			    nyhi_out, nzhi_out, rho_coeff, &data, success);
+			    nyhi_out, nzhi_out, rho_coeff, &data,
+			    slab_volfactor, nx_pppm, ny_pppm, nz_pppm, 
+			    success);
 
   int all_success;
   MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world);
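+  // error codes are negative, so MPI_MIN propagates any rank's failure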
   if (all_success != 0) {
     if (all_success == -1)
       error->all("Could not find fix gpu"); 
     else if (all_success == -2)
       error->all("At least one node could not find specified GPU.");
     else if (all_success == -3)
       error->all("Out of memory on GPU.");
     else if (all_success == -4)
       error->all("GPU library not compiled for this GPU.");
     else if (all_success == -5)
       error->all("Double precision is not supported on this GPU.");
     else
       error->all("Unknown error in GPU library.");
   }
 
   density_brick =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
 		     nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1);
   vd_brick =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
 		     nxlo_out,nxhi_out,"pppm:vd_brick",data,4);
 
   poisson_time=0;
 }
 
 /* ----------------------------------------------------------------------
    compute the PPPMGPU long-range force, energy, virial 
 ------------------------------------------------------------------------- */
 
 void PPPMGPUDouble::compute(int eflag, int vflag)
 {
   bool success = true;
   int flag=pppm_gpu_spread_d(neighbor->ago, atom->nlocal, atom->nlocal + 
 			     atom->nghost, atom->x, atom->type, success,
 			     atom->q, domain->boxlo, delxinv, delyinv,
 			     delzinv);
   if (!success)
     error->one("Out of memory on GPGPU");
   if (flag != 0)
     error->one("Out of range atoms - cannot compute PPPM");
 
   int i;
 
   // convert atoms from box to lamda coords
   
   if (triclinic == 0) boxlo = domain->boxlo;
   else {
     boxlo = domain->boxlo_lamda;
     domain->x2lamda(atom->nlocal);
   }
 
   energy = 0.0;
   if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
 
   double t3=MPI_Wtime();
 
   // all procs communicate density values from their ghost cells
   //   to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition
 
   brick2fft();
 
   // compute potential gradient on my FFT grid and
   //   portion of e_long on this proc's FFT grid
   // return gradients (electric fields) in 3d brick decomposition
   
   poisson(eflag,vflag);
 
   // all procs communicate E-field values to fill ghost cells
   //   surrounding their 3d bricks
 
   fillbrick();
 
   poisson_time+=MPI_Wtime()-t3;
 
   // calculate the force on my particles
 
   grdtyp qqrd2e_scale=qqrd2e*scale;
   pppm_gpu_interp_d(qqrd2e_scale);
 
   // sum energy across procs and add in volume-dependent term
 
   if (eflag) {
     double energy_all;
     MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
     energy = energy_all;
    
     energy *= 0.5*volume;
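+    // 1.772453851 = sqrt(PI); subtract the self-energy term and the
+    // uniform background correction for non-neutral systems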
     energy -= g_ewald*qsqsum/1.772453851 +
       0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume);
     energy *= qqrd2e*scale;
   }
 
   // sum virial across procs
 
   if (vflag) {
     double virial_all[6];
     MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
     for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i];
   }
 
   // 2d slab correction
 
   if (slabflag) slabcorr(eflag);
 
   // convert atoms back from lamda to box coords
   
   if (triclinic) domain->lamda2x(atom->nlocal);
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local arrays 
 ------------------------------------------------------------------------- */
 
 double PPPMGPUDouble::memory_usage()
 {
   double bytes = nmax*3 * sizeof(double);
   int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * 
     (nzhi_out-nzlo_out+1);
   bytes += 4 * nbrick * sizeof(grdtyp);
   bytes += 6 * nfft_both * sizeof(double);
   bytes += nfft_both*6 * sizeof(double);
   bytes += 2 * nbuf * sizeof(double);
   return bytes + pppm_gpu_bytes_d();
 }
diff --git a/src/GPU/pppm_gpu_single.cpp b/src/GPU/pppm_gpu_single.cpp
index 693c47f43..b57ec0abb 100644
--- a/src/GPU/pppm_gpu_single.cpp
+++ b/src/GPU/pppm_gpu_single.cpp
@@ -1,227 +1,230 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
 #include "string.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "math.h"
 #include "pppm_gpu_single.h"
 #include "lmptype.h"
 #include "atom.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "force.h"
 #include "pair.h"
 #include "bond.h"
 #include "angle.h"
 #include "domain.h"
 #include "fft3d_wrap.h"
 #include "remap_wrap.h"
 #include "memory.h"
 #include "error.h"
 
 #define grdtyp float
 
 // External functions from cuda library for atom decomposition
 
 grdtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
 		        const int order, const int nxlo_out, 
 			const int nylo_out, const int nzlo_out,
 			const int nxhi_out, const int nyhi_out,
 			const int nzhi_out, double **rho_coeff,
-			grdtyp **_vd_brick, int &success);
+			grdtyp **_vd_brick, const double slab_volfactor,
+			const int nx_pppm, const int ny_pppm,
+			const int nz_pppm, int &success);
 void pppm_gpu_clear_f(const double poisson_time);
 int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
                       double **host_x, int *host_type, bool &success,
                       double *host_q, double *boxlo, const double delxinv,
                       const double delyinv, const double delzinv);
 void pppm_gpu_interp_f(const grdtyp qqrd2e_scale);
 double pppm_gpu_bytes_f();
 
 using namespace LAMMPS_NS;
 
 #define MAXORDER 7
 #define OFFSET 16384
 #define SMALL 0.00001
 #define LARGE 10000.0
 #define EPS_HOC 1.0e-7
 
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
 /* ---------------------------------------------------------------------- */
 
 PPPMGPUSingle::PPPMGPUSingle(LAMMPS *lmp, int narg, char **arg) :
   PPPMGPU<grdtyp>(lmp, narg, arg)
 {
 }
 
 /* ----------------------------------------------------------------------
    free all memory 
 ------------------------------------------------------------------------- */
 
 PPPMGPUSingle::~PPPMGPUSingle()
 {
   pppm_gpu_clear_f(poisson_time);
 }
 
 /* ----------------------------------------------------------------------
    called once before run 
 ------------------------------------------------------------------------- */
 
 void PPPMGPUSingle::init()
 {
   base_init();
 
   if (order>8)
     error->all("Cannot use order greater than 8 with pppm/gpu.");
   pppm_gpu_clear_f(poisson_time);
 
   int success;
   grdtyp *data, *h_brick;
   h_brick = pppm_gpu_init_f(atom->nlocal, atom->nlocal+atom->nghost, screen,
 			    order, nxlo_out, nylo_out, nzlo_out, nxhi_out,
-			    nyhi_out, nzhi_out, rho_coeff, &data, success);
+			    nyhi_out, nzhi_out, rho_coeff, &data, 
+			    slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
 
   int all_success;
   MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world);
   if (all_success != 0) {
     if (all_success == -1)
       error->all("Could not find fix gpu"); 
     else if (all_success == -2)
       error->all("At least one node could not find specified GPU.");
     else if (all_success == -3)
       error->all("Out of memory on GPU.");
     else if (all_success == -4)
       error->all("GPU library not compiled for this GPU.");
     else if (all_success == -5)
       error->all("Double precision is not supported on this GPU.");
     else
       error->all("Unknown error in GPU library.");
   }
 
   density_brick =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
 		     nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1);
   vd_brick =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
 		     nxlo_out,nxhi_out,"pppm:vd_brick",data,4);
 
   poisson_time=0;
 }
 
 /* ----------------------------------------------------------------------
    compute the PPPMGPU long-range force, energy, virial 
 ------------------------------------------------------------------------- */
 
 void PPPMGPUSingle::compute(int eflag, int vflag)
 {
   bool success = true;
   int flag=pppm_gpu_spread_f(neighbor->ago, atom->nlocal, atom->nlocal + 
 			     atom->nghost, atom->x, atom->type, success,
 			     atom->q, domain->boxlo, delxinv, delyinv,
 			     delzinv);
   if (!success)
     error->one("Out of memory on GPGPU");
   if (flag != 0)
     error->one("Out of range atoms - cannot compute PPPM");
 
   int i;
 
   // convert atoms from box to lamda coords
   
   if (triclinic == 0) boxlo = domain->boxlo;
   else {
     boxlo = domain->boxlo_lamda;
     domain->x2lamda(atom->nlocal);
   }
 
   energy = 0.0;
   if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
 
   double t3=MPI_Wtime();
 
   // all procs communicate density values from their ghost cells
   //   to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition
 
   brick2fft();
 
   // compute potential gradient on my FFT grid and
   //   portion of e_long on this proc's FFT grid
   // return gradients (electric fields) in 3d brick decomposition
   
   poisson(eflag,vflag);
 
   // all procs communicate E-field values to fill ghost cells
   //   surrounding their 3d bricks
 
   fillbrick();
 
   poisson_time+=MPI_Wtime()-t3;
 
   // calculate the force on my particles
 
   grdtyp qqrd2e_scale=qqrd2e*scale;
   pppm_gpu_interp_f(qqrd2e_scale);
 
   // sum energy across procs and add in volume-dependent term
 
   if (eflag) {
     double energy_all;
     MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
     energy = energy_all;
    
     energy *= 0.5*volume;
     energy -= g_ewald*qsqsum/1.772453851 +
       0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume);
     energy *= qqrd2e*scale;
   }
 
   // sum virial across procs
 
   if (vflag) {
     double virial_all[6];
     MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
     for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i];
   }
 
   // 2d slab correction
 
   if (slabflag) slabcorr(eflag);
 
   // convert atoms back from lamda to box coords
   
   if (triclinic) domain->lamda2x(atom->nlocal);
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local arrays 
 ------------------------------------------------------------------------- */
 
 double PPPMGPUSingle::memory_usage()
 {
   double bytes = nmax*3 * sizeof(double);
   int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * 
     (nzhi_out-nzlo_out+1);
   bytes += 4 * nbrick * sizeof(grdtyp);
   bytes += 6 * nfft_both * sizeof(double);
   bytes += nfft_both*6 * sizeof(double);
   bytes += 2 * nbuf * sizeof(double);
   return bytes + pppm_gpu_bytes_f();
 }