diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index cc74d2ebd..3c859ffdd 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -1,249 +1,254 @@
 # /* ----------------------------------------------------------------------   
 #    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator       
 #    http://lammps.sandia.gov, Sandia National Laboratories                   
 #    Steve Plimpton, sjplimp@sandia.gov                                       
 #                                                                             
 #    Copyright (2003) Sandia Corporation.  Under the terms of Contract        
 #    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains   
 #    certain rights in this software.  This software is distributed under      
 #    the GNU General Public License.                                          
 #                                                                             
 #    See the README file in the top-level LAMMPS directory.                   
 # ------------------------------------------------------------------------- */
 #                                                                             
 # /* ----------------------------------------------------------------------   
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov               
 #                          Peng Wang (Nvidia), penwang@nvidia.com             
 #                          Paul Crozier (SNL), pscrozi@sandia.gov             
 # ------------------------------------------------------------------------- */
 
 CUDA  = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
              $(CUDA_PRECISION)
 CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
         -Icudpp_mini
 CUDA_LINK = $(CUDA_LIB) -lcudart
 
 GPU_LIB = $(LIB_DIR)/libgpu.a
 
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
 NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H) 
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
        $(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
        $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
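+# Each kernel above is compiled to PTX and then converted into a C string
+# header by geryon/file_to_cstr.sh so it can be embedded in the library.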
 
 all: $(GPU_LIB) $(EXECS)
 
 $(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini
 
 $(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini
 
 $(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini
 
 $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
 
 $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
 
 $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
 
 $(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h
 
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h
 
 $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu
 
 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
 
 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_device.cpp
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp
 
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp
 
 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
 
 $(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu
 
 $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu
 
 $(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h
 
 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
 	$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h
 
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h
 
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h
 
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
 
 $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h
 
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
 
 $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu
 
 $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
 
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx: cmmc_msm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_msm_gpu_kernel.cu
 
 $(OBJ_DIR)/cmmc_msm_gpu_ptx.h: $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
 
 $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_memory.cpp $(OBJ_DIR)/cmmc_msm_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_msm_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK) 
 
 $(GPU_LIB): $(OBJS)
 	$(AR) -crusv $(GPU_LIB) $(OBJS)
 
 clean:
 	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 829add735..e488a56bc 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -1,169 +1,171 @@
 # /* ----------------------------------------------------------------------   
 #    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator       
 #    http://lammps.sandia.gov, Sandia National Laboratories                   
 #    Steve Plimpton, sjplimp@sandia.gov                                       
 #                                                                             
 #    Copyright (2003) Sandia Corporation.  Under the terms of Contract        
 #    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains   
 #    certain rights in this software.  This software is distributed under      
 #    the GNU General Public License.                                          
 #                                                                             
 #    See the README file in the top-level LAMMPS directory.                   
 # ------------------------------------------------------------------------- */
 #                                                                             
 # /* ----------------------------------------------------------------------   
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov               
 #                          Peng Wang (Nvidia), penwang@nvidia.com             
 #                          Paul Crozier (SNL), pscrozi@sandia.gov             
 # ------------------------------------------------------------------------- */
 
 OCL  = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
 OCL_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
-       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o 
 KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
        $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
        $(OBJ_DIR)/crml_gpu_cl.h \
        $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h 
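+# The *_cl.h headers embed the OpenCL kernel source (.cu files) as C
+# strings, generated by geryon/file_to_cstr.sh at build time.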
-       
+
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
 all: $(OCL_LIB) $(EXECS)
 
 $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h
 
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
+	$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
 
 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
 	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
 	$(OCL) -o $@ -c pair_gpu_device.cpp
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp
 
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp
 
 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
 
 $(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
 	cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
 	cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
 	rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar
 
 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
 	$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;
 
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;
 
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;
 
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;
 
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;
 
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
 $(OCL_LIB): $(OBJS)
 	$(AR) -crusv $(OCL_LIB) $(OBJS)
 
 opencl: $(OCL_EXECS)
 
 clean:
 	rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
 
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_ans.cpp
similarity index 52%
copy from lib/gpu/pair_gpu_atom.cpp
copy to lib/gpu/pair_gpu_ans.cpp
index 46c9066e5..e6982e6eb 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -1,605 +1,409 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
  
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
-#include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 
-#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
-
-#ifdef WINDLL
-#include <windows.h>
-typedef bool (*__win_sort_alloc)(const int max_atoms);
-typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
-                           int *particle_begin);
-__win_sort_alloc _win_sort_alloc;
-__win_sort _win_sort;
-#endif
+#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
 
 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL), 
-                              _newton(false) {
-  #ifndef USE_OPENCL
-  sort_config.op = CUDPP_ADD;
-  sort_config.datatype = CUDPP_UINT;
-  sort_config.algorithm = CUDPP_SORT_RADIX;
-  sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
-
-  #ifdef WINDLL
-  HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
-  if (hinstLib == NULL) {
-    printf("\nUnable to load gpu.dll\n");
-    exit(1);
-  }
-  _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
-  _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
-  #endif
-
-  #endif
+PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
+                            _inum(0),_ilist(NULL),_newton(false) {
 }
 
 template <class numtyp, class acctyp>
-int PairGPUAtomT::bytes_per_atom() const { 
-  int id_space=0;
-  if (_gpu_nbor)
-    id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+int PairGPUAnsT::bytes_per_atom() const { 
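+  // 4 force slots plus 7 energy/virial fields (1 energy + 6 virial);
+  // charge adds one energy field and torque adds 4 more slots.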
+  int bytes=11*sizeof(acctyp);
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(acctyp);
   if (_charge)
-    bytes+=sizeof(numtyp);
+    bytes+=sizeof(acctyp);
   return bytes;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
-  _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
+bool PairGPUAnsT::alloc(const int inum) {
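+  // Keep 10% headroom so resize() only reallocates when the local atom
+  // count grows past this margin.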
+  _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
 
   bool success=true;
   
   int ans_elements=4;
   if (_rot)
     ans_elements+=4;
   
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
     cpuview=true;
     
-  // Allocate storage for CUDPP sort
-  #ifndef USE_OPENCL
-  #ifdef WINDLL
-  _win_sort_alloc(_max_atoms);
-  #else
-  if (_gpu_nbor) {
-    CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);  
-    if (CUDPP_SUCCESS != result)
-      return false;
-  }
-  #endif
-  #endif
-
   // --------------------------   Host allocations
-  // Get a host write only buffer
-  #ifdef GPU_CAST
-  success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  success=success && (host_type_cast.alloc(_max_atoms,*dev,
-                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #else
-  success=success && (host_x.alloc(_max_atoms*4,*dev,
-                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #endif                      
   success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
   success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
-  // Buffer for casting only if different precisions
-  if (_charge)
-    success=success && (host_q.alloc(_max_atoms,*dev,
-                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  // Buffer for casting only if different precisions
-  if (_rot)
-    success=success && (host_quat.alloc(_max_atoms*4,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-
     
   // ---------------------------  Device allocations
-  _gpu_bytes=0;
   if (cpuview) {
-    #ifdef GPU_CAST
-    assert(0==1);
-    #else
-    dev_x.view(host_x);
-    #endif
     dev_engv.view(host_engv);
     dev_ans.view(host_ans);
-    if (_rot)
-      dev_quat.view(host_quat);
-    if (_charge)
-      dev_q.view(host_q);
   } else {
-    #ifdef GPU_CAST
-    success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
-    success=success && (UCL_SUCCESS==
-                        dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
-    success=success && (UCL_SUCCESS==
-                        dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
-    #else
-    success=success && (UCL_SUCCESS==
-                        dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
-    #endif
     success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
                                        UCL_WRITE_ONLY)==UCL_SUCCESS);
     success=success && (dev_ans.alloc(ans_elements*_max_local,
                                       *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    if (_charge) {
-      success=success && (dev_q.alloc(_max_atoms,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_q.row_bytes();
-    }
-    if (_rot) {
-      success=success && (dev_quat.alloc(_max_atoms*4,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_quat.row_bytes();
-    }
   }
-  if (_gpu_nbor) {
-    success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
-    if (_bonds) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-      _gpu_bytes+=dev_tag.row_bytes();
-    }
-  }
-
-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
   
   _allocated=true;  
   return success;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
-                        const bool bonds) {
+bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
+                       UCL_Device &devi) {
   clear();
 
   bool success=true;
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
-  _gpu_nbor=gpu_nbor;
-  _bonds=bonds;
   _charge=charge;
   _rot=rot;
   _other=_charge || _rot;
   dev=&devi;
 
   _e_fields=1;
   if (_charge)
     _e_fields++;
   _ev_fields=6+_e_fields;
     
   // Initialize atom and nbor data
   int ef_inum=inum;
   if (ef_inum==0)
     ef_inum=1000;
-  int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
   
   // Initialize timers for the selected device
-  time_pos.init(*dev);
-  time_other.init(*dev);
   time_answer.init(*dev);
-  time_pos.zero();
-  time_other.zero();
   time_answer.zero();
   _time_cast=0.0;
   
-  #ifdef GPU_CAST
-  compile_kernels(*dev);
-  #endif
-  
-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_inum);
 }
   
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
+bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
   bool realloc=false;
   if (charge && _charge==false) {
     _charge=true;
     _e_fields++;
+    _ev_fields++;
     realloc=true;
   }
   if (rot && _rot==false) {
     _rot=true;
     realloc=true;
   }
   if (realloc) {
     _other=_charge || _rot;
     int inum=_max_local;
-    int nall=_max_atoms;
     clear_resize();
-    return alloc(inum,nall);
+    return alloc(inum);
   }
   return true;
 }
 
 template <class numtyp, class acctyp>
-void PairGPUAtomT::clear_resize() {
+void PairGPUAnsT::clear_resize() {
   if (!_allocated)
     return;
   _allocated=false;
 
-  dev_x.clear();
-  if (_charge) { 
-    dev_q.clear();
-    host_q.clear();
-  }
-  if (_rot) {
-    dev_quat.clear();
-    host_quat.clear();
-  }
   dev_ans.clear();
   dev_engv.clear();
-  #ifndef GPU_CAST
-  host_x.clear();
-  #else
-  host_x_cast.clear();
-  host_type_cast.clear();
-  #endif
   host_ans.clear();
   host_engv.clear();
-  dev_cell_id.clear();
-  dev_particle_id.clear();
-  dev_tag.clear();
-  #ifdef GPU_CAST
-  dev_x_cast.clear();
-  dev_type_cast.clear();
-  #endif
-
-  #ifndef USE_OPENCL
-  #ifndef WINDLL
-  if (_gpu_nbor) cudppDestroyPlan(sort_plan);
-  #endif
-  #endif
 }
 
 template <class numtyp, class acctyp>
-void PairGPUAtomT::clear() {
+void PairGPUAnsT::clear() {
   _gpu_bytes=0;
   if (!_allocated)
     return;
 
-  time_pos.clear();
-  time_other.clear();
   time_answer.clear();
   clear_resize();
   _inum=0;
   _eflag=false;
   _vflag=false;
-
-  #ifdef GPU_CAST
-  if (_compiled) {
-    k_cast_x.clear();
-    delete atom_program;
-    _compiled=false;
-  }
-  #endif
 }
 
 template <class numtyp, class acctyp>
-double PairGPUAtomT::host_memory_usage() const {
+double PairGPUAnsT::host_memory_usage() const {
   int atom_bytes=4;
   if (_charge) 
     atom_bytes+=1;
   if (_rot) 
     atom_bytes+=4;
   int ans_bytes=atom_bytes+_ev_fields;
-  return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
-         sizeof(PairGPUAtom<numtyp,acctyp>);
+  return ans_bytes*(_max_local)*sizeof(acctyp)+
+         sizeof(PairGPUAns<numtyp,acctyp>);
 }
   
 template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom) {
   time_answer.start();
   _eflag=eflag;
   _vflag=vflag;
   _ef_atom=ef_atom;
   _vf_atom=vf_atom;
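+  // engv holds csize fields packed contiguously with stride _inum
+  // (energies first when present, then the six virial terms).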
     
   int csize=_ev_fields;    
   if (!eflag)
     csize-=_e_fields;
   if (!vflag)
     csize-=6;
       
   if (csize>0)
     ucl_copy(host_engv,dev_engv,_inum*csize,true);
   if (_rot)
     ucl_copy(host_ans,dev_ans,_inum*4*2,true);
   else
     ucl_copy(host_ans,dev_ans,_inum*4,true);
   time_answer.stop();
+  dev->add_answer_object(this);
 }
 
 template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom,
+                               int *ilist) {
   _ilist=ilist;
   copy_answers(eflag,vflag,ef_atom,vf_atom);
 }
 
 template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial) {
   if (_eflag==false && _vflag==false)
     return 0.0;
 
   double evdwl=0.0;
-  if (_gpu_nbor) {
+  if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[i][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   } else {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[ii][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   }
   
   evdwl*=0.5;
   return evdwl;
 }
 
 template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
+                                  double *virial, double &ecoul) {
   if (_eflag==false && _vflag==false) {
     ecoul=0.0;
     return 0.0;
   }
 
   if (_charge==false)
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
   double _ecoul=0.0;
-  if (_gpu_nbor) {
+  if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
           _ecoul+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
           _ecoul+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[i][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   } else {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
           _ecoul+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
           _ecoul+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[ii][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   }
   
   evdwl*=0.5;
   ecoul+=_ecoul*0.5;
   return evdwl;
 }
 
 template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
+void PairGPUAnsT::get_answers(double **f, double **tor) {
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
   acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
+  if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
       f[i][0]+=*ap;
       ap++;
       f[i][1]+=*ap;
       ap++;
       f[i][2]+=*ap;
       ap+=2;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         tor[i][0]+=*ap;
         ap++;
         tor[i][1]+=*ap;
         ap++;
         tor[i][2]+=*ap;
         ap+=2;
       }
     }
   } else {
     for (int i=0; i<_inum; i++) {
       int ii=_ilist[i];
       f[ii][0]+=*ap;
       ap++;
       f[ii][1]+=*ap;
       ap++;
       f[ii][2]+=*ap;
       ap+=2;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         int ii=_ilist[i];
         tor[ii][0]+=*ap;
         ap++;
         tor[ii][1]+=*ap;
         ap++;
         tor[ii][2]+=*ap;
         ap+=2;
       }
     }
   }
 }
 
-// Sort arrays for neighbor list calculation
-template <class numtyp, class acctyp>
-void PairGPUAtomT::sort_neighbor(const int num_atoms) {
-  #ifndef USE_OPENCL
-  #ifdef WINDLL
-  _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
-            (int *)dev_particle_id.begin());
-  #else
-  CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), 
-                                 (int *)dev_particle_id.begin(), 
-                                 8*sizeof(unsigned), num_atoms);
-  if (CUDPP_SUCCESS != result) {
-    printf("Error in cudppSort\n");
-    NVD_GERYON_EXIT;
-  }
-  #endif
-  #endif
-}
-
-#ifdef GPU_CAST
-#ifdef USE_OPENCL
-#include "pair_gpu_atom_cl.h"
-#else
-#include "pair_gpu_atom_ptx.h"
-#endif
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
-  atom_program=new UCL_Program(dev);
-  atom_program->load_string(pair_gpu_atom_kernel,"");
-  k_cast_x.set_function(*atom_program,"kernel_cast_x");
-  _compiled=true;
-}
-
-#endif
-
-template class PairGPUAtom<PRECISION,ACC_PRECISION>;
+template class PairGPUAns<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h
new file mode 100644
index 000000000..a93ed6fcd
--- /dev/null
+++ b/lib/gpu/pair_gpu_ans.h
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_ANS_H
+#define PAIR_GPU_ANS_H
+
+#include <math.h>
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+
+#include "geryon/ocl_timer.h"
+#include "geryon/ocl_mat.h"
+using namespace ucl_opencl;
+
+#else
+
+#include "cudpp.h"
+#include "geryon/nvd_timer.h"
+#include "geryon/nvd_mat.h"
+using namespace ucl_cudadr;
+
+#endif
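+
+// Geryon exposes the same UCL_* container and timer API from both
+// namespaces, so the class below is backend-agnostic.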
+
+#include "pair_gpu_precision.h"
+
+template <class numtyp, class acctyp>
+class PairGPUAns {
+ public:
+  PairGPUAns();
+  ~PairGPUAns() { clear(); }
+
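+  // Typical per-timestep use by a pair style (sketch; the caller owns
+  // the device, flags, and LAMMPS arrays):
+  //   ans.init(inum,charge,rot,gpu);                  // once per run
+  //   ans.resize(inum,success);                       // on neighbor rebuild
+  //   // ... kernels fill dev_ans (forces/torques), dev_engv (energy/virial)
+  //   ans.copy_answers(eflag,vflag,ef_atom,vf_atom,ilist);
+  //   evdwl=ans.energy_virial(eatom,vatom,virial);
+  //   ans.get_answers(f,tor);
+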
+  /// Current number of local atoms stored
+  inline int inum() const { return _inum; }
+  /// Set number of local atoms for future copy operations
+  inline void inum(const int n) { _inum=n; }
+  
+  /// Memory usage per atom in this class
+  int bytes_per_atom() const; 
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param inum Initial number of local atoms to reserve storage for
+    * \param charge True if answers include a Coulombic energy field
+    * \param rot True if answers include torques **/
+  bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
+  
+  /// Check if we have enough device storage and realloc if not
+  inline bool resize(const int inum, bool &success) {
+    _inum=inum;
+    if (inum>_max_local) {
+      clear_resize();
+      success = success && alloc(inum);
+      return true;
+    }
+    return false;
+  }
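+  // Returns true only when buffers were reallocated; _inum is updated
+  // in either case.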
+  
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param charge True if answers must now include a Coulombic energy field
+    * \param rot True if answers must now include torques **/
+  bool add_fields(const bool charge, const bool rot);
+  
+  /// Free all memory on host and device needed to realloc for more atoms
+  void clear_resize();
+
+  /// Free all memory on host and device
+  void clear();
+ 
+  /// Return the total amount of host memory used by class in bytes
+  double host_memory_usage() const;
+
+  /// Add copy times to timers
+  inline void acc_timers() {
+    time_answer.add_to_total();
+  }
+
+  /// Zero the timers
+  inline void zero_timers() {
+    time_answer.zero();
+  }
+
+  /// Return the total time for host/device data transfer
+  inline double transfer_time() {
+    return time_answer.total_seconds();
+  }
+  
+  /// Return the total time for data cast/pack
+  inline double cast_time() { return _time_cast; }
+  
+  /// Return number of bytes used on device
+  inline double gpu_bytes() { return _gpu_bytes; } 
+
+  // ------------------------- COPY FROM GPU -------------------------------
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom);
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom, int *ilist);
+  
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial,
+                       double &ecoul);
+
+  /// Add forces and torques from the GPU into a LAMMPS pointer
+  void get_answers(double **f, double **tor);
+
+  // ------------------------------ DATA ----------------------------------
+
+  /// Force and possibly torque
+  UCL_D_Vec<acctyp> dev_ans;
+  /// Energy and virial per-atom storage
+  UCL_D_Vec<acctyp> dev_engv;
+  
+  /// Force and possibly torque data on host
+  UCL_H_Vec<acctyp> host_ans;
+  /// Energy/virial data on host
+  UCL_H_Vec<acctyp> host_engv;
+  
+  /// Device timers
+  UCL_Timer time_answer;
+  
+  /// Geryon device
+  UCL_Device *dev;
+
+ private:
+  bool alloc(const int inum);
+  
+  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
+  int _max_local, _inum, _e_fields, _ev_fields;
+  int *_ilist;
+  double _time_cast;
+  
+  double _gpu_bytes;
+  
+  bool _newton;
+};
+
+#endif
+
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 46c9066e5..812a7c82d 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -1,605 +1,319 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
  
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #include "pair_gpu_atom.h"
 
 #define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
 
 #ifdef WINDLL
 #include <windows.h>
 typedef bool (*__win_sort_alloc)(const int max_atoms);
 typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
                            int *particle_begin);
 __win_sort_alloc _win_sort_alloc;
 __win_sort _win_sort;
 #endif
 
 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL), 
-                              _newton(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
   #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
   sort_config.algorithm = CUDPP_SORT_RADIX;
   sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
 
   #ifdef WINDLL
   HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
   if (hinstLib == NULL) {
     printf("\nUnable to load gpu.dll\n");
     exit(1);
   }
   _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
   _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
   #endif
 
   #endif
 }
 
 template <class numtyp, class acctyp>
 int PairGPUAtomT::bytes_per_atom() const { 
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+  int bytes=4*sizeof(numtyp)+id_space;
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
+bool PairGPUAtomT::alloc(const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
 
   bool success=true;
   
-  int ans_elements=4;
-  if (_rot)
-    ans_elements+=4;
-  
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
     cpuview=true;
     
   // Allocate storage for CUDPP sort
   #ifndef USE_OPENCL
   #ifdef WINDLL
   _win_sort_alloc(_max_atoms);
   #else
   if (_gpu_nbor) {
     CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);  
     if (CUDPP_SUCCESS != result)
       return false;
   }
   #endif
   #endif
 
   // --------------------------   Host allocations
   // Get a host write only buffer
   #ifdef GPU_CAST
   success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   success=success && (host_type_cast.alloc(_max_atoms,*dev,
                                            UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #else
   success=success && (host_x.alloc(_max_atoms*4,*dev,
                       UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #endif                      
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_rot)
     success=success && (host_quat.alloc(_max_atoms*4,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
 
     
   // ---------------------------  Device allocations
   _gpu_bytes=0;
   if (cpuview) {
     #ifdef GPU_CAST
     assert(0==1);
     #else
     dev_x.view(host_x);
     #endif
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
     if (_rot)
       dev_quat.view(host_quat);
     if (_charge)
       dev_q.view(host_q);
   } else {
     #ifdef GPU_CAST
     success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
     success=success && (UCL_SUCCESS==
                         dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
     success=success && (UCL_SUCCESS==
                         dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
     _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
     #else
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
     #endif
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
       _gpu_bytes+=dev_q.row_bytes();
     }
     if (_rot) {
       success=success && (dev_quat.alloc(_max_atoms*4,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
       _gpu_bytes+=dev_quat.row_bytes();
     }
   }
   if (_gpu_nbor) {
     success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
     if (_bonds) {
       success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       _gpu_bytes+=dev_tag.row_bytes();
     }
   }
 
-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  _gpu_bytes+=dev_x.row_bytes();
   
   _allocated=true;  
   return success;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
+bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
+                        UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();
 
   bool success=true;
   _x_avail=false;
   _q_avail=false;
   _quat_avail=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
   _rot=rot;
   _other=_charge || _rot;
   dev=&devi;
 
-  _e_fields=1;
-  if (_charge)
-    _e_fields++;
-  _ev_fields=6+_e_fields;
-    
   // Initialize atom and nbor data
-  int ef_inum=inum;
-  if (ef_inum==0)
-    ef_inum=1000;
   int ef_nall=nall;
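+  // Fall back to a default initial size when no atom count is available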
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
+  if (ef_nall==0)
+    ef_nall=2000;
   
   // Initialize timers for the selected device
   time_pos.init(*dev);
   time_other.init(*dev);
-  time_answer.init(*dev);
   time_pos.zero();
   time_other.zero();
-  time_answer.zero();
   _time_cast=0.0;
   
   #ifdef GPU_CAST
   compile_kernels(*dev);
   #endif
   
-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_nall);
 }
   
 template <class numtyp, class acctyp>
 bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
   bool realloc=false;
   if (charge && _charge==false) {
     _charge=true;
-    _e_fields++;
     realloc=true;
   }
   if (rot && _rot==false) {
     _rot=true;
     realloc=true;
   }
   if (realloc) {
     _other=_charge || _rot;
-    int inum=_max_local;
-    int nall=_max_atoms;
+    int max_atoms=_max_atoms;
     clear_resize();
-    return alloc(inum,nall);
+    return alloc(max_atoms);
   }
   return true;
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear_resize() {
   if (!_allocated)
     return;
   _allocated=false;
 
   dev_x.clear();
   if (_charge) { 
     dev_q.clear();
     host_q.clear();
   }
   if (_rot) {
     dev_quat.clear();
     host_quat.clear();
   }
-  dev_ans.clear();
-  dev_engv.clear();
   #ifndef GPU_CAST
   host_x.clear();
   #else
   host_x_cast.clear();
   host_type_cast.clear();
   #endif
-  host_ans.clear();
-  host_engv.clear();
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
   #ifdef GPU_CAST
   dev_x_cast.clear();
   dev_type_cast.clear();
   #endif
 
   #ifndef USE_OPENCL
   #ifndef WINDLL
   if (_gpu_nbor) cudppDestroyPlan(sort_plan);
   #endif
   #endif
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
   _gpu_bytes=0;
   if (!_allocated)
     return;
 
   time_pos.clear();
   time_other.clear();
-  time_answer.clear();
   clear_resize();
-  _inum=0;
-  _eflag=false;
-  _vflag=false;
 
   #ifdef GPU_CAST
   if (_compiled) {
     k_cast_x.clear();
     delete atom_program;
     _compiled=false;
   }
   #endif
 }
 
 template <class numtyp, class acctyp>
 double PairGPUAtomT::host_memory_usage() const {
   int atom_bytes=4;
   if (_charge) 
     atom_bytes+=1;
   if (_rot) 
     atom_bytes+=4;
-  int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }
   
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
-  time_answer.start();
-  _eflag=eflag;
-  _vflag=vflag;
-  _ef_atom=ef_atom;
-  _vf_atom=vf_atom;
-    
-  int csize=_ev_fields;    
-  if (!eflag)
-    csize-=_e_fields;
-  if (!vflag)
-    csize-=6;
-      
-  if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
-  if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
-  else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
-  time_answer.stop();
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
-  _ilist=ilist;
-  copy_answers(eflag,vflag,ef_atom,vf_atom);
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
-  if (_eflag==false && _vflag==false)
-    return 0.0;
-
-  double evdwl=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-  
-  evdwl*=0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
-    return 0.0;
-  }
-
-  if (_charge==false)
-    return energy_virial(eatom,vatom,virial);
-
-  double evdwl=0.0;
-  double _ecoul=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-  
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
-  acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
-      }
-    }
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
-      }
-    }
-  }
-}
-
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
   #ifndef USE_OPENCL
   #ifdef WINDLL
   _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
             (int *)dev_particle_id.begin());
   #else
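   // Key-value radix sort: cell ids are the keys, particle ids are reordered alongside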
   CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), 
                                  (int *)dev_particle_id.begin(), 
                                  8*sizeof(unsigned), num_atoms);
   if (CUDPP_SUCCESS != result) {
     printf("Error in cudppSort\n");
     NVD_GERYON_EXIT;
   }
   #endif
   #endif
 }
 
 #ifdef GPU_CAST
 #ifdef USE_OPENCL
 #include "pair_gpu_atom_cl.h"
 #else
 #include "pair_gpu_atom_ptx.h"
 #endif
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
   atom_program=new UCL_Program(dev);
   atom_program->load_string(pair_gpu_atom_kernel,"");
   k_cast_x.set_function(*atom_program,"kernel_cast_x");
   _compiled=true;
 }
 
 #endif
 
 template class PairGPUAtom<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index 562ca0846..c4c6b1586 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -1,447 +1,404 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifndef PAIR_GPU_ATOM_H
 #define PAIR_GPU_ATOM_H
 
 #include <math.h>
 #include "mpi.h"
 
 #ifdef USE_OPENCL
 
-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
 using namespace ucl_opencl;
 
 #else
 
 #include "cudpp.h"
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
 using namespace ucl_cudadr;
 
 #endif
 
-#ifndef int2
-struct int2 { int x; int y; };
-#endif
-
 #include "pair_gpu_precision.h"
 
 template <class numtyp, class acctyp>
 class PairGPUAtom {
  public:
   PairGPUAtom();
   ~PairGPUAtom() { clear(); }
 
   /// Maximum number of atoms that can be stored with current allocation
   inline int max_atoms() const { return _max_atoms; }
   /// Current number of local+ghost atoms stored
   inline int nall() const { return _nall; }
-  /// Current number of local atoms stored
-  inline int inum() const { return _inum; }
 
   /// Set number of local+ghost atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
-  /// Set number of local atoms for future copy operations
-  inline void inum(const int n) { _inum=n; }
   
   /// Memory usage per atom in this class
   int bytes_per_atom() const; 
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param nall Initial number of local+ghost atoms to allocate storage for
     * \param charge True if atom storage needs charges
     * \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
-  bool init(const int inum, const int nall, const bool charge, const bool rot, 
+  bool init(const int nall, const bool charge, const bool rot, 
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
   
   /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, const int nall, bool &success) {
-    _inum=inum;
+  inline bool resize(const int nall, bool &success) {
     _nall=nall;
-    if (inum>_max_local || nall>_max_atoms) {
+    if (nall>_max_atoms) {
       clear_resize();
-      success = success && alloc(inum,nall);
+      success = success && alloc(nall);
       return true;
     }
     return false;
   }
   
   /// If already initialized by another LAMMPS style, add fields as necessary
   /** \param charge True if atom storage needs charges
     * \param rot True if atom storage needs quaternions **/
   bool add_fields(const bool charge, const bool rot);
   
-  /// True if charge data is available for kernels
-  bool charge_avail() const { return _charge; }
-
   /// Only free matrices of length nall for resizing
   void clear_resize();
   
   /// Free all memory on host and device
   void clear();
  
   /// Return the total amount of host memory used by class in bytes
   double host_memory_usage() const;
 
   /// Sort arrays for neighbor list calculation on device
   void sort_neighbor(const int num_atoms);
   
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    time_answer.add_to_total();
     if (_other)
       time_other.add_to_total();
   }
 
   /// Zero the copy timers
   inline void zero_timers() {
     time_pos.zero();
-    time_answer.zero();
     if (_other)
       time_other.zero();
   }
 
   /// Return the total time for host/device data transfer
   inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
+    double total=time_pos.total_seconds();
     if (_other) total+=time_other.total_seconds();
     return total;
   }
   
   /// Return the total time for data cast/pack
   inline double cast_time() { return _time_cast; }
 
   /// Pack LAMMPS atom type constants into a matrix and copy to device
   template <class dev_typ, class t1>
   inline void type_pack1(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii]=static_cast<numtyp>(one[i][j]);
         ii++;
       }
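       // device rows are padded to m_size entries; skip the unused tail
       // (the pack variants below use the same layout)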
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   /// Pack LAMMPS atom type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void type_pack2(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one, t2 **two) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*2]=static_cast<numtyp>(one[i][j]);
         buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3>
   inline void type_pack4(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one, t2 **two, t3 **three) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*4]=static_cast<numtyp>(one[i][j]);
         buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
         buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
   
   /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3, class t4>
   inline void type_pack4(const int n, const int m_size,
 			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
 			 t1 **one, t2 **two, t3 **three, t4 **four) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*4]=static_cast<numtyp>(one[i][j]);
         buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
         buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
         buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v, 
                          UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
     for (int i=0; i<n; i++) {
       buffer[i*2]=static_cast<numtyp>(one[i][i]);
       buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),n,*dev);
     ucl_copy(dev_v,view,false);
   }
 
   // -------------------------COPY TO GPU ----------------------------------
 
+  /// Signal that we need to transfer atom data for the next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; }
+
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
     if (_x_avail==false) {
       double t=MPI_Wtime();
       #ifdef GPU_CAST
       memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
       memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
       #else
       numtyp *_write_loc=host_x.begin();
       for (int i=0; i<_nall; i++) {
         *_write_loc=host_ptr[i][0];
         _write_loc++;
         *_write_loc=host_ptr[i][1];
         _write_loc++;
         *_write_loc=host_ptr[i][2];
         _write_loc++;
         *_write_loc=host_type[i];
         _write_loc++;
       }
       #endif
       _time_cast+=MPI_Wtime()-t;
     }
   }
 
   /// Copy positions and types to device asynchronously
   /** Copies positions/types for nall() atoms **/
   inline void add_x_data(double **host_ptr, int *host_type) { 
     time_pos.start();
     if (_x_avail==false) {
       #ifdef GPU_CAST
       ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
       ucl_copy(dev_type_cast,host_type_cast,_nall,true);
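       // launch the cast kernel with one thread per atom, rounded up to whole blocks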
       int block_size=64;
       int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
       k_cast_x.set_size(GX,block_size);
       k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
                    &_nall);
       #else
       ucl_copy(dev_x,host_x,_nall*4,true);
       #endif
       _x_avail=true;
     }
     time_pos.stop();
   }
 
   /// Calls cast_x_data and add_x_data and times the routines
   inline void cast_copy_x(double **host_ptr, int *host_type) {
     cast_x_data(host_ptr,host_type);
     add_x_data(host_ptr,host_type);
   }
 
   /// Cast charges to write buffer
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
     if (_q_avail==false) {
       double t=MPI_Wtime();
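       // on a CPU "device" with matching precision, view the host array in place instead of copying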
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
           host_q.view((numtyp*)host_ptr,_nall,*dev);
           dev_q.view(host_q);
         } else
           for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
       } else {
         if (sizeof(numtyp)==sizeof(double))
           memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
         else
           for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
       }
       _time_cast+=MPI_Wtime()-t;
     }
   }
 
   /// Copy charges to device asynchronously
   inline void add_q_data() {
     if (_q_avail==false) {
       ucl_copy(dev_q,host_q,_nall,true);
       _q_avail=true;
     }
   }
 
   /// Cast quaternions to write buffer
   template<class cpytyp>
   inline void cast_quat_data(cpytyp *host_ptr) {
     if (_quat_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
           host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
           dev_quat.view(host_quat);
         } else
           for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
       } else {
         if (sizeof(numtyp)==sizeof(double))
           memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
         else
           for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
       }
       _time_cast+=MPI_Wtime()-t;
     }
   }
 
   /// Copy quaternions to device
   /** Copies nall()*4 elements **/
   inline void add_quat_data() {
     if (_quat_avail==false) {
       ucl_copy(dev_quat,host_quat,_nall*4,true);
       _quat_avail=true;
     }
   }
 
   /// Copy data other than pos and type to device
   inline void add_other_data() {
     if (_other) {
       time_other.start();
       if (_charge)
         add_q_data();
       if (_rot)
         add_quat_data();
       time_other.stop();
     }
   }
   
   /// Return number of bytes used on device
   inline double gpu_bytes() { return _gpu_bytes; } 
 
-  // -------------------------COPY FROM GPU -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-  
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
-
   // ------------------------------ DATA ----------------------------------
 
   /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
   UCL_D_Vec<numtyp> dev_x;
   /// Charges
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;
   
   #ifdef GPU_CAST
   UCL_D_Vec<double> dev_x_cast;
   UCL_D_Vec<int> dev_type_cast;
   UCL_H_Vec<double> host_x_cast;
   UCL_H_Vec<int> host_type_cast;
   #endif
 
   /// Buffer for moving positions to device
   UCL_H_Vec<numtyp> host_x;
   /// Buffer for moving charge data to GPU
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;
   
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<unsigned> dev_cell_id;
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<int> dev_particle_id;
   /// Atom tag information for device nbor builds
   UCL_D_Vec<int> dev_tag;
 
   /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_other;
   
   /// Geryon device
   UCL_Device *dev;
 
  private:
   #ifdef GPU_CAST
   UCL_Program *atom_program;
   UCL_Kernel k_cast_x;
   void compile_kernels(UCL_Device &dev);
   #endif
 
   bool _compiled;
   
   // True if data has been copied to device already
-  int _x_avail, _q_avail, _quat_avail;
+  bool _x_avail, _q_avail, _quat_avail;
 
-  bool alloc(const int inum, const int nall);
+  bool alloc(const int nall);
   
-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
   bool _gpu_nbor, _bonds;
-  int *_ilist;
   double _time_cast;
   
   double _gpu_bytes;
   
-  bool _newton;
-
   #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
   CUDPPHandle sort_plan;
   #endif
 };
 
 #endif
 
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 30206d72e..718e9d9dd 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -1,289 +1,290 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #include "pair_gpu_device.h"
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
                                   _last_device(0) {
 }
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::~PairGPUDevice() {
   clear_device();
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, 
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
                                  const int nthreads) {
   _nthreads=nthreads;
   #ifdef _OPENMP
   omp_set_num_threads(nthreads);
   #endif
 
   if (_device_init)
     return true;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
   _first_device=first_gpu;
   _last_device=last_gpu;
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
   MPI_Comm_size(_comm_world,&_world_size);
   // Get the rank/size within the replica
   MPI_Comm_rank(_comm_replica,&_replica_me);
   MPI_Comm_size(_comm_replica,&_replica_size);
 
   // Get the names of all nodes
   int name_length;
   char node_name[MPI_MAX_PROCESSOR_NAME];
   char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
   MPI_Get_processor_name(node_name,&name_length);
   MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                 MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
   std::string node_string=std::string(node_name);
   
   // Get the number of procs per node                
   std::map<std::string,int> name_map;
   std::map<std::string,int>::iterator np;
   for (int i=0; i<_world_size; i++) {
     std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
     np=name_map.find(i_string);
     if (np==name_map.end())
       name_map[i_string]=1;
     else
       np->second++;
   }
   int procs_per_node=name_map.begin()->second;
 
   // Assign a unique id to each node
   int split_num=0, split_id=0;
   for (np=name_map.begin(); np!=name_map.end(); ++np) {
     if (np->first==node_string)
       split_id=split_num;
     split_num++;
   }
   
   // Set up a per node communicator and find rank within
   MPI_Comm node_comm;
   MPI_Comm_split(_comm_world, split_id, 0, &node_comm);  
   int node_rank;
   MPI_Comm_rank(node_comm,&node_rank);                  
 
   // Set the device ID: procs on a node are divided evenly across its devices (rounding up)
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
   int my_gpu=node_rank/_procs_per_gpu;
   
   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
   MPI_Comm_rank(_comm_gpu,&_gpu_rank);
 
   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
     return false;
   
   gpu->set(my_gpu);
   return true;
 }
 
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, 
                           const int host_nlocal, const int nall,
                           PairGPUNbor *nbor, const int maxspecial, 
                           const bool gpu_nbor, const int gpu_host, 
                           const int max_nbors, const double cell_size,
                           const bool pre_cut) {
   if (!_device_init)
     return false;                          
 
   // Initial number of local particles
   int ef_nlocal=nlocal;
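   // with a fixed particle split, only that fraction of the local atoms is handled on the device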
   if (_particle_split<1.0 && _particle_split>0.0)
     ef_nlocal=static_cast<int>(_particle_split*nlocal);
 
   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
                    gpu_nbor && maxspecial>0))
       return false;
   } else
     atom.add_fields(charge,rot);
 
   if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut))
     return false;
   nbor->cell_size(cell_size);
 
   _init_count++;
   return true;
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::init_message(FILE *screen, const char *name,
                                   const int first_gpu, const int last_gpu) {
   #ifdef USE_OPENCL
   std::string fs="";
   #else
   std::string fs=toa(gpu->free_gigabytes())+"/";
   #endif
   
   if (_replica_me == 0 && screen) {
     fprintf(screen,"\n-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
     fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
     fprintf(screen,"-  with %d procs per device.\n",_procs_per_gpu);
     #ifdef _OPENMP
     fprintf(screen,"-  with %d threads per proc.\n",_nthreads);
     #endif
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
 
     for (int i=first_gpu; i<=last_gpu; i++) {
       std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                         toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                         " GHZ (";
       if (sizeof(PRECISION)==4) {
         if (sizeof(ACC_PRECISION)==4)
           sname+="Single Precision)";
         else
           sname+="Mixed Precision)";
       } else
         sname+="Double Precision)";
 
       fprintf(screen,"GPU %d: %s\n",i,sname.c_str());         
     }
 
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n\n");
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
                                   const double avg_split, 
                                   const double max_bytes, FILE *screen) {
   double single[5], times[5];
 
   single[0]=atom.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
   single[4]=atom.cast_time();
 
   MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes;
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
 
   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
         if (nbor.gpu_nbor())
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
       }
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
 
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
     _init_count--;
     if (_init_count==0) {
       atom.clear();
       _nbor_shared.clear();
     }
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear_device() {
   while (_init_count>0)
     clear();
   if (_device_init) {
     delete gpu;
     _device_init=false;
   }
 }
 
 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
   return atom.host_memory_usage()+4*sizeof(numtyp)+
          sizeof(PairGPUDevice<numtyp,acctyp>);
 }
 
 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
 bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                      const int last_gpu, const int gpu_mode, 
                      const double particle_split, const int nthreads) {
   return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                                      particle_split,nthreads);
 }
 
 void lmp_clear_device() {
   pair_gpu_device.clear_device();
 }
 
 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
   if (pair_gpu_device.init_count()) {
     pair_gpu_device.stop_host_timer();
     pair_gpu_device.gpu->sync();
     double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
     pair_gpu_device.atom.get_answers(f,tor);
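+    // positions/charges/quats will change before the next force call; mark device copies stale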
+    pair_gpu_device.atom.data_unavail();
 
     return evdw;
   }
   return 0.0;
 }
 
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 8f24e0231..1ef85c78a 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -1,156 +1,162 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
 #ifndef PAIR_GPU_DEVICE_H
 #define PAIR_GPU_DEVICE_H
 
 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>
 
 template <class numtyp, class acctyp>
 class PairGPUDevice {
  public:
   PairGPUDevice();
   ~PairGPUDevice(); 
  
   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
     * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
   bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, 
                    const int last_gpu, const int gpu_mode, 
                    const double particle_split, const int nthreads);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param charge True if charges need to be stored
     * \param rot True if quaternions need to be stored
     * \param nlocal Total number of local particles to allocate memory for
     * \param host_nlocal Initial number of host particles to allocate memory for
     * \param nall Total number of local+ghost particles
     * \param gpu_nbor True if neighboring is performed on device
     * \param gpu_host 0 if host will not perform force calculations,
     *                 1 if gpu_nbor is true, and host needs a half nbor list,
     *                 2 if gpu_nbor is true, and host needs a full nbor list
     * \param max_nbors Initial number of rows in the neighbor matrix
     * \param cell_size cutoff+skin 
     * \param pre_cut True if cutoff test will be performed in separate kernel
     *                than the force kernel **/
   bool init(const bool charge, const bool rot, const int nlocal,
             const int host_nlocal, const int nall, PairGPUNbor *nbor, 
             const int maxspecial, const bool gpu_nbor, const int gpu_host, 
             const int max_nbors, const double cell_size, const bool pre_cut);
 
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
                     const int first_gpu, const int last_gpu);
 
   /// Output a message with timing information
   void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor, 
                     const double avg_split, 
                     const double max_bytes, FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
   
   /// Clear all memory on host and device
   void clear_device();
 
+  /// Add an answer object for copying forces, energies, etc. from the GPU into LAMMPS
+  inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); }
+
   /// Start timer on host
   inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
   
   /// Stop timer on host
   inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
   
   /// Return host time
   inline double host_time() { return _cpu_full; }
 
   /// Return host memory usage in bytes
   double host_memory_usage() const;
 
   /// Return the number of procs sharing a device (size of device communicator)
   inline int procs_per_gpu() const { return _procs_per_gpu; }
   /// Return the number of threads per proc
   inline int num_threads() const { return _nthreads; }
   /// My rank within all processes
   inline int world_me() const { return _world_me; }
   /// Total number of processes
   inline int world_size() const { return _world_size; }
   /// MPI Barrier for world
   inline void world_barrier() { MPI_Barrier(_comm_world); }
   /// Return the replica MPI communicator
   inline MPI_Comm & replica() { return _comm_replica; }
   /// My rank within replica communicator
   inline int replica_me() const { return _replica_me; }
   /// Number of procs in replica communicator
   inline int replica_size() const { return _replica_size; }
   /// Return the per-GPU MPI communicator
   inline MPI_Comm & gpu_comm() { return _comm_gpu; }
   /// Return my rank in the device communicator
   inline int gpu_rank() const { return _gpu_rank; }
   /// MPI Barrier for gpu
   inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
   /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
   inline int gpu_mode() const { return _gpu_mode; }
   /// Index of first device used by a node
   inline int first_device() const { return _first_device; }
   /// Index of last device used by a node
   inline int last_device() const { return _last_device; }
   /// Particle split defined in fix
   inline double particle_split() const { return _particle_split; }
   /// Return the initialization count for the device
   inline int init_count() const { return _init_count; }
 
   // -------------------------- DEVICE DATA ------------------------- 
 
   /// Geryon Device
   UCL_Device *gpu;
 
   enum{GPU_FORCE, GPU_NEIGH};
 
   // --------------------------- ATOM DATA -------------------------- 
 
   /// Atom Data
   PairGPUAtom<numtyp,acctyp> atom;
 
   // --------------------------- NBOR DATA ----------------------------
   
   /// Neighbor Data
   PairGPUNborShared _nbor_shared;
 
  private:
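+  // answer objects registered via add_ans_object() for copying results back to LAMMPS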
+  std::queue<PairGPUAns *> ans_queue;
   int _init_count;
   bool _device_init;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, 
       _replica_size;
   int _gpu_mode, _first_device, _last_device, _nthreads;
   double _particle_split;
   double _cpu_full;
 
   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
     o.precision(2);
     o << in;
     return o.str();
   }
 
 };
 
 #endif