diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index f626b9f68..a1b0ec052 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -1,296 +1,296 @@
 OCL  = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL
 OCL_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
 PAIR_H  = lal_atom.h lal_answer.h lal_neighbor_shared.h \
           lal_neighbor.h lal_precision.h lal_device.h \
           lal_balance.h lal_pppm.h
 # Headers for Preprocessor/Auxiliary Functions
 PRE1_H = lal_preprocessor.h lal_aux_fun1.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
 OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
        $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
        $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
        $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
        $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
        $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
        $(OBJ_DIR)/lal_lj.o $(OBJ_DIR)/lal_lj_ext.o \
        $(OBJ_DIR)/lal_lj96.o $(OBJ_DIR)/lal_lj96_ext.o \
        $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
        $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
        $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
        $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
        $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
        $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
        $(OBJ_DIR)/lal_charmm_long.o $(OBJ_DIR)/lal_charmm_long_ext.o \
        $(OBJ_DIR)/lal_cg_cmm.o $(OBJ_DIR)/lal_cg_cmm_ext.o \
        $(OBJ_DIR)/lal_cg_cmm_long.o $(OBJ_DIR)/lal_cg_cmm_long_ext.o \
        $(OBJ_DIR)/lal_eam.o $(OBJ_DIR)/lal_eam_ext.o \
-       $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \ 
+       $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \
        $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
        $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
        $(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
        $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
        $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \
        $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \
        $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \
        $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \
        $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \
        $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \
        $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \
        $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
        $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
        $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
        $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
 all: $(OCL_LIB) $(EXECS)
 
 $(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h
 
 $(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(OCL_H) $(OBJ_DIR)/atom_cl.h
 	$(OCL) -o $@ -c lal_atom.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_answer.o: lal_answer.cpp lal_answer.h $(OCL_H)
 	$(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h
 
 $(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h
 
 $(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OCL_H) $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h
 	$(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h $(OCL_H) lal_neighbor_shared.h
 	$(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h
 
 $(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cl.h
 	$(OCL) -o $@ -c lal_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_base_atomic.o: $(OCL_H) lal_base_atomic.h lal_base_atomic.cpp
 	$(OCL) -o $@ -c lal_base_atomic.cpp
 
 $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp
 	$(OCL) -o $@ -c lal_base_charge.cpp
 
 $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
 	$(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
 
 $(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp  $(OBJ_DIR)/pppm_cl.h $(OBJ_DIR)/pppm_cl.h
 	$(OCL) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
 	$(OCL) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h
 
 $(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
 
 $(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
 
 $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(OCL) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
 	$(OCL) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
 
 $(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
 
 $(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(OCL) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp
 	$(OCL) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_cl.h: lal_lj.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj $(PRE1_H) lal_lj.cu $(OBJ_DIR)/lj_cl.h;
 
 $(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp  $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_lj.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_coul_cl.h: lal_lj_coul.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj_coul $(PRE1_H) lal_lj_coul.cu $(OBJ_DIR)/lj_coul_cl.h;
 
 $(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp  $(OBJ_DIR)/lj_coul_cl.h $(OBJ_DIR)/lj_coul_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_coul_long_cl.h: lal_lj_coul_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(PRE1_H) lal_lj_coul_long.cu $(OBJ_DIR)/lj_coul_long_cl.h;
 
 $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp  $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h;
 
 $(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp  $(OBJ_DIR)/lj_class2_long_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/coul_long_cl.h: lal_coul_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh coul_long $(PRE1_H) lal_coul_long.cu $(OBJ_DIR)/coul_long_cl.h;
 
 $(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp  $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/morse_cl.h: lal_morse.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh morse $(PRE1_H) lal_morse.cu $(OBJ_DIR)/morse_cl.h;
 
 $(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp  $(OBJ_DIR)/morse_cl.h $(OBJ_DIR)/morse_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_morse.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/charmm_long_cl.h: lal_charmm_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh charmm_long $(PRE1_H) lal_charmm_long.cu $(OBJ_DIR)/charmm_long_cl.h;
 
 $(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp  $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cl.h: lal_lj96.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj96 $(PRE1_H) lal_lj96.cu $(OBJ_DIR)/lj96_cl.h;
 
 $(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp  $(OBJ_DIR)/lj96_cl.h $(OBJ_DIR)/lj96_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj_expand_cl.h: lal_lj_expand.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj_expand $(PRE1_H) lal_lj_expand.cu $(OBJ_DIR)/lj_expand_cl.h;
 
 $(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp  $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cg_cmm_cl.h: lal_cg_cmm.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh cg_cmm $(PRE1_H) lal_cg_cmm.cu $(OBJ_DIR)/cg_cmm_cl.h;
 
 $(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp  $(OBJ_DIR)/cg_cmm_cl.h $(OBJ_DIR)/cg_cmm_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cg_cmm_long_cl.h: lal_cg_cmm_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(PRE1_H) lal_cg_cmm_long.cu $(OBJ_DIR)/cg_cmm_long_cl.h;
 
 $(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp  $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/eam_cl.h: lal_eam.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh eam $(PRE1_H) lal_eam.cu $(OBJ_DIR)/eam_cl.h;
 
 $(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp  $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_eam.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/buck_cl.h: lal_buck.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh buck $(PRE1_H) lal_buck.cu $(OBJ_DIR)/buck_cl.h;
 
 $(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp  $(OBJ_DIR)/buck_cl.h $(OBJ_DIR)/buck_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_buck.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/buck_coul_coul_cl.h: lal_buck_coul.cu $(PRE1_H)
+$(OBJ_DIR)/buck_coul_cl.h: lal_buck_coul.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh buck_coul $(PRE1_H) lal_buck_coul.cu $(OBJ_DIR)/buck_coul_cl.h;
 
 $(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp  $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/buck_coul_coul_long_cl.h: lal_buck_coul_long.cu $(PRE1_H)
+$(OBJ_DIR)/buck_coul_long_cl.h: lal_buck_coul_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(PRE1_H) lal_buck_coul_long.cu $(OBJ_DIR)/buck_coul_long_cl.h;
 
 $(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp  $(OBJ_DIR)/buck_coul_long_cl.h $(OBJ_DIR)/buck_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
 	$(OCL) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/table_cl.h: lal_table.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh table $(PRE1_H) lal_table.cu $(OBJ_DIR)/table_cl.h;
 
 $(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp  $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_table.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/yukawa_cl.h: lal_yukawa.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh yukawa $(PRE1_H) lal_yukawa.cu $(OBJ_DIR)/yukawa_cl.h;
 
 $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp  $(OBJ_DIR)/yukawa_cl.h $(OBJ_DIR)/yukawa_cl.h $(OBJ_DIR)/lal_base_atomic.o
 	$(OCL) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lal_yukawae_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
+$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
 $(OCL_LIB): $(OBJS) $(PTXS)
 	$(AR) -crusv $(OCL_LIB) $(OBJS)
 
 opencl: $(OCL_EXECS)
 
 clean:
 	rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
 
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index 07955373b..636bf19af 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -1,550 +1,558 @@
 /***************************************************************************
                                 lal_eam.cpp
                              -------------------
                       W. Michael Brown, Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the eam pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
  
 #ifdef USE_OPENCL
 #include "eam_cl.h"
 #else
 #include "eam_ptx.h"
 #endif
 
 #include "lal_eam.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define EAMT EAM<numtyp, acctyp>
 
 extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
-EAMT::EAM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
+EAMT::EAM() : BaseAtomic<numtyp,acctyp>(),
+  _compiled_energy(false), _allocated(false) { // both are set to true by init()
 }
 
 template <class numtyp, class acctyp>
 EAMT::~EAM() {
   clear();
 }
  
 template <class numtyp, class acctyp>
 int EAMT::init(const int ntypes, double host_cutforcesq,
               int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
               double ***host_rhor_spline, double ***host_z2r_spline,
               double ***host_frho_spline,
               double rdr, double rdrho, int nrhor, int nrho, 
               int nz2r, int nfrho, int nr,
               const int nlocal, const int nall, const int max_nbors,
               const int maxspecial, const double cell_size,
               const double gpu_split, FILE *_screen) 
 {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                             _screen,eam);
   
   if (success!=0)
     return success;
   
   // allocate fp
   
   bool cpuview=false;
   if (this->ucl_device->device_type()==UCL_CPU)
     cpuview=true;
   
   _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
   host_fp.alloc(_max_fp_size,*(this->ucl_device));
   if (cpuview)
     dev_fp.view(host_fp);
   else 
     dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
                                      
   k_energy.set_function(*(this->pair_program),"kernel_energy");
   k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
   fp_tex.get_texture(*(this->pair_program),"fp_tex");
   fp_tex.bind_float(dev_fp,1);
-
+  _compiled_energy=true; // energy kernels are bound; clear() releases them
+
   // Initialize timers for selected GPU
   time_pair2.init(*(this->ucl_device));
   time_pair2.zero();
   
   time_fp1.init(*(this->ucl_device));
   time_fp1.zero();
   
   time_fp2.init(*(this->ucl_device));
   time_fp2.zero();
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
 
   int max_shared_types=this->device->max_shared_types();
   if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
     lj_types=max_shared_types;
     shared_types=true;
   }
   
   _ntypes=lj_types;
   _cutforcesq=host_cutforcesq;
   _rdr=rdr;
   _rdrho = rdrho;
   _nrhor=nrhor;
   _nrho=nrho;
   _nz2r=nz2r;
   _nfrho=nfrho;
   _nr=nr;
   
   UCL_H_Vec<numtyp> dview_type(lj_types*lj_types*2,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
   
   for (int i=0; i<lj_types*lj_types*2; i++)
     dview_type[i]=(numtyp)0.0; 
                                 
   // pack type2rhor and type2z2r
   type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   
   this->atom->type_pack2(ntypes,lj_types,type2rhor_z2r,dview_type,
                         host_type2rhor,
                         host_type2z2r);
   
   // pack type2frho
   UCL_H_Vec<numtyp> dview_type2frho(ntypes,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
 
   type2frho.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<ntypes; i++)
     dview_type2frho[i]=(numtyp)host_type2frho[i];
   ucl_copy(type2frho,dview_type2frho,false);
                         
   // pack frho_spline
   UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nr+1),*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nfrho; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_frho_spline[ix*(nr+1)+iy].x=host_frho_spline[ix][iy][0];
     dview_frho_spline[ix*(nr+1)+iy].y=host_frho_spline[ix][iy][1];
     dview_frho_spline[ix*(nr+1)+iy].z=host_frho_spline[ix][iy][2];
     dview_frho_spline[ix*(nr+1)+iy].w=0;
   }
 
   frho_spline1.alloc(nfrho*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(frho_spline1,dview_frho_spline,false);
   frho_spline1_tex.get_texture(*(this->pair_program),"frho_sp1_tex");
   frho_spline1_tex.bind_float(frho_spline1,4);
 
   for (int ix=0; ix<nfrho; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_frho_spline[ix*(nr+1)+iy].x=host_frho_spline[ix][iy][3];
     dview_frho_spline[ix*(nr+1)+iy].y=host_frho_spline[ix][iy][4];
     dview_frho_spline[ix*(nr+1)+iy].z=host_frho_spline[ix][iy][5];
     dview_frho_spline[ix*(nr+1)+iy].w=host_frho_spline[ix][iy][6];
   }
 
   frho_spline2.alloc(nfrho*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(frho_spline2,dview_frho_spline,false);
   frho_spline2_tex.get_texture(*(this->pair_program),"frho_sp2_tex");
   frho_spline2_tex.bind_float(frho_spline2,4);
 
   // pack rhor_spline
   UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][0];
     dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][1];
     dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][2];
     dview_rhor_spline[ix*(nr+1)+iy].w=(numtyp)0;
   }
 
   rhor_spline1.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(rhor_spline1,dview_rhor_spline,false);
   rhor_spline1_tex.get_texture(*(this->pair_program),"rhor_sp1_tex");
   rhor_spline1_tex.bind_float(rhor_spline1,4);
 
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][3];
     dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][4];
     dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][5];
     dview_rhor_spline[ix*(nr+1)+iy].w=host_rhor_spline[ix][iy][6];
   }
 
   rhor_spline2.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(rhor_spline2,dview_rhor_spline,false);
   rhor_spline2_tex.get_texture(*(this->pair_program),"rhor_sp2_tex");
   rhor_spline2_tex.bind_float(rhor_spline2,4);
 
   // pack z2r_spline
   UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
                                
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][0];
     dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][1];
     dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][2];
     dview_z2r_spline[ix*(nr+1)+iy].w=(numtyp)0;
   }
   
   z2r_spline1.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(z2r_spline1,dview_z2r_spline,false);
   z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex");
   z2r_spline1_tex.bind_float(z2r_spline1,4);
   
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++) {
     dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][3];
     dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][4];
     dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][5];
     dview_z2r_spline[ix*(nr+1)+iy].w=host_z2r_spline[ix][iy][6];
   }
   
   z2r_spline2.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(z2r_spline2,dview_z2r_spline,false);
   z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex");
   z2r_spline2_tex.bind_float(z2r_spline2,4);
 
   _allocated=true;
   this->_max_bytes=type2rhor_z2r.row_bytes()
         + type2frho.row_bytes()
         + rhor_spline1.row_bytes()
         + rhor_spline2.row_bytes()
         + frho_spline1.row_bytes()
         + frho_spline2.row_bytes()
         + z2r_spline1.row_bytes()
         + z2r_spline2.row_bytes()
         + dev_fp.row_bytes();
   return 0;
 }
 
 template <class numtyp, class acctyp>
 void EAMT::clear() {
   if (!_allocated)
     return;
   _allocated=false;
   
   type2rhor_z2r.clear();
   type2frho.clear();
   rhor_spline1.clear();
   rhor_spline2.clear();
   frho_spline1.clear();
   frho_spline2.clear();
   z2r_spline1.clear();
   z2r_spline2.clear();
   
   host_fp.clear();
   dev_fp.clear();
   
   time_pair2.clear();
   time_fp1.clear();
   time_fp2.clear();
   
+  if (_compiled_energy) { // set by init() once k_energy/k_energy_fast were bound
+    k_energy_fast.clear();
+    k_energy.clear();
+    _compiled_energy=false;
+  }
+
   this->clear_atomic();
 }
 
 template <class numtyp, class acctyp>
 double EAMT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(EAM<numtyp,acctyp>);
 }
 
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then compute atom energies/forces
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute(const int f_ago, const int inum_full,
                    const int nall, double **host_x, int *host_type,
                    int *ilist, int *numj, int **firstneigh,
                    const bool eflag, const bool vflag,
                    const bool eatom, const bool vatom,
                    int &host_start, const double cpu_time,
                    bool &success, void **fp_ptr) {
   this->acc_timers();
   
   if (this->device->time_device()) {
     // Put time from the second part to the total time_pair
     this->time_pair.add_time_to_total(time_pair2.time());
     
     // Add transfer time from device -> host after part 1
     this->atom->add_transfer_time(time_fp1.time());
     
     // Add transfer time from host -> device before part 2
     this->atom->add_transfer_time(time_fp2.time());
   }
   
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     this->resize_atom(0,nall,success);
     this->zero_timers();
     return;
   }
   
   int ago=this->hd_balancer.ago_first(f_ago);
   int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
   this->ans->inum(inum);
   host_start=inum;
 
   // ------------------- Resize FP Array for EAM --------------------
   
   if (nall>_max_fp_size) {
     dev_fp.clear();
     host_fp.clear();
     
     _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
     host_fp.alloc(_max_fp_size,*(this->ucl_device));
     if (this->ucl_device->device_type()==UCL_CPU)
       dev_fp.view(host_fp);
     else 
       dev_fp.alloc(_max_fp_size,*(this->ucl_device));
     
     fp_tex.bind_float(dev_fp,1);
   }
   *fp_ptr=host_fp.begin();
 
   // -----------------------------------------------------------------
 
   if (ago==0) {
     this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
     if (!success)
       return;
   }
   
   this->atom->cast_x_data(host_x,host_type);
   this->atom->add_x_data(host_x,host_type);
 
   loop(eflag,vflag);
   
   // copy fp from device to host for comm
   time_fp1.start();
   ucl_copy(host_fp,dev_fp,false);
   time_fp1.stop();
 }
 
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU and then compute per-atom densities
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int** EAMT::compute(const int ago, const int inum_full,
                     const int nall, double **host_x, int *host_type,
                     double *sublo, double *subhi, int *tag,
                     int **nspecial, int **special, const bool eflag, 
                     const bool vflag, const bool eatom,
                     const bool vatom, int &host_start,
                     int **ilist, int **jnum,
                     const double cpu_time, bool &success,
                     int &inum, void **fp_ptr) {
   this->acc_timers();
   
   if (this->device->time_device()) {
     // Put time from the second part to the total time_pair
     this->time_pair.add_time_to_total(time_pair2.time());
     
     // Add transfer time from device -> host after part 1
     this->atom->add_transfer_time(time_fp1.time());
     
     // Add transfer time from host -> device before part 2
     this->atom->add_transfer_time(time_fp2.time());
   }
   
   if (inum_full==0) {
     host_start=0;
     // Make sure textures are correct if realloc by a different hybrid style
     this->resize_atom(0,nall,success);
     this->zero_timers();
     return NULL;
   }
   
   // load balance, returning the atom count on the device (inum)
   this->hd_balancer.balance(cpu_time);
   inum=this->hd_balancer.get_gpu_count(ago,inum_full);
   this->ans->inum(inum);
   host_start=inum;
  
   // ------------------- Resize FP Array for EAM --------------------
   
   if (nall>_max_fp_size) {
     dev_fp.clear();
     host_fp.clear();
     
     _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
     host_fp.alloc(_max_fp_size,*(this->ucl_device));
     if (this->ucl_device->device_type()==UCL_CPU)
       dev_fp.view(host_fp);
     else 
       dev_fp.alloc(_max_fp_size,*(this->ucl_device));
     
     fp_tex.bind_float(dev_fp,1);
   }      
   *fp_ptr=host_fp.begin();  
 
   // -----------------------------------------------------------------
 
   // Build neighbor list on GPU if necessary 
   if (ago==0) {
     this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
   } else {
     this->atom->cast_x_data(host_x,host_type);
     this->atom->add_x_data(host_x,host_type);
   }
   *ilist=this->nbor->host_ilist.begin();
   *jnum=this->nbor->host_acc.begin();
 
   loop(eflag,vflag);
   
   // copy fp from device to host for comm
   time_fp1.start();
   ucl_copy(host_fp,dev_fp,false);
   time_fp1.stop();
   
   return this->nbor->host_jlist.begin()-host_start;
 }
 
 // ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
+// Transfer fp from host to device, run loop2, then copy the answers back
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::compute2(int *ilist, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom) {
   this->hd_balancer.start_timer();
   time_fp2.start();
   this->add_fp_data();
   time_fp2.stop();
   
   loop2(eflag,vflag);
   if (ilist == NULL)
     this->ans->copy_answers(eflag,vflag,eatom,vatom);
   else
     this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist);
   
   this->device->add_ans_object(this->ans);
   this->hd_balancer.stop_timer();
 }
 
 // ---------------------------------------------------------------------------
 // Calculate per-atom energies and forces
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void EAMT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int eflag, vflag;
   if (_eflag)
     eflag=1;
   else
     eflag=0;
 
   if (_vflag)
     vflag=1;
   else
     vflag=0;
   
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   
   if (shared_types) {
     this->k_energy_fast.set_size(GX,BX);
     this->k_energy_fast.run(&this->atom->dev_x.begin(), 
                  &type2rhor_z2r.begin(), &type2frho.begin(),
                  &rhor_spline2.begin(),
                  &frho_spline1.begin(),&frho_spline2.begin(), 
                  &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(),
                  &dev_fp.begin(), 
                  &this->ans->dev_engv.begin(),
                  &eflag, &ainum,
                  &nbor_pitch, 
                  &_ntypes, &_cutforcesq, 
                  &_rdr, &_rdrho,
                  &_nrho, &_nr,
                  &this->_threads_per_atom);
   }
   else {
     this->k_energy.set_size(GX,BX);
     this->k_energy.run(&this->atom->dev_x.begin(), 
                  &type2rhor_z2r.begin(), &type2frho.begin(),
                  &rhor_spline2.begin(),
                  &frho_spline1.begin(),&frho_spline2.begin(), 
                  &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(),
                  &dev_fp.begin(), 
                  &this->ans->dev_engv.begin(),
                  &eflag, &ainum,
                  &nbor_pitch, 
                  &_ntypes, &_cutforcesq, 
                  &_rdr, &_rdrho,
                  &_nrho, &_nr,
                  &this->_threads_per_atom);
   }
 
   this->time_pair.stop();
 }
 
 // ---------------------------------------------------------------------------
 // Second pass: launch the pair kernel (k_pair or k_pair_fast)
 // ---------------------------------------------------------------------------
 // The kernel is handed dev_fp, the rhor/z2r spline tables and the neighbor
 // data, together with the answer buffers dev_ans and dev_engv. The launch is
 // timed with time_pair2.
 template <class numtyp, class acctyp>
 void EAMT::loop2(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
 
   // Kernel arguments are passed by address, so the flags live in local ints
   int eflag=(_eflag ? 1 : 0);
   int vflag=(_vflag ? 1 : 0);
 
   // Each block of BX threads handles BX/_threads_per_atom atoms
   const int atoms_per_block=BX/this->_threads_per_atom;
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                atoms_per_block));
 
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair2.start();
 
   if (!shared_types) {
     // General kernel: additionally receives the number of atom types
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
                      &type2rhor_z2r.begin(),
                      &rhor_spline1.begin(),
                      &z2r_spline1.begin(),
                      &z2r_spline2.begin(),
                      &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
                      &this->_threads_per_atom);
   } else {
     // Fast kernel: same arguments except that _ntypes is not passed
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
                           &type2rhor_z2r.begin(),
                           &rhor_spline1.begin(),
                           &z2r_spline1.begin(),
                           &z2r_spline2.begin(),
                           &this->nbor->dev_nbor.begin(),
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr,
                           &this->_threads_per_atom);
   }
 
   this->time_pair2.stop();
 }
 
 // Explicit instantiation of EAM for the PRECISION / ACC_PRECISION types
 // (both are defined outside this file)
 template class EAM<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h
index b37bd18d6..60e550ffc 100644
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@@ -1,135 +1,138 @@
 /***************************************************************************
                               lal_eam.h
                              -------------------
                       W. Michael Brown, Trung Dac Nguyen (ORNL)
 
   Class for acceleration of the eam pair style.
 
  __________________________________________________________________________
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
     begin                : 
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
 
 #ifndef LAL_EAM_H
 #define LAL_EAM_H
 
 #include "lal_base_atomic.h"
 
 namespace LAMMPS_AL {
 
 /// Acceleration of the eam pair style, built on top of BaseAtomic.
 /** The computation is split in two passes: compute() runs loop() (energy
   * kernels) and copies fp back to the host, compute2() sends fp to the
   * device again and runs loop2() (pair kernels). **/
 template <class numtyp, class acctyp>
 class EAM : public BaseAtomic<numtyp, acctyp> {
  public:
   EAM();
   ~EAM();
                   
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * 
     * Returns:
     * -  0 if successful
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double host_cutforcesq,
           int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
           double ***host_rhor_spline, double ***host_z2r_spline,
           double ***host_frho_spline,
           double rdr, double rdrho, int nrhor, int nrho, 
           int nz2r, int nfrho, int nr,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *_screen);
   
   /// Copy atom->nall() fp values from host_fp to dev_fp asynchronously
   inline void add_fp_data() {
     ucl_copy(dev_fp,host_fp,this->atom->nall(),true);
   }
   
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
 
   /// Returns memory usage on device per atom
   int bytes_per_atom(const int max_nbors) const;
 
   /// Total host memory used by library for pair style
   double host_memory_usage() const;
   
   /// Pair loop with host neighboring
   void compute(const int f_ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *ilist, int *numj,
                int **firstneigh, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                const double cpu_time, bool &success,
                void **fp_ptr);
                
   /// Pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag, 
                 const bool eatom, const bool vatom, int &host_start, 
                 int **ilist, int **numj, const double cpu_time, bool &success,
                 int &inum, void **fp_ptr);
 
   /// Second pass: copy fp to the device, run loop2() and call copy_answers()
   /** \param ilist forwarded to copy_answers() when it is not NULL **/
   void compute2(int *ilist, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom);
   
   // ------------------------- DEVICE KERNELS -------------------------
   /// Energy kernels launched by loop(); k_energy_fast is used when
   /// shared_types is set
   UCL_Kernel k_energy, k_energy_fast;
   
   // --------------------------- TEXTURES -----------------------------
   UCL_Texture fp_tex;
   UCL_Texture rhor_spline1_tex, rhor_spline2_tex;
   UCL_Texture frho_spline1_tex, frho_spline2_tex;
   UCL_Texture z2r_spline1_tex, z2r_spline2_tex;
 
   // --------------------------- DEVICE DATA --------------------------
   
   /// Device Timers
   /** time_pair2 brackets the kernel launch in loop2(), time_fp1 the copy of
     * dev_fp to host_fp in compute(), time_fp2 the add_fp_data() call in
     * compute2() **/
   UCL_Timer time_pair2, time_fp1, time_fp2;
   
   // --------------------------- TYPE DATA --------------------------
     
   UCL_D_Vec<numtyp2> type2rhor_z2r;
   
   UCL_D_Vec<numtyp> type2frho;
   
   UCL_D_Vec<numtyp4> z2r_spline1, z2r_spline2;
   UCL_D_Vec<numtyp4> frho_spline1, frho_spline2;
   UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2;
     
   numtyp _cutforcesq,_rdr,_rdrho;
   
   int _nfrho,_nrhor,_nrho,_nz2r,_nr;
   
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
   
   /// Number of atom types 
   int _ntypes;
   
   int _max_fp_size;
   
+  /// True if energy kernels are compiled
+  bool _compiled_energy;
+  
   /// Per-atom arrays
   UCL_H_Vec<numtyp> host_fp;
   UCL_D_Vec<numtyp> dev_fp;
   
 protected:
   bool _allocated;
   /// Launch the energy kernel (k_energy or k_energy_fast)
   void loop(const bool _eflag, const bool _vflag);
   /// Launch the pair kernel (k_pair or k_pair_fast)
   void loop2(const bool _eflag, const bool _vflag);
 };
 
 }
 
 #endif