diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index f626b9f68..a1b0ec052 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -1,296 +1,296 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL OCL_LIB = $(LIB_DIR)/libgpu.a # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) # Headers for Pair Stuff PAIR_H = lal_atom.h lal_answer.h lal_neighbor_shared.h \ lal_neighbor.h lal_precision.h lal_device.h \ lal_balance.h lal_pppm.h # Headers for Preprocessor/Auxiliary Functions PRE1_H = lal_preprocessor.h lal_aux_fun1.h ALL_H = $(OCL_H) $(PAIR_H) EXECS = $(BIN_DIR)/ocl_get_devices OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ $(OBJ_DIR)/lal_lj.o $(OBJ_DIR)/lal_lj_ext.o \ $(OBJ_DIR)/lal_lj96.o $(OBJ_DIR)/lal_lj96_ext.o \ $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \ $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \ $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \ $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \ $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \ $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \ $(OBJ_DIR)/lal_charmm_long.o $(OBJ_DIR)/lal_charmm_long_ext.o \ $(OBJ_DIR)/lal_cg_cmm.o $(OBJ_DIR)/lal_cg_cmm_ext.o \ $(OBJ_DIR)/lal_cg_cmm_long.o $(OBJ_DIR)/lal_cg_cmm_long_ext.o \ $(OBJ_DIR)/lal_eam.o $(OBJ_DIR)/lal_eam_ext.o \ - $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \ + $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \ $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \ $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \ 
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \ $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \ $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \ $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \ $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \ $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \ $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \ $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \ $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \ $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \ $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \ $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices all: $(OCL_LIB) $(EXECS) $(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h $(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(OCL_H) $(OBJ_DIR)/atom_cl.h $(OCL) -o $@ -c lal_atom.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_answer.o: lal_answer.cpp lal_answer.h $(OCL_H) $(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) $(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h $(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OCL_H) $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h $(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h $(OCL_H) lal_neighbor_shared.h $(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) $(OBJ_DIR)/device_cl.h: lal_device.cu 
lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cl.h $(OCL) -o $@ -c lal_device.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_base_atomic.o: $(OCL_H) lal_base_atomic.h lal_base_atomic.cpp $(OCL) -o $@ -c lal_base_atomic.cpp $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp $(OCL) -o $@ -c lal_base_charge.cpp $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR) $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; $(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_cl.h $(OBJ_DIR)/pppm_cl.h $(OCL) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp $(OCL) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h; $(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h; $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o $(OCL) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp $(OCL) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR) 
$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h; $(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h; $(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o $(OCL) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp $(OCL) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj_cl.h: lal_lj.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj $(PRE1_H) lal_lj.cu $(OBJ_DIR)/lj_cl.h; $(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_lj.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj_coul_cl.h: lal_lj_coul.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj_coul $(PRE1_H) lal_lj_coul.cu $(OBJ_DIR)/lj_coul_cl.h; $(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cl.h $(OBJ_DIR)/lj_coul_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj_coul_long_cl.h: lal_lj_coul_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(PRE1_H) lal_lj_coul_long.cu $(OBJ_DIR)/lj_coul_long_cl.h; $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c 
lal_lj_coul_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h; $(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/coul_long_cl.h: lal_coul_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh coul_long $(PRE1_H) lal_coul_long.cu $(OBJ_DIR)/coul_long_cl.h; $(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/morse_cl.h: lal_morse.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh morse $(PRE1_H) lal_morse.cu $(OBJ_DIR)/morse_cl.h; $(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cl.h $(OBJ_DIR)/morse_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_morse.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/charmm_long_cl.h: lal_charmm_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh charmm_long $(PRE1_H) lal_charmm_long.cu $(OBJ_DIR)/charmm_long_cl.h; $(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c 
lal_charmm_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj96_cl.h: lal_lj96.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj96 $(PRE1_H) lal_lj96.cu $(OBJ_DIR)/lj96_cl.h; $(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cl.h $(OBJ_DIR)/lj96_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj_expand_cl.h: lal_lj_expand.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj_expand $(PRE1_H) lal_lj_expand.cu $(OBJ_DIR)/lj_expand_cl.h; $(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/cg_cmm_cl.h: lal_cg_cmm.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh cg_cmm $(PRE1_H) lal_cg_cmm.cu $(OBJ_DIR)/cg_cmm_cl.h; $(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cl.h $(OBJ_DIR)/cg_cmm_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/cg_cmm_long_cl.h: lal_cg_cmm_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(PRE1_H) lal_cg_cmm_long.cu $(OBJ_DIR)/cg_cmm_long_cl.h; $(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h 
lal_cg_cmm_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/eam_cl.h: lal_eam.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh eam $(PRE1_H) lal_eam.cu $(OBJ_DIR)/eam_cl.h; $(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c lal_eam.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/buck_cl.h: lal_buck.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh buck $(PRE1_H) lal_buck.cu $(OBJ_DIR)/buck_cl.h; $(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cl.h $(OBJ_DIR)/buck_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_buck.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/buck_coul_coul_cl.h: lal_buck_coul.cu $(PRE1_H) +$(OBJ_DIR)/buck_coul_cl.h: lal_buck_coul.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh buck_coul $(PRE1_H) lal_buck_coul.cu $(OBJ_DIR)/buck_coul_cl.h; $(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/buck_coul_coul_long_cl.h: lal_buck_coul_long.cu $(PRE1_H) +$(OBJ_DIR)/buck_coul_long_cl.h: lal_buck_coul_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(PRE1_H) lal_buck_coul_long.cu $(OBJ_DIR)/buck_coul_long_cl.h; $(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cl.h $(OBJ_DIR)/buck_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o $(OCL) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_buck_coul_long_ext.o: 
$(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/table_cl.h: lal_table.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh table $(PRE1_H) lal_table.cu $(OBJ_DIR)/table_cl.h; $(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_table.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR) $(OBJ_DIR)/yukawa_cl.h: lal_yukawa.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh yukawa $(PRE1_H) lal_yukawa.cu $(OBJ_DIR)/yukawa_cl.h; $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cl.h $(OBJ_DIR)/yukawa_cl.h $(OBJ_DIR)/lal_base_atomic.o $(OCL) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lal_yukawae_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h +$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) $(OCL_LIB): $(OBJS) $(PTXS) $(AR) -crusv $(OCL_LIB) $(OBJS) opencl: $(OCL_EXECS) clean: rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo veryclean: clean rm -rf *~ *.linkinfo diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 07955373b..636bf19af 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -1,550 +1,558 @@ /*************************************************************************** lal_eam.cpp ------------------- W. Michael Brown, Trung Dac Nguyen (ORNL) Class for acceleration of the eam pair style. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "eam_cl.h" #else #include "eam_ptx.h" #endif #include "lal_eam.h" #include <cassert> using namespace LAMMPS_AL; #define EAMT EAM<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> device; template <class numtyp, class acctyp> -EAMT::EAM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { +EAMT::EAM() : BaseAtomic<numtyp,acctyp>(), + _compiled_energy(false), _allocated(false) { } template <class numtyp, class acctyp> EAMT::~EAM() { clear(); } template <class numtyp, class acctyp> int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,eam); if (success!=0) return success; // allocate fp bool cpuview=false; if (this->ucl_device->device_type()==UCL_CPU) cpuview=true; _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10); host_fp.alloc(_max_fp_size,*(this->ucl_device)); if (cpuview) dev_fp.view(host_fp); else dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY); k_energy.set_function(*(this->pair_program),"kernel_energy"); k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(dev_fp,1); - + _compiled_energy = true; + // Initialize timers for 
selected GPU time_pair2.init(*(this->ucl_device)); time_pair2.zero(); time_fp1.init(*(this->ucl_device)); time_fp1.zero(); time_fp2.init(*(this->ucl_device)); time_fp2.zero(); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; _rdrho = rdrho; _nrhor=nrhor; _nrho=nrho; _nz2r=nz2r; _nfrho=nfrho; _nr=nr; UCL_H_Vec<numtyp> dview_type(lj_types*lj_types*2,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types*2; i++) dview_type[i]=(numtyp)0.0; // pack type2rhor and type2z2r type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,type2rhor_z2r,dview_type, host_type2rhor, host_type2z2r); // pack type2frho UCL_H_Vec<numtyp> dview_type2frho(ntypes,*(this->ucl_device), UCL_WRITE_OPTIMIZED); type2frho.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<ntypes; i++) dview_type2frho[i]=(numtyp)host_type2frho[i]; ucl_copy(type2frho,dview_type2frho,false); // pack frho_spline UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nr+1),*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nfrho; ix++) for (int iy=0; iy<nr+1; iy++) { dview_frho_spline[ix*(nr+1)+iy].x=host_frho_spline[ix][iy][0]; dview_frho_spline[ix*(nr+1)+iy].y=host_frho_spline[ix][iy][1]; dview_frho_spline[ix*(nr+1)+iy].z=host_frho_spline[ix][iy][2]; dview_frho_spline[ix*(nr+1)+iy].w=0; } frho_spline1.alloc(nfrho*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(frho_spline1,dview_frho_spline,false); frho_spline1_tex.get_texture(*(this->pair_program),"frho_sp1_tex"); frho_spline1_tex.bind_float(frho_spline1,4); for (int ix=0; ix<nfrho; ix++) for (int iy=0; iy<nr+1; iy++) { dview_frho_spline[ix*(nr+1)+iy].x=host_frho_spline[ix][iy][3]; 
dview_frho_spline[ix*(nr+1)+iy].y=host_frho_spline[ix][iy][4]; dview_frho_spline[ix*(nr+1)+iy].z=host_frho_spline[ix][iy][5]; dview_frho_spline[ix*(nr+1)+iy].w=host_frho_spline[ix][iy][6]; } frho_spline2.alloc(nfrho*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(frho_spline2,dview_frho_spline,false); frho_spline2_tex.get_texture(*(this->pair_program),"frho_sp2_tex"); frho_spline2_tex.bind_float(frho_spline2,4); // pack rhor_spline UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nrhor; ix++) for (int iy=0; iy<nr+1; iy++) { dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][0]; dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][1]; dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][2]; dview_rhor_spline[ix*(nr+1)+iy].w=(numtyp)0; } rhor_spline1.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(rhor_spline1,dview_rhor_spline,false); rhor_spline1_tex.get_texture(*(this->pair_program),"rhor_sp1_tex"); rhor_spline1_tex.bind_float(rhor_spline1,4); for (int ix=0; ix<nrhor; ix++) for (int iy=0; iy<nr+1; iy++) { dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][3]; dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][4]; dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][5]; dview_rhor_spline[ix*(nr+1)+iy].w=host_rhor_spline[ix][iy][6]; } rhor_spline2.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(rhor_spline2,dview_rhor_spline,false); rhor_spline2_tex.get_texture(*(this->pair_program),"rhor_sp2_tex"); rhor_spline2_tex.bind_float(rhor_spline2,4); // pack z2r_spline UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nz2r; ix++) for (int iy=0; iy<nr+1; iy++) { dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][0]; dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][1]; dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][2]; 
dview_z2r_spline[ix*(nr+1)+iy].w=(numtyp)0; } z2r_spline1.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline1,dview_z2r_spline,false); z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex"); z2r_spline1_tex.bind_float(z2r_spline1,4); for (int ix=0; ix<nz2r; ix++) for (int iy=0; iy<nr+1; iy++) { dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][3]; dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][4]; dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][5]; dview_z2r_spline[ix*(nr+1)+iy].w=host_z2r_spline[ix][iy][6]; } z2r_spline2.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline2,dview_z2r_spline,false); z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex"); z2r_spline2_tex.bind_float(z2r_spline2,4); _allocated=true; this->_max_bytes=type2rhor_z2r.row_bytes() + type2frho.row_bytes() + rhor_spline1.row_bytes() + rhor_spline2.row_bytes() + frho_spline1.row_bytes() + frho_spline2.row_bytes() + z2r_spline1.row_bytes() + z2r_spline2.row_bytes() + dev_fp.row_bytes(); return 0; } template <class numtyp, class acctyp> void EAMT::clear() { if (!_allocated) return; _allocated=false; type2rhor_z2r.clear(); type2frho.clear(); rhor_spline1.clear(); rhor_spline2.clear(); frho_spline1.clear(); frho_spline2.clear(); z2r_spline1.clear(); z2r_spline2.clear(); host_fp.clear(); dev_fp.clear(); time_pair2.clear(); time_fp1.clear(); time_fp2.clear(); + if (_compiled_energy) { + k_energy_fast.clear(); + k_energy.clear(); + _compiled_energy=false; + } + this->clear_atomic(); } template <class numtyp, class acctyp> double EAMT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(EAM<numtyp,acctyp>); } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then compute atom energies/forces // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void 
EAMT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style this->resize_atom(0,nall,success); this->zero_timers(); return; } int ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); host_start=inum; // ------------------- Resize FP Array for EAM -------------------- if (nall>_max_fp_size) { dev_fp.clear(); host_fp.clear(); _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10); host_fp.alloc(_max_fp_size,*(this->ucl_device)); if (this->ucl_device->device_type()==UCL_CPU) dev_fp.view(host_fp); else dev_fp.alloc(_max_fp_size,*(this->ucl_device)); fp_tex.bind_float(dev_fp,1); } *fp_ptr=host_fp.begin(); // ----------------------------------------------------------------- if (ago==0) { this->reset_nbors(nall, inum, ilist, numj, firstneigh, success); if (!success) return; } this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); loop(eflag,vflag); // copy fp from device to host for comm time_fp1.start(); ucl_copy(host_fp,dev_fp,false); time_fp1.stop(); } // --------------------------------------------------------------------------- // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- template 
<class numtyp, class acctyp> int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style this->resize_atom(0,nall,success); this->zero_timers(); return NULL; } // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; // ------------------- Resize FP Array for EAM -------------------- if (nall>_max_fp_size) { dev_fp.clear(); host_fp.clear(); _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10); host_fp.alloc(_max_fp_size,*(this->ucl_device)); if (this->ucl_device->device_type()==UCL_CPU) dev_fp.view(host_fp); else dev_fp.alloc(_max_fp_size,*(this->ucl_device)); fp_tex.bind_float(dev_fp,1); } *fp_ptr=host_fp.begin(); // ----------------------------------------------------------------- // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; } else { this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); } *ilist=this->nbor->host_ilist.begin(); *jnum=this->nbor->host_acc.begin(); 
loop(eflag,vflag); // copy fp from device to host for comm time_fp1.start(); ucl_copy(host_fp,dev_fp,false); time_fp1.stop(); return this->nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { this->hd_balancer.start_timer(); time_fp2.start(); this->add_fp_data(); time_fp2.stop(); loop2(eflag,vflag); if (ilist == NULL) this->ans->copy_answers(eflag,vflag,eatom,vatom); else this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_energy_fast.set_size(GX,BX); this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(), &type2frho.begin(), &rhor_spline2.begin(), &frho_spline1.begin(),&frho_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &dev_fp.begin(), &this->ans->dev_engv.begin(), &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, &this->_threads_per_atom); } else { 
this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(), &type2frho.begin(), &rhor_spline2.begin(), &frho_spline1.begin(),&frho_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &dev_fp.begin(), &this->ans->dev_engv.begin(), &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, &this->_threads_per_atom); } this->time_pair.stop(); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::loop2(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair2.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(), &type2rhor_z2r.begin(), &rhor_spline1.begin(), &z2r_spline1.begin(), &z2r_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(), &type2rhor_z2r.begin(), &rhor_spline1.begin(), &z2r_spline1.begin(), &z2r_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } this->time_pair2.stop(); } template class 
EAM<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index b37bd18d6..60e550ffc 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -1,135 +1,138 @@ /*************************************************************************** lal_eam.h ------------------- W. Michael Brown, Trung Dac Nguyen (ORNL) Class for acceleration of the eam pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_EAM_H #define LAL_EAM_H #include "lal_base_atomic.h" namespace LAMMPS_AL { template <class numtyp, class acctyp> class EAM : public BaseAtomic<numtyp, acctyp> { public: EAM(); ~EAM(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); // Copy charges to device asynchronously inline void add_fp_data() { ucl_copy(dev_fp,host_fp,this->atom->nall(),true); } /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// 
Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; /// Pair loop with host neighboring void compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, int &inum, void **fp_ptr); /// Pair loop with host neighboring void compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy, k_energy_fast; // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; UCL_Texture rhor_spline1_tex, rhor_spline2_tex; UCL_Texture frho_spline1_tex, frho_spline2_tex; UCL_Texture z2r_spline1_tex, z2r_spline2_tex; // --------------------------- DEVICE DATA -------------------------- /// Device Timers UCL_Timer time_pair2, time_fp1, time_fp2; // --------------------------- TYPE DATA -------------------------- UCL_D_Vec<numtyp2> type2rhor_z2r; UCL_D_Vec<numtyp> type2frho; UCL_D_Vec<numtyp4> z2r_spline1, z2r_spline2; UCL_D_Vec<numtyp4> frho_spline1, frho_spline2; UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2; numtyp _cutforcesq,_rdr,_rdrho; int _nfrho,_nrhor,_nrho,_nz2r,_nr; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _ntypes; int _max_fp_size; + /// True if energy kernels are 
compiled + bool _compiled_energy; + /// Per-atom arrays UCL_H_Vec<numtyp> host_fp; UCL_D_Vec<numtyp> dev_fp; protected: bool _allocated; void loop(const bool _eflag, const bool _vflag); void loop2(const bool _eflag, const bool _vflag); }; } #endif