diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index f4334f9df..d6f8a6a81 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -1,311 +1,311 @@
 # /* ----------------------------------------------------------------------
 #    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
 #    http://lammps.sandia.gov, Sandia National Laboratories
 #    Steve Plimpton, sjplimp@sandia.gov
 #
 #    Copyright (2003) Sandia Corporation.  Under the terms of Contract
 #    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
 #    certain rights in this software.  This software is distributed under
 #    the GNU General Public License.
 #
 #    See the README file in the top-level LAMMPS directory.
 # ------------------------------------------------------------------------- */
 #
 # /* ----------------------------------------------------------------------
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                          Peng Wang (Nvidia), penwang@nvidia.com
 #                          Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

 CUDA  = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
         $(CUDA_PRECISION)
 CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
         -Icudpp_mini
 CUDA_LINK = $(CUDA_LIB) -lcudart

 GPU_LIB = $(LIB_DIR)/libgpu.a

 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
 NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H)
 # Headers for Pair Stuff
 PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
           pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
-          pair_gpu_balance.h
+          pair_gpu_balance.h pppm_gpu_memory.h

 ALL_H  = $(NVD_H) $(PAIR_H)

 EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
 OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
        $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
        $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
        $(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
        $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
        $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
        $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
        $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
        $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h

 all: $(GPU_LIB) $(EXECS)

 $(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini

 $(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini

 $(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini

 $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini

 $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu

 $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu

 $(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu

 $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h

 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu

 $(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h

 $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu

 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h

 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu

 $(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h

 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp

 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp

 $(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu

 $(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h

 $(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu

 $(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h

 $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
 	$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu

 $(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu

 $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu

 $(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h

 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
 	$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu

 $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h

 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu

 $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h

 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu

 $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h

 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu

 $(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h

 $(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp
 	$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu

 $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h

 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu

 $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h

 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu

 $(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h

 $(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp
 	$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu

 $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h

 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu

 $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h

 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx: cmmc_msm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_msm_gpu_kernel.cu

 $(OBJ_DIR)/cmmc_msm_gpu_ptx.h: $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h

 $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_memory.cpp $(OBJ_DIR)/cmmc_msm_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_msm_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)

 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK)

 $(GPU_LIB): $(OBJS)
 	$(AR) -crusv $(GPU_LIB) $(OBJS)

 clean:
 	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo

 veryclean: clean
 	rm -rf *~ *.linkinfo
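Both makefiles funnel every kernel through geryon/file_to_cstr.sh, so the library JIT-loads kernel text at run time instead of linking device binaries. A minimal sketch, assuming the script's output convention, of what a generated *_ptx.h header is expected to look like and how the host consumes it; the PTX body shown is a placeholder:

// Hypothetical excerpt of $(OBJ_DIR)/pair_gpu_dev_ptx.h after file_to_cstr.sh
// runs: the whole PTX file becomes one C string constant whose name matches
// the kernel source stem referenced by the host code.
const char * pair_gpu_dev_kernel =
  ".version 2.1\n"                      // placeholder PTX header lines
  ".target sm_13\n"
  "/* ...rest of the PTX text... */\n";

// Host side (see compile_kernels() in pair_gpu_device.cpp below):
//   dev_program = new UCL_Program(*gpu);
//   dev_program->load_string(pair_gpu_dev_kernel, "-cl-mad-enable");
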
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 3e758bb75..9c601bc71 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -1,207 +1,207 @@
 # /* ----------------------------------------------------------------------
 #    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
 #    http://lammps.sandia.gov, Sandia National Laboratories
 #    Steve Plimpton, sjplimp@sandia.gov
 #
 #    Copyright (2003) Sandia Corporation.  Under the terms of Contract
 #    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
 #    certain rights in this software.  This software is distributed under
 #    the GNU General Public License.
 #
 #    See the README file in the top-level LAMMPS directory.
 # ------------------------------------------------------------------------- */
 #
 # /* ----------------------------------------------------------------------
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                          Peng Wang (Nvidia), penwang@nvidia.com
 #                          Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

 OCL  = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
 OCL_LIB = $(LIB_DIR)/libgpu.a

 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
 PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
           pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
-          pair_gpu_balance.h
+          pair_gpu_balance.h pppm_gpu_memory.h

 ALL_H  = $(OCL_H) $(PAIR_H)

 EXECS = $(BIN_DIR)/ocl_get_devices
 OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
        $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
        $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
 KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
        $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
        $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
        $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
        $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
        $(OBJ_DIR)/cmmc_long_gpu_cl.h

 OCL_EXECS = $(BIN_DIR)/ocl_get_devices

 all: $(OCL_LIB) $(EXECS)

 $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h

 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h

 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
 	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h

 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
 	$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp

 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp

 $(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;

 $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
 	$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h

 $(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
 	cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
 	cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
 	rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar

 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
 	$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;

 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;

 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;

 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;

 $(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp
 	$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;

 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;

 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;

 $(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp
 	$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;

 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;

 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

 $(OCL_LIB): $(OBJS) $(PTXS)
 	$(AR) -crusv $(OCL_LIB) $(OBJS)

 opencl: $(OCL_EXECS)

 clean:
 	rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo

 veryclean: clean
 	rm -rf *~ *.linkinfo
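Unlike the CUDA path, the OpenCL makefile embeds the .cu sources directly (no --ptx step), so one kernel file serves both back ends. A hedged sketch of the single-source convention this implies, with illustrative names only: nvcc builds pppm_gpu_kernel.cu twice with -Dgrdtyp=float/-Dgrdtyp4=float4 and the double variants, while the OpenCL path injects the same defines at JIT time (see compile_kernels() in pppm_gpu_memory.cpp below); an NV_KERNEL guard is assumed to supply CUDA spellings for the OpenCL qualifiers.

// pppm_gpu_kernel.cu -- assumed dual-path layout (illustrative, not the
// actual kernel from this patch)
#ifdef NV_KERNEL
// CUDA build: map the OpenCL-style qualifiers used below onto CUDA ones
// (the real shim lives in the kernel sources; this is a sketch).
#define __kernel extern "C" __global__
#define __global
#endif

// grdtyp/grdtyp4 come from -Dgrdtyp=... (nvcc) or "-D grdtyp=..." (OpenCL JIT)
__kernel void particle_map(__global grdtyp4 *x, __global grdtyp *q /* ... */)
{
  /* charge-to-grid mapping body */
}
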
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 837e5f52c..83d20a51c 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -1,529 +1,546 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov

    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.

    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */

 #include "pair_gpu_device.h"
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif

 #ifdef USE_OPENCL
 #include "pair_gpu_dev_cl.h"
 #else
 #include "pair_gpu_dev_ptx.h"
 #endif

 #define BLOCK_1D 64
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>

 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
                                   _last_device(0), _compiled(false) {
 }

 template <class numtyp, class acctyp>
 PairGPUDeviceT::~PairGPUDevice() {
   clear_device();
 }

 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
                                  const int nthreads) {
   _nthreads=nthreads;
 #ifdef _OPENMP
   omp_set_num_threads(nthreads);
 #endif

   if (_device_init)
     return true;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
   _first_device=first_gpu;
   _last_device=last_gpu;
   _gpu_mode=gpu_mode;
   _particle_split=p_split;

   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
   MPI_Comm_size(_comm_world,&_world_size);
   // Get the rank/size within the replica
   MPI_Comm_rank(_comm_replica,&_replica_me);
   MPI_Comm_size(_comm_replica,&_replica_size);

   // Get the names of all nodes
   int name_length;
   char node_name[MPI_MAX_PROCESSOR_NAME];
   char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
   MPI_Get_processor_name(node_name,&name_length);
   MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                 MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
   std::string node_string=std::string(node_name);

   // Get the number of procs per node
   std::map<std::string,int> name_map;
   std::map<std::string,int>::iterator np;
   for (int i=0; i<_world_size; i++) {
     std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
     np=name_map.find(i_string);
     if (np==name_map.end())
       name_map[i_string]=1;
     else
       np->second++;
   }
   int procs_per_node=name_map.begin()->second;

   // Assign a unique id to each node
   int split_num=0, split_id=0;
   for (np=name_map.begin(); np!=name_map.end(); ++np) {
     if (np->first==node_string)
       split_id=split_num;
     split_num++;
   }

   // Set up a per node communicator and find rank within
   MPI_Comm node_comm;
   MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
   int node_rank;
   MPI_Comm_rank(node_comm,&node_rank);

   // set the device ID
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
   int my_gpu=node_rank/_procs_per_gpu+first_gpu;

   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
   MPI_Comm_rank(_comm_gpu,&_gpu_rank);

   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
     return false;

   gpu->set(my_gpu);

   _block_size=BLOCK_1D;
   if (static_cast<size_t>(_block_size)>gpu->group_size())
     _block_size=gpu->group_size();

+  _long_range_precompute=0;
+
   return true;
 }

 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
                           const bool rot, const int nlocal,
                           const int host_nlocal, const int nall,
                           PairGPUNbor *nbor, const int maxspecial,
                           const int gpu_host, const int max_nbors,
                           const double cell_size, const bool pre_cut) {
   if (!_device_init)
     return false;

   // Counts of data transfers for timing overhead estimates
   _data_in_estimate=0;
   _data_out_estimate=1;

   // Initial number of local particles
   int ef_nlocal=nlocal;
   if (_particle_split<1.0 && _particle_split>0.0)
     ef_nlocal=static_cast<int>(_particle_split*nlocal);

   bool gpu_nbor=false;
   if (_gpu_mode==GPU_NEIGH)
     gpu_nbor=true;

   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
       return false;

     compile_kernels();

     _data_in_estimate++;
     if (charge)
       _data_in_estimate++;
     if (rot)
       _data_in_estimate++;
   } else {
     if (atom.charge()==false && charge)
       _data_in_estimate++;
     if (atom.quat()==false && rot)
       _data_in_estimate++;
     atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial);
   }

   if (!ans.init(ef_nlocal,charge,rot,*gpu))
     return false;

   if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut))
     return false;
   nbor->cell_size(cell_size);

   _init_count++;
   return true;
 }

 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
                           const int nall) {
   if (!_device_init)
     return false;
   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(nall,true,false,*gpu,false,false))
       return false;
     compile_kernels();
   } else
     atom.add_fields(true,false,false,false);

   if (!ans.init(nlocal,true,false,*gpu))
     return false;

   _init_count++;
   return true;
 }

+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_single_precompute
+    (PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
+  _long_range_precompute=1;
+  pppm_single=pppm;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_double_precompute
+    (PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
+  _long_range_precompute=2;
+  pppm_double=pppm;
+}
+
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::init_message(FILE *screen, const char *name,
                                   const int first_gpu, const int last_gpu) {
 #ifdef USE_OPENCL
   std::string fs="";
 #else
   std::string fs=toa(gpu->free_gigabytes())+"/";
 #endif

   if (_replica_me == 0 && screen) {
     fprintf(screen,"\n-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
     fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
     fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
 #ifdef _OPENMP
     fprintf(screen,"- with %d threads per proc.\n",_nthreads);
 #endif
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n");

     for (int i=first_gpu; i<=last_gpu; i++) {
       std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                         toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                         " GHZ (";
       if (sizeof(PRECISION)==4) {
         if (sizeof(ACC_PRECISION)==4)
           sname+="Single Precision)";
         else
           sname+="Mixed Precision)";
       } else
         sname+="Double Precision)";
       fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
     }

     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n\n");
   }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
                                            double &gpu_overhead,
                                            double &gpu_driver_overhead) {
   UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
   UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
   UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
   UCL_Timer over_timer(*gpu);

   if (_data_in_estimate>0) {
     host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
     dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
     timers_in=new UCL_Timer[_data_in_estimate];
   }

   if (_data_out_estimate>0) {
     host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
     dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
     timers_out=new UCL_Timer[_data_out_estimate];
   }

   if (kernel_calls>0) {
     kernel_data=new UCL_D_Vec<int>[kernel_calls];
     timers_kernel=new UCL_Timer[kernel_calls];
   }

   for (int i=0; i<_data_in_estimate; i++) {
     host_data_in[i].alloc(1,*gpu);
     dev_data_in[i].alloc(1,*gpu);
     timers_in[i].init(*gpu);
   }

   for (int i=0; i<_data_out_estimate; i++) {
     host_data_out[i].alloc(1,*gpu);
     dev_data_out[i].alloc(1,*gpu);
     timers_out[i].init(*gpu);
   }

   for (int i=0; i<kernel_calls; i++) {
     kernel_data[i].alloc(1,*gpu);
     timers_kernel[i].init(*gpu);
   }

   gpu_overhead=0.0;
   gpu_driver_overhead=0.0;

   for (int i=0; i<10; i++) {
     gpu->sync();
     gpu_barrier();
     over_timer.start();
     gpu->sync();
     gpu_barrier();

     double driver_time=MPI_Wtime();
     for (int i=0; i<_data_in_estimate; i++) {
       timers_in[i].start();
       ucl_copy(dev_data_in[i],host_data_in[i],true);
       timers_in[i].stop();
     }

     for (int i=0; i<kernel_calls; i++) {
       timers_kernel[i].start();
       zero(kernel_data[i],1);
       timers_kernel[i].stop();
     }

     for (int i=0; i<_data_out_estimate; i++) {
       timers_out[i].start();
       ucl_copy(host_data_out[i],dev_data_out[i],true);
       timers_out[i].stop();
     }
     over_timer.stop();

     double time=over_timer.seconds();
     driver_time=MPI_Wtime()-driver_time;

     for (int i=0; i<_data_in_estimate; i++)
       timers_in[i].add_to_total();
     for (int i=0; i<kernel_calls; i++)
       timers_kernel[i].add_to_total();
     for (int i=0; i<_data_out_estimate; i++)
       timers_out[i].add_to_total();

     double mpi_time, mpi_driver_time;
     MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
     MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,
                   gpu_comm());
     gpu_overhead+=mpi_time;
     gpu_driver_overhead+=mpi_driver_time;
   }
   gpu_overhead/=10.0;
   gpu_driver_overhead/=10.0;

   if (_data_in_estimate>0) {
     delete [] host_data_in;
     delete [] dev_data_in;
     delete [] timers_in;
   }
   if (_data_out_estimate>0) {
     delete [] host_data_out;
     delete [] dev_data_out;
     delete [] timers_out;
   }
   if (kernel_calls>0) {
     delete [] kernel_data;
     delete [] timers_kernel;
   }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
                                   PairGPUAns<numtyp,acctyp> &ans,
                                   PairGPUNbor &nbor, const double avg_split,
                                   const double max_bytes,
                                   const double gpu_overhead,
                                   const double driver_overhead,
                                   FILE *screen) {
   double single[8], times[8];

   single[0]=atom.transfer_time()+ans.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
   single[4]=atom.cast_time()+ans.cast_time();
   single[5]=gpu_overhead;
   single[6]=driver_overhead;
   single[7]=ans.cpu_idle_time();

   MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);

   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              _comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);

   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");

       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
         if (nbor.gpu_nbor())
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
       }
       fprintf(screen,"GPU Overhead:    %.4f s.\n",times[5]/_replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
       fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
       fprintf(screen,"CPU Idle_Time:   %.4f s.\n",times[7]/_replica_size);
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
                                          UCL_Timer &time_out,
                                          UCL_Timer &time_map,
                                          UCL_Timer &time_rho,
                                          UCL_Timer &time_interp,
                                          PairGPUAns<numtyp,acctyp> &ans,
                                          const double max_bytes,
                                          const double cpu_time,
                                          FILE *screen) {
   double single[7], times[7];

 //  single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+
 //            time_out.total_seconds();
 //  single[1]=atom.cast_time()+ans.cast_time();
   single[0]=time_out.total_seconds();
   single[1]=time_in.total_seconds();
   single[2]=time_map.total_seconds();
   single[3]=time_rho.total_seconds();
   single[4]=time_interp.total_seconds();
   single[5]=ans.transfer_time()+ans.cast_time();
   single[6]=cpu_time;

   MPI_Reduce(single,times,7,MPI_DOUBLE,MPI_SUM,0,_comm_replica);

   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              _comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);

   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");

       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Out:        %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data In:         %.4f s.\n",times[1]/_replica_size);
         fprintf(screen,"Kernel (map):    %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Kernel (rho):    %.4f s.\n",times[3]/_replica_size);
         fprintf(screen,"Force interp:    %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Total rho:       %.4f s.\n",
                 (times[0]+times[2]+times[3])/_replica_size);
         fprintf(screen,"Total interp:    %.4f s.\n",
                 (times[1]+times[4])/_replica_size);
         fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
         fprintf(screen,"Total:           %.4f s.\n",
                 (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
                 _replica_size);
         fprintf(screen,"CPU Poisson:     %.4f s.\n",times[6]/_replica_size);
       }
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
+    _long_range_precompute=0;
     _init_count--;
     if (_init_count==0) {
       atom.clear();
       _nbor_shared.clear();
       if (_compiled) {
         k_zero.clear();
         delete dev_program;
         _compiled=false;
       }
     }
   }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear_device() {
   while (_init_count>0)
     clear();
   if (_device_init) {
     delete gpu;
     _device_init=false;
   }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::compile_kernels() {
   if (_compiled)
     return;

   std::string flags="-cl-mad-enable";
   dev_program=new UCL_Program(*gpu);
   dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
   k_zero.set_function(*dev_program,"kernel_zero");
   _compiled=true;
 }

 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
   return atom.host_memory_usage()+4*sizeof(numtyp)+
          sizeof(PairGPUDevice<numtyp,acctyp>);
 }

 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

 bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                      const int last_gpu, const int gpu_mode,
                      const double particle_split, const int nthreads) {
   return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,
                                      gpu_mode,particle_split,nthreads);
 }

 void lmp_clear_device() {
   pair_gpu_device.clear_device();
 }

 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
   return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
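The device assignment in init_device() above reduces to two integer formulas. A worked example under an assumed node layout (8 ranks per node, GPUs 0 and 1); only the layout numbers are hypothetical:

// Assumed: procs_per_node = 8, first_gpu = 0, last_gpu = 1.
// _procs_per_gpu = ceil(8 / (1 - 0 + 1)) = 4
// node_rank 0..3 -> my_gpu = node_rank/4 + 0 = GPU 0
// node_rank 4..7 -> my_gpu = node_rank/4 + 0 = GPU 1
int procs_per_gpu = static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                          (last_gpu-first_gpu+1)));
int my_gpu = node_rank/procs_per_gpu + first_gpu;
// MPI_Comm_split(node_comm, my_gpu, 0, &_comm_gpu) then gives each group of
// four ranks sharing a device its own communicator for load balancing.
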
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index ccb9d31c8..c0769bff9 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -1,227 +1,253 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov

    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.

    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */

 #ifndef PAIR_GPU_DEVICE_H
 #define PAIR_GPU_DEVICE_H

 #include "pair_gpu_atom.h"
 #include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
+#include "pppm_gpu_memory.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
 #include <queue>

+template <class numtyp, class acctyp,
+          class grdtyp, class grdtyp4> class PPPMGPUMemory;
+
 template <class numtyp, class acctyp>
 class PairGPUDevice {
  public:
   PairGPUDevice();
   ~PairGPUDevice();

   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
     * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
   bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                    const int last_gpu, const int gpu_mode,
                    const double particle_split, const int nthreads);

   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
     * \param nlocal Total number of local particles to allocate memory for
     * \param host_nlocal Initial number of host particles to allocate memory for
     * \param nall Total number of local+ghost particles
     * \param gpu_nbor True if neighboring is performed on device
     * \param gpu_host 0 if host will not perform force calculations,
     *                 1 if gpu_nbor is true, and host needs a half nbor list,
     *                 2 if gpu_nbor is true, and host needs a full nbor list
     * \param max_nbors Initial number of rows in the neighbor matrix
     * \param cell_size cutoff+skin
     * \param pre_cut True if cutoff test will be performed in separate kernel
     *                than the force kernel **/
   bool init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
             const int nlocal, const int host_nlocal, const int nall,
             PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
             const int max_nbors, const double cell_size, const bool pre_cut);

   /// Initialize the device for Atom storage only
   /** \param nlocal Total number of local particles to allocate memory for
     * \param nall Total number of local+ghost particles **/
   bool init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);

   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name, const int first_gpu,
                     const int last_gpu);

+  /// Perform charge assignment asynchronously for PPPM
+  void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);
+
+  /// Perform charge assignment asynchronously for PPPM
+  void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);
+
   /// Estimate the overhead from GPU calls from multiple procs
   /** \param kernel_calls Number of kernel calls/timestep for timing estimated
     *                     overhead
     * \param gpu_overhead Estimated gpu overhead per timestep (sec)
     * \param driver_overhead Estimated overhead from driver per timestep (s) **/
   void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
                              double &gpu_driver_overhead);

   /// Returns true if double precision is supported on card
   inline bool double_precision() { return gpu->double_precision(); }

   /// Output a message with timing information
   void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
                     PairGPUNbor &nbor, const double avg_split,
                     const double max_bytes, const double gpu_overhead,
                     const double driver_overhead, FILE *screen);

   /// Output a message with timing information
   void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
                            UCL_Timer &time_map, UCL_Timer &time_rho,
                            UCL_Timer &time_interp,
                            PairGPUAns<numtyp,acctyp> &ans,
                            const double max_bytes, const double cpu_time,
                            FILE *screen);

   /// Clear all memory on host and device associated with atom and nbor data
   void clear();

   /// Clear all memory on host and device
   void clear_device();

   /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
   inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
     { ans_queue.push(ans); }

   /// Add "answers" (force,energies,etc.) into LAMMPS structures
   inline double fix_gpu(double **f, double **tor, double *eatom,
                         double **vatom, double *virial, double &ecoul) {
     atom.data_unavail();
     if (ans_queue.empty()==false) {
       stop_host_timer();
       double evdw=0.0;
       while (ans_queue.empty()==false) {
         evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
         ans_queue.pop();
       }
       return evdw;
     }
     return 0.0;
   }

   /// Start timer on host
   inline void start_host_timer()
     { _cpu_full=MPI_Wtime(); _host_timer_started=true; }

   /// Stop timer on host
   inline void stop_host_timer() {
     if (_host_timer_started) {
       _cpu_full=MPI_Wtime()-_cpu_full;
       _host_timer_started=false;
     }
   }

   /// Return host time
   inline double host_time() { return _cpu_full; }

   /// Return host memory usage in bytes
   double host_memory_usage() const;

   /// Return the number of procs sharing a device (size of device communicator)
   inline int procs_per_gpu() const { return _procs_per_gpu; }
   /// Return the number of threads per proc
   inline int num_threads() const { return _nthreads; }
   /// My rank within all processes
   inline int world_me() const { return _world_me; }
   /// Total number of processes
   inline int world_size() const { return _world_size; }
   /// MPI Barrier for world
   inline void world_barrier() { MPI_Barrier(_comm_world); }
   /// Return the replica MPI communicator
   inline MPI_Comm & replica() { return _comm_replica; }
   /// My rank within replica communicator
   inline int replica_me() const { return _replica_me; }
   /// Number of procs in replica communicator
   inline int replica_size() const { return _replica_size; }
   /// Return the per-GPU MPI communicator
   inline MPI_Comm & gpu_comm() { return _comm_gpu; }
   /// Return my rank in the device communicator
   inline int gpu_rank() const { return _gpu_rank; }
   /// MPI Barrier for gpu
   inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
   /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
   inline int gpu_mode() const { return _gpu_mode; }
   /// Index of first device used by a node
   inline int first_device() const { return _first_device; }
   /// Index of last device used by a node
   inline int last_device() const { return _last_device; }
   /// Particle split defined in fix
   inline double particle_split() const { return _particle_split; }
   /// Return the initialization count for the device
   inline int init_count() const { return _init_count; }

   // -------------------- SHARED DEVICE ROUTINES --------------------
   // Perform asynchronous zero of integer array
   void zero(UCL_D_Vec<int> &mem, const int numel) {
     int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
                                          _block_size));
     k_zero.set_size(num_blocks,_block_size);
     k_zero.run(&mem.begin(),&numel);
   }

   // -------------------------- DEVICE DATA -------------------------

   /// Geryon Device
   UCL_Device *gpu;

   enum{GPU_FORCE, GPU_NEIGH};

   // --------------------------- ATOM DATA --------------------------

   /// Atom Data
   PairGPUAtom<numtyp,acctyp> atom;

   // --------------------------- NBOR DATA ----------------------------

   /// Neighbor Data
   PairGPUNborShared _nbor_shared;

+  // ------------------------ LONG RANGE DATA -------------------------
+
+  // Long Range Data
+  int _long_range_precompute;
+  PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
+  PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
+  /// Precomputations for long range charge assignment (asynchronously)
+//  inline void precompute(const int ago, const int nlocal, const int nall,
+//                         double **host_x, int *host_type, bool &success,
+//                         double *charge, double *boxlo, const double delxinv,
+//                         const double delyinv, const double delzinv) {
+
+
  private:
   std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
   bool _device_init, _host_timer_started;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
   int _gpu_mode, _first_device, _last_device, _nthreads;
   double _particle_split;
   double _cpu_full;
   int _block_size;

   UCL_Program *dev_program;
   UCL_Kernel k_zero;
   bool _compiled;
   void compile_kernels();

   int _data_in_estimate, _data_out_estimate;

   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
     o.precision(2);
     o << in;
     return o.str();
   }
 };

 #endif
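The header above exposes the two registration hooks implemented in pair_gpu_device.cpp. A minimal sketch of the intended call pattern, assuming a hypothetical PPPM host object; the patch itself only defines the setters and the _long_range_precompute flag values (1 = float grid, 2 = double grid):

// Hypothetical caller registering a single-precision grid object:
PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4> pppm_gpu;
pair_gpu_device.set_single_precompute(&pppm_gpu); // _long_range_precompute=1
// A force/neighbor path could later test _long_range_precompute and call
// pppm_single->precompute(...) so charge assignment overlaps host work.
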
atom=&device->atom; _block_size=BLOCK_1D; assert(BLOCK_PENCILS*PENCIL_SIZE==BLOCK_1D); assert(MAX_SPLINE*MAX_SPLINE<=BLOCK_1D); if (static_cast<size_t>(_block_size)>ucl_device->group_size()) _block_size=ucl_device->group_size(); compile_kernels(*ucl_device); // Initialize timers for the selected GPU time_in.init(*ucl_device); time_in.zero(); time_out.init(*ucl_device); time_out.zero(); time_map.init(*ucl_device); time_map.zero(); time_rho.init(*ucl_device); time_rho.zero(); time_interp.init(*ucl_device); time_interp.zero(); pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); _allocated=true; _max_bytes=0; _max_an_bytes=ans->gpu_bytes(); _order=order; _order_m_1=order-1; _order2=_order_m_1*_order; _nlower=-(_order-1)/2; _nupper=order/2; _nxlo_out=nxlo_out; _nylo_out=nylo_out; _nzlo_out=nzlo_out; _nxhi_out=nxhi_out; _nyhi_out=nyhi_out; _nzhi_out=nzhi_out; + + _slab_volfactor=slab_volfactor; + _nx_pppm=nx_pppm; + _ny_pppm=ny_pppm; + _nz_pppm=nz_pppm; + _max_brick_atoms=10; // Get rho_coeff on device int n2lo=(1-order)/2; int numel=order*( order/2 - n2lo + 1 ); success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)== UCL_SUCCESS); UCL_H_Vec<double> view; view.view(rho_coeff[0]+n2lo,numel,*ucl_device); ucl_copy(d_rho_coeff,view,true); _max_bytes+=d_rho_coeff.row_bytes(); // Allocate storage for grid _npts_x=nxhi_out-nxlo_out+1; _npts_y=nyhi_out-nylo_out+1; _npts_z=nzhi_out-nzlo_out+1; _npts_yx=_npts_x*_npts_y; success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== UCL_SUCCESS); success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)== UCL_SUCCESS); success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== UCL_SUCCESS); *vd_brick=h_vd_brick.begin(); _max_bytes+=d_brick.row_bytes(); // Allocate vector with count of atoms assigned to each grid point _nlocal_x=_npts_x+_nlower-_nupper; _nlocal_y=_npts_y+_nlower-_nupper; _nlocal_z=_npts_z+_nlower-_nupper; _nlocal_yx=_nlocal_x*_nlocal_y; _atom_stride=_nlocal_x*_nlocal_y*_nlocal_z; success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)== UCL_SUCCESS); _max_bytes+=d_brick_counts.row_bytes(); // Allocate storage for atoms assigned to each grid point success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms, *ucl_device)==UCL_SUCCESS); _max_bytes+=d_brick_atoms.row_bytes(); // Allocate error flags for checking out of bounds atoms success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS); success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)== UCL_SUCCESS); if (!success) { flag=-3; return 0; } d_error_flag.zero(); _max_bytes+=1; return h_brick.begin(); } template <class numtyp, class acctyp, class grdtyp, class grdtyp4> void PPPMGPUMemoryT::clear(const double cpu_time) { if (!_allocated) return; _allocated=false; + _precompute_done=false; d_brick.clear(); h_brick.clear(); h_vd_brick.clear(); d_brick_counts.clear(); h_error_flag.clear(); d_error_flag.clear(); d_brick_atoms.clear(); acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, *ans,_max_bytes+_max_an_bytes,cpu_time,screen); if (_compiled) { k_particle_map.clear(); k_make_rho.clear(); k_interp.clear(); delete pppm_program; _compiled=false; } time_in.clear(); time_out.clear(); time_map.clear(); time_rho.clear(); time_interp.clear(); device->clear(); } // --------------------------------------------------------------------------- -// Charge spreading stuff +// Charge assignment that can be performed asynchronously // 
--------------------------------------------------------------------------- template <class numtyp, class acctyp, class grdtyp, class grdtyp4> -int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall, - double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, - const double delxinv, const double delyinv, - const double delzinv) { +void PPPMGPUMemoryT::precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { + if (_precompute_done) + return; + device->stop_host_timer(); acc_timers(); if (nlocal==0) { zero_timers(); - return 0; + return; } ans->inum(nlocal); if (ago==0) { resize_atom(nlocal,nall,success); resize_local(nlocal,success); if (!success) - return 0; + return; double bytes=ans->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); atom->add_x_data(host_x,host_type); atom->add_q_data(); time_map.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); - int _max_atoms=10; int ainum=this->ans->inum(); // Boxlo adjusted to be upper left brick and shift for even spline order double shift=0.0; if (_order % 2) shift=0.5; _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; _delxinv=delxinv; _delyinv=delyinv; _delzinv=delzinv; double delvolinv = delxinv*delyinv*delzinv; grdtyp f_delvolinv = delvolinv; d_brick_counts.zero(); k_particle_map.set_size(GX,BX); k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv, &ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(), &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, &d_error_flag.begin()); time_map.stop(); time_rho.start(); BX=block_size(); GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/BLOCK_PENCILS)); k_make_rho.set_size(GX,BX); k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(), &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2); time_rho.stop(); time_out.start(); ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true); - ucl_copy(h_error_flag,d_error_flag,false); + ucl_copy(h_error_flag,d_error_flag,true); time_out.stop(); + _precompute_done=true; +} + +// --------------------------------------------------------------------------- +// Charge spreading stuff +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp, class grdtyp, class grdtyp4> +int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { + precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv, + delyinv,delzinv); + if (!success || nlocal==0) + return 0; + + time_out.sync_stop(); + + _precompute_done=false; + if (h_error_flag[0]==2) { // Not enough storage for atoms on the brick _max_brick_atoms*=2; d_error_flag.zero(); d_brick_atoms.clear(); - d_brick_atoms.alloc(_atom_stride*_max_atoms,*ucl_device); + 
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device); _max_bytes+=d_brick_atoms.row_bytes(); return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } return h_error_flag[0]; } + // --------------------------------------------------------------------------- // Interpolate grid electric field to forces on atoms // --------------------------------------------------------------------------- template <class numtyp, class acctyp, class grdtyp, class grdtyp4> void PPPMGPUMemoryT::interp(const grdtyp qqrd2e_scale) { time_in.start(); ucl_copy(d_brick,h_vd_brick,true); time_in.stop(); time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX)); int ainum=this->ans->inum(); k_interp.set_size(GX,BX); k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin()); time_interp.stop(); ans->copy_answers(false,false,false,false); device->add_ans_object(ans); } template <class numtyp, class acctyp, class grdtyp, class grdtyp4> double PPPMGPUMemoryT::host_memory_usage() const { return device->atom.host_memory_usage()+ sizeof(PPPMGPUMemory<numtyp,acctyp,grdtyp,grdtyp4>); } template <class numtyp, class acctyp, class grdtyp, class grdtyp4> void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) { if (_compiled) return; std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ std::string(OCL_PRECISION_COMPILE); #ifdef USE_OPENCL flags+=std::string("-D grdtyp=")+ucl_template_name<grdtyp>()+" -D grdtyp4="+ ucl_template_name<grdtyp>()+"4"; #endif pppm_program=new UCL_Program(dev); if (sizeof(grdtyp)==sizeof(float)) pppm_program->load_string(pppm_f_gpu_kernel,flags.c_str()); else pppm_program->load_string(pppm_d_gpu_kernel,flags.c_str()); k_particle_map.set_function(*pppm_program,"particle_map"); k_make_rho.set_function(*pppm_program,"make_rho"); k_interp.set_function(*pppm_program,"interp"); pos_tex.get_texture(*pppm_program,"pos_tex"); q_tex.get_texture(*pppm_program,"q_tex"); _compiled=true; } template class PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4>; template class PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
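The precompute()/spread() pair above follows a grow-and-retry pattern: the particle_map kernel raises the device error flag to 2 when some grid point receives more than _max_brick_atoms atoms, and spread() then doubles the capacity, reallocates, and reruns the whole assignment. A minimal standalone sketch of that pattern, with hypothetical stand-in types in place of the UCL containers and kernels:

#include <cstddef>
#include <vector>

// Stand-in for the device-side brick-atom storage (UCL_D_Vec<grdtyp4>).
struct BrickAtoms {
  int stride;           // number of local grid points (_atom_stride)
  int max_per_point;    // capacity per grid point (_max_brick_atoms)
  std::vector<float> buf;
  void realloc() { buf.assign(static_cast<size_t>(stride)*max_per_point*4,0.f); }
};

// run_assignment stands in for the particle_map/make_rho kernel pair and
// returns the error flag: 0 on success, 2 if max_per_point overflowed.
int assign_with_retry(BrickAtoms &b, int (*run_assignment)(BrickAtoms &)) {
  int flag=run_assignment(b);
  while (flag==2) {       // not enough storage at some grid point
    b.max_per_point*=2;   // mirrors _max_brick_atoms*=2 in spread()
    b.realloc();
    flag=run_assignment(b);
  }
  return flag;            // 0, or a different (fatal) error code
}

The recursion in spread() terminates for the same reason this loop does: each retry doubles the capacity, so for a fixed atom count the flag can be 2 only finitely many times.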
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h index f996cdb1a..7fa666675 100644 --- a/lib/gpu/pppm_gpu_memory.h +++ b/lib/gpu/pppm_gpu_memory.h @@ -1,174 +1,186 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PPPM_GPU_MEMORY_H #define PPPM_GPU_MEMORY_H -#include "pair_gpu_device.h" -#include "pair_gpu_balance.h" #include "mpi.h" +#include "pair_gpu_device.h" #ifdef USE_OPENCL #include "geryon/ocl_texture.h" #else #include "geryon/nvd_texture.h" #endif +template <class numtyp, class acctyp> class PairGPUDevice; + template <class numtyp, class acctyp, class grdtyp, class grdtyp4> class PPPMGPUMemory { public: PPPMGPUMemory(); virtual ~PPPMGPUMemory(); /// Clear any previous data and set up for a new LAMMPS run /** Success will be: * - 0 if successful * - -1 if fix gpu not found * - -2 if GPU could not be found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for this GPU * - -5 if double precision is not supported on the card **/ grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, - double **rho_coeff, grdtyp **vd_brick, int &success); + double **rho_coeff, grdtyp **vd_brick, + const double slab_volfactor, const int nx_pppm, + const int ny_pppm, const int nz_pppm, int &success); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { if (atom->resize(nall, success)) { pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); } ans->resize(inum,success); } /// Check if there is enough storage for local atoms and realloc if not inline void resize_local(const int inum, bool &success) { } /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(const double cpu_time); /// Returns memory usage on device per atom int bytes_per_atom() const; /// Total host memory used by library for pair style double host_memory_usage() const; /// Accumulate timers inline void acc_timers() { atom->acc_timers(); ans->acc_timers(); time_in.add_to_total(); time_out.add_to_total(); time_map.add_to_total(); time_rho.add_to_total(); time_interp.add_to_total(); } /// Zero timers inline void zero_timers() { atom->zero_timers(); ans->zero_timers(); time_in.zero(); time_out.zero(); time_map.zero(); time_rho.zero(); time_interp.zero(); } + /// Precomputations for charge assignment that can be done asynchronously + void precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *charge, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); + /// Returns non-zero if there are out-of-bounds atoms - int spread(const int ago,const int nlocal,const int nall,double **host_x, - int *host_type,bool &success,double *charge,double *boxlo, - const double delxinv,const double delyinv,const double delzinv); + int spread(const int ago, const int nlocal, const int nall, double **host_x, + int *host_type, bool &success, double *charge, double *boxlo, + const double delxinv, const double delyinv, const double delzinv); void interp(const grdtyp qqrd2e_scale); // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage
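/** The pointer below ties each PPPM object to the shared PairGPUDevice.
 *  Registration appears to work by handing the object to the device (see
 *  the set_single_precompute() calls in pppm_l_gpu.cpp), presumably so the
 *  device can launch precompute() asynchronously while the host timer is
 *  still running; the answer queue in pair_gpu_device.h points the same
 *  way. This is inferred from the call sites, not from documented API. **/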
PairGPUDevice<numtyp,acctyp> *device; /// Geryon device UCL_Device *ucl_device; /// Device Timers UCL_Timer time_in, time_out, time_map, time_rho, time_interp; /// LAMMPS pointer for screen output FILE *screen; // --------------------------- ATOM DATA -------------------------- /// Atom Data PairGPUAtom<numtyp,acctyp> *atom; // --------------------------- GRID DATA -------------------------- UCL_H_Vec<grdtyp> h_brick, h_vd_brick; UCL_D_Vec<grdtyp> d_brick; // Count of number of atoms assigned to each grid point UCL_D_Vec<int> d_brick_counts; // Atoms assigned to each grid point UCL_D_Vec<grdtyp4> d_brick_atoms; // Error checking for out of bounds atoms UCL_D_Vec<int> d_error_flag; UCL_H_Vec<int> h_error_flag; // Number of grid points in brick (including ghost) int _npts_x, _npts_y, _npts_z, _npts_yx; // Number of local grid points in brick int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride; // -------------------------- SPLINE DATA ------------------------- UCL_D_Vec<grdtyp> d_rho_coeff; int _order, _nlower, _nupper, _order_m_1, _order2; int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out; // ------------------------ FORCE/ENERGY DATA ----------------------- PairGPUAns<numtyp,acctyp> *ans; // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pppm_program; UCL_Kernel k_particle_map, k_make_rho, k_interp; inline int block_size() { return _block_size; } inline int block_x_size() { return _block_x_size; } inline int block_y_size() { return _block_y_size; } // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; UCL_Texture q_tex; protected: - bool _allocated, _compiled; - int _block_size, _block_x_size, _block_y_size, _max_brick_atoms; + bool _allocated, _compiled, _precompute_done; + int _block_size, _block_x_size, _block_y_size, _max_brick_atoms, _max_atoms; double _max_bytes, _max_an_bytes; grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; + double _slab_volfactor; + int _nx_pppm, _ny_pppm, _nz_pppm; + void compile_kernels(UCL_Device &dev); }; #endif diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp index 6f207113f..1e434d66a 100644 --- a/lib/gpu/pppm_l_gpu.cpp +++ b/lib/gpu/pppm_l_gpu.cpp @@ -1,152 +1,165 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #include <iostream> #include <cassert> #include <math.h> #include "pppm_gpu_memory.h" using namespace std; static PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF; static PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- template <class grdtyp, class memtyp> grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - grdtyp **vd_brick, int &success) { + grdtyp **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, const int nz_pppm, + int &success) { pppm.clear(0.0); int first_gpu=pppm.device->first_device(); int last_gpu=pppm.device->last_device(); int world_me=pppm.device->world_me(); int gpu_rank=pppm.device->gpu_rank(); int procs_per_gpu=pppm.device->procs_per_gpu(); pppm.device->init_message(screen,"pppm",first_gpu,last_gpu); bool message=false; if (pppm.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } success=0; grdtyp * host_brick=NULL; if (world_me==0) { host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out, - nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,success); + nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); } pppm.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) { host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, - vd_brick, success); + vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, + success); } pppm.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); return host_brick; } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - float **vd_brick, int &success) { - return pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, - nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, - success); + float **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, const int nz_pppm, + int &success) { + float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); + PPPMF.device->set_single_precompute(&PPPMF); + return b; } void pppm_gpu_clear_f(const double cpu_time) { PPPMF.clear(cpu_time); } int pppm_gpu_spread_f(const int ago, const int nlocal, const int 
nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } void pppm_gpu_interp_f(const float qqrd2e_scale) { return PPPMF.interp(qqrd2e_scale); } double pppm_gpu_bytes_f() { return PPPMF.host_memory_usage(); } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - double **vd_brick, int &success) { - return pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, - nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, - success); + double **vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success) { + double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, + vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, + success); + PPPMD.device->set_double_precompute(&PPPMD); + return b; } void pppm_gpu_clear_d(const double cpu_time) { PPPMD.clear(cpu_time); } int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } void pppm_gpu_interp_d(const double qqrd2e_scale) { return PPPMD.interp(qqrd2e_scale); } double pppm_gpu_bytes_d() { return PPPMD.host_memory_usage(); }
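Everything the host classes in src/GPU consume is the flat, precision-suffixed C interface above. A minimal driver sketch for the single-precision flavor (hypothetical standalone use; in LAMMPS the equivalent calls are issued from PPPMGPUSingle::init() and compute(), with the FFT/Poisson stages running on the host in between):

#include <cstdio>

// Wrapper declarations, matching pppm_l_gpu.cpp above.
float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
                        const int order, const int nxlo_out,
                        const int nylo_out, const int nzlo_out,
                        const int nxhi_out, const int nyhi_out,
                        const int nzhi_out, double **rho_coeff,
                        float **vd_brick, const double slab_volfactor,
                        const int nx_pppm, const int ny_pppm,
                        const int nz_pppm, int &success);
int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
                      double **host_x, int *host_type, bool &success,
                      double *host_q, double *boxlo, const double delxinv,
                      const double delyinv, const double delzinv);
void pppm_gpu_interp_f(const float qqrd2e_scale);
void pppm_gpu_clear_f(const double cpu_time);

// One timestep, minus the host FFT/Poisson stages that run in between.
int run_step(int nlocal, int nall, double **x, int *type, double *q,
             double *boxlo, double dxinv, double dyinv, double dzinv,
             float qqrd2e_scale) {
  bool ok=true;
  // ago==0 forces the per-atom device buffers to be (re)sized on this call
  int flag=pppm_gpu_spread_f(0,nlocal,nall,x,type,ok,q,boxlo,
                             dxinv,dyinv,dzinv);
  if (!ok) return -3;              // out of device memory
  if (flag!=0) return flag;        // out-of-range atoms
  // ... brick2fft(), poisson(), fillbrick() on the host here ...
  pppm_gpu_interp_f(qqrd2e_scale); // forces land in the device answer queue
  return 0;
}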
diff --git a/src/GPU/pppm_gpu_double.cpp b/src/GPU/pppm_gpu_double.cpp index 4deda350b..33a6bf8d0 100644 --- a/src/GPU/pppm_gpu_double.cpp +++ b/src/GPU/pppm_gpu_double.cpp @@ -1,227 +1,231 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) ------------------------------------------------------------------------- */ #include "mpi.h" #include "string.h" #include "stdio.h" #include "stdlib.h" #include "math.h" #include "pppm_gpu_double.h" #include "lmptype.h" #include "atom.h" #include "comm.h" #include "neighbor.h" #include "force.h" #include "pair.h" #include "bond.h" #include "angle.h" #include "domain.h" #include "fft3d_wrap.h" #include "remap_wrap.h" #include "memory.h" #include "error.h" #define grdtyp double // External functions from cuda library for atom decomposition grdtyp* pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - grdtyp **_vd_brick, int &success); + grdtyp **_vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success); void pppm_gpu_clear_d(const double poisson_time); int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv); void pppm_gpu_interp_d(const grdtyp qqrd2e_scale); double pppm_gpu_bytes_d(); using namespace LAMMPS_NS; #define MAXORDER 7 #define OFFSET 16384 #define SMALL 0.00001 #define LARGE 10000.0 #define EPS_HOC 1.0e-7 #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) /* ---------------------------------------------------------------------- */ PPPMGPUDouble::PPPMGPUDouble(LAMMPS *lmp, int narg, char **arg) : PPPMGPU<grdtyp>(lmp, narg, arg) { } /* ---------------------------------------------------------------------- free all memory ------------------------------------------------------------------------- */ PPPMGPUDouble::~PPPMGPUDouble() { pppm_gpu_clear_d(poisson_time); } /* ---------------------------------------------------------------------- called once before run ------------------------------------------------------------------------- */ void PPPMGPUDouble::init() { base_init(); if (order>8) error->all("Cannot use order greater than 8 with pppm/gpu."); pppm_gpu_clear_d(poisson_time); int success; grdtyp *data, *h_brick; h_brick = pppm_gpu_init_d(atom->nlocal, atom->nlocal+atom->nghost, screen, order, nxlo_out, nylo_out, nzlo_out, nxhi_out, - nyhi_out, nzhi_out, rho_coeff, &data, success); + nyhi_out, nzhi_out, rho_coeff, &data, + slab_volfactor, nx_pppm, ny_pppm, nz_pppm, + success); int all_success; MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world); if (all_success != 0) { if (all_success == -1) error->all("Could not find fix gpu"); else if (all_success == -2) error->all("At least one node could not find specified GPU."); else if (all_success == -3) error->all("Out of memory on GPU."); else if (all_success == -4) error->all("GPU library not compiled for this GPU."); else if (all_success == -5) error->all("Double precision is not supported on this GPU."); else error->all("Unknown error in GPU library."); } density_brick = create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1); vd_brick = create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:vd_brick",data,4); poisson_time=0; } /*
---------------------------------------------------------------------- compute the PPPMGPU long-range force, energy, virial ------------------------------------------------------------------------- */ void PPPMGPUDouble::compute(int eflag, int vflag) { bool success = true; int flag=pppm_gpu_spread_d(neighbor->ago, atom->nlocal, atom->nlocal + atom->nghost, atom->x, atom->type, success, atom->q, domain->boxlo, delxinv, delyinv, delzinv); if (!success) error->one("Out of memory on GPGPU"); if (flag != 0) error->one("Out of range atoms - cannot compute PPPM"); int i; // convert atoms from box to lamda coords if (triclinic == 0) boxlo = domain->boxlo; else { boxlo = domain->boxlo_lamda; domain->x2lamda(atom->nlocal); } energy = 0.0; if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; double t3=MPI_Wtime(); // all procs communicate density values from their ghost cells // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition brick2fft(); // compute potential gradient on my FFT grid and // portion of e_long on this proc's FFT grid // return gradients (electric fields) in 3d brick decomposition poisson(eflag,vflag); // all procs communicate E-field values to fill ghost cells // surrounding their 3d bricks fillbrick(); poisson_time+=MPI_Wtime()-t3; // calculate the force on my particles grdtyp qqrd2e_scale=qqrd2e*scale; pppm_gpu_interp_d(qqrd2e_scale); // sum energy across procs and add in volume-dependent term if (eflag) { double energy_all; MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); energy = energy_all; energy *= 0.5*volume; energy -= g_ewald*qsqsum/1.772453851 + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); energy *= qqrd2e*scale; } // sum virial across procs if (vflag) { double virial_all[6]; MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i]; } // 2d slab correction if (slabflag) slabcorr(eflag); // convert atoms back from lamda to box coords if (triclinic) domain->lamda2x(atom->nlocal); } /* ---------------------------------------------------------------------- memory usage of local arrays ------------------------------------------------------------------------- */ double PPPMGPUDouble::memory_usage() { double bytes = nmax*3 * sizeof(double); int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); bytes += 4 * nbrick * sizeof(grdtyp); bytes += 6 * nfft_both * sizeof(double); bytes += nfft_both*6 * sizeof(double); bytes += 2 * nbuf * sizeof(double); return bytes + pppm_gpu_bytes_d(); } diff --git a/src/GPU/pppm_gpu_single.cpp b/src/GPU/pppm_gpu_single.cpp index 693c47f43..b57ec0abb 100644 --- a/src/GPU/pppm_gpu_single.cpp +++ b/src/GPU/pppm_gpu_single.cpp @@ -1,227 +1,230 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) ------------------------------------------------------------------------- */ #include "mpi.h" #include "string.h" #include "stdio.h" #include "stdlib.h" #include "math.h" #include "pppm_gpu_single.h" #include "lmptype.h" #include "atom.h" #include "comm.h" #include "neighbor.h" #include "force.h" #include "pair.h" #include "bond.h" #include "angle.h" #include "domain.h" #include "fft3d_wrap.h" #include "remap_wrap.h" #include "memory.h" #include "error.h" #define grdtyp float // External functions from cuda library for atom decomposition grdtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - grdtyp **_vd_brick, int &success); + grdtyp **_vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success); void pppm_gpu_clear_f(const double poisson_time); int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv); void pppm_gpu_interp_f(const grdtyp qqrd2e_scale); double pppm_gpu_bytes_f(); using namespace LAMMPS_NS; #define MAXORDER 7 #define OFFSET 16384 #define SMALL 0.00001 #define LARGE 10000.0 #define EPS_HOC 1.0e-7 #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) /* ---------------------------------------------------------------------- */ PPPMGPUSingle::PPPMGPUSingle(LAMMPS *lmp, int narg, char **arg) : PPPMGPU<grdtyp>(lmp, narg, arg) { } /* ---------------------------------------------------------------------- free all memory ------------------------------------------------------------------------- */ PPPMGPUSingle::~PPPMGPUSingle() { pppm_gpu_clear_f(poisson_time); } /* ---------------------------------------------------------------------- called once before run ------------------------------------------------------------------------- */ void PPPMGPUSingle::init() { base_init(); if (order>8) error->all("Cannot use order greater than 8 with pppm/gpu."); pppm_gpu_clear_f(poisson_time); int success; grdtyp *data, *h_brick; h_brick = pppm_gpu_init_f(atom->nlocal, atom->nlocal+atom->nghost, screen, order, nxlo_out, nylo_out, nzlo_out, nxhi_out, - nyhi_out, nzhi_out, rho_coeff, &data, success); + nyhi_out, nzhi_out, rho_coeff, &data, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); int all_success; MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world); if (all_success != 0) { if (all_success == -1) error->all("Could not find fix gpu"); else if (all_success == -2) error->all("At least one node could not find specified GPU."); else if (all_success == -3) error->all("Out of memory on GPU."); else if (all_success == -4) error->all("GPU library not compiled for this GPU."); else if (all_success == -5) error->all("Double precision is not supported on this GPU."); else error->all("Unknown error in GPU library."); } density_brick = create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1); vd_brick = create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:vd_brick",data,4); poisson_time=0; } /* 
---------------------------------------------------------------------- compute the PPPMGPU long-range force, energy, virial ------------------------------------------------------------------------- */ void PPPMGPUSingle::compute(int eflag, int vflag) { bool success = true; int flag=pppm_gpu_spread_f(neighbor->ago, atom->nlocal, atom->nlocal + atom->nghost, atom->x, atom->type, success, atom->q, domain->boxlo, delxinv, delyinv, delzinv); if (!success) error->one("Out of memory on GPGPU"); if (flag != 0) error->one("Out of range atoms - cannot compute PPPM"); int i; // convert atoms from box to lamda coords if (triclinic == 0) boxlo = domain->boxlo; else { boxlo = domain->boxlo_lamda; domain->x2lamda(atom->nlocal); } energy = 0.0; if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; double t3=MPI_Wtime(); // all procs communicate density values from their ghost cells // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition brick2fft(); // compute potential gradient on my FFT grid and // portion of e_long on this proc's FFT grid // return gradients (electric fields) in 3d brick decomposition poisson(eflag,vflag); // all procs communicate E-field values to fill ghost cells // surrounding their 3d bricks fillbrick(); poisson_time+=MPI_Wtime()-t3; // calculate the force on my particles grdtyp qqrd2e_scale=qqrd2e*scale; pppm_gpu_interp_f(qqrd2e_scale); // sum energy across procs and add in volume-dependent term if (eflag) { double energy_all; MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); energy = energy_all; energy *= 0.5*volume; energy -= g_ewald*qsqsum/1.772453851 + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); energy *= qqrd2e*scale; } // sum virial across procs if (vflag) { double virial_all[6]; MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i]; } // 2d slab correction if (slabflag) slabcorr(eflag); // convert atoms back from lamda to box coords if (triclinic) domain->lamda2x(atom->nlocal); } /* ---------------------------------------------------------------------- memory usage of local arrays ------------------------------------------------------------------------- */ double PPPMGPUSingle::memory_usage() { double bytes = nmax*3 * sizeof(double); int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); bytes += 4 * nbrick * sizeof(grdtyp); bytes += 6 * nfft_both * sizeof(double); bytes += nfft_both*6 * sizeof(double); bytes += 2 * nbuf * sizeof(double); return bytes + pppm_gpu_bytes_f(); }
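A closing note on the energy bookkeeping shared by both compute() implementations: the hard-coded 1.772453851 is \(\sqrt{\pi}\), and the two post-FFT corrections are the standard Ewald self-energy and neutralizing-background terms. Writing \(E_{\mathrm{fft}}\) for the MPI-summed per-process energy, \(V\) for the cell volume, and \(s\) for scale, the code computes

\[
E_{\mathrm{long}} \;=\; q_{\mathrm{qrd2e}}\,s\left(\tfrac{1}{2}\,V\,E_{\mathrm{fft}}
\;-\;\frac{g_{\mathrm{ewald}}}{\sqrt{\pi}}\sum_i q_i^{\,2}
\;-\;\frac{\pi}{2\,g_{\mathrm{ewald}}^{2}\,V}\Bigl(\sum_i q_i\Bigr)^{2}\right)
\]

where qsqsum \(=\sum_i q_i^2\) and qsum \(=\sum_i q_i\) in the code.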