diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index cc74d2ebd..3c859ffdd 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -1,249 +1,254 @@
 # /* ----------------------------------------------------------------------
 #  LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
 #  http://lammps.sandia.gov, Sandia National Laboratories
 #  Steve Plimpton, sjplimp@sandia.gov
 #
 #  Copyright (2003) Sandia Corporation.  Under the terms of Contract
 #  DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
 #  certain rights in this software.  This software is distributed under
 #  the GNU General Public License.
 #
 #  See the README file in the top-level LAMMPS directory.
 # ------------------------------------------------------------------------- */
 #
 # /* ----------------------------------------------------------------------
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                          Peng Wang (Nvidia), penwang@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

 CUDA  = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
         $(CUDA_PRECISION)
 CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
         -Icudpp_mini
 CUDA_LINK = $(CUDA_LIB) -lcudart

 GPU_LIB = $(LIB_DIR)/libgpu.a

 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
 NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h

 ALL_H = $(NVD_H) $(PAIR_H)

 EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
        $(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
        $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h

 all: $(GPU_LIB) $(EXECS)

 $(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini

 $(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini

 $(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini

 $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
 	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini

 $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu

 $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu

 $(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu

 $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h

 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu

 $(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h

 $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu

 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h

 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_device.cpp

 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp

 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp

 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu

 $(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu

 $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu

 $(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h

 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
 	$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu

 $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h

 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu

 $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h

 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu

 $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h

 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu

 $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h

 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu

 $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h

 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu

 $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h

 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu

 $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h

 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx: cmmc_msm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_msm_gpu_kernel.cu

 $(OBJ_DIR)/cmmc_msm_gpu_ptx.h: $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h

 $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_memory.cpp $(OBJ_DIR)/cmmc_msm_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_msm_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp
 	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)

 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK)

 $(GPU_LIB): $(OBJS)
 	$(AR) -crusv $(GPU_LIB) $(OBJS)

 clean:
 	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo

 veryclean: clean
 	rm -rf *~ *.linkinfo
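Note on the kernel-embedding pipeline above: each *_kernel.cu file is compiled to PTX with nvcc --ptx, and geryon/file_to_cstr.sh then wraps the PTX text in a C-string header (*_ptx.h) that is compiled into the library. The sketch below is illustrative only; it is modeled on compile_kernels() further down in this patch and assumes, as that code does, that the generated pair_gpu_atom_ptx.h defines a string constant named pair_gpu_atom_kernel:

    // Illustrative sketch: JIT-load an embedded PTX string with Geryon.
    // Assumes the generated header defines pair_gpu_atom_kernel (see
    // compile_kernels() in pair_gpu_atom.cpp later in this patch).
    #include "pair_gpu_atom_ptx.h"

    UCL_Program *load_atom_program(UCL_Device &dev) {
      UCL_Program *prog = new UCL_Program(dev);
      prog->load_string(pair_gpu_atom_kernel, "");  // compile PTX for this device
      return prog;
    }
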
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 829add735..e488a56bc 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -1,169 +1,171 @@
 # /* ----------------------------------------------------------------------
 #  LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
 #  http://lammps.sandia.gov, Sandia National Laboratories
 #  Steve Plimpton, sjplimp@sandia.gov
 #
 #  Copyright (2003) Sandia Corporation.  Under the terms of Contract
 #  DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
 #  certain rights in this software.  This software is distributed under
 #  the GNU General Public License.
 #
 #  See the README file in the top-level LAMMPS directory.
 # ------------------------------------------------------------------------- */
 #
 # /* ----------------------------------------------------------------------
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                          Peng Wang (Nvidia), penwang@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */

 OCL  = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
 OCL_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor_shared.h pair_gpu_nbor.h \
-          pair_gpu_precision.h pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h

 ALL_H = $(OCL_H) $(PAIR_H)

 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
-       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_device.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
        $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
 KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
        $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
        $(OBJ_DIR)/crml_gpu_cl.h \
        $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
- 
+
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices

 all: $(OCL_LIB) $(EXECS)

 $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h

 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h

 $(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
 	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
 	$(OCL) -o $@ -c pair_gpu_device.cpp

 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp

 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp

 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h

 $(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
 	cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
 	cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
 	rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar

 $(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
 	$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
 	$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;

 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;

 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;

 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;

 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;

 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;

 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;

 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

 $(OCL_LIB): $(OBJS) $(PTXS)
 	$(AR) -crusv $(OCL_LIB) $(OBJS)

 opencl: $(OCL_EXECS)

 clean:
 	rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo

 veryclean: clean
 	rm -rf *~ *.linkinfo
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_ans.cpp
similarity index 52%
copy from lib/gpu/pair_gpu_atom.cpp
copy to lib/gpu/pair_gpu_ans.cpp
index 46c9066e5..e6982e6eb 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -1,605 +1,409 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov

    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.

    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */

-#include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"

-#define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
-
-#ifdef WINDLL
-#include <windows.h>
-typedef bool (*__win_sort_alloc)(const int max_atoms);
-typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
-                           int *particle_begin);
-__win_sort_alloc _win_sort_alloc;
-__win_sort _win_sort;
-#endif
+#define PairGPUAnsT PairGPUAns<numtyp,acctyp>

 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL),
-                              _newton(false) {
-  #ifndef USE_OPENCL
-  sort_config.op = CUDPP_ADD;
-  sort_config.datatype = CUDPP_UINT;
-  sort_config.algorithm = CUDPP_SORT_RADIX;
-  sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
-
-  #ifdef WINDLL
-  HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
-  if (hinstLib == NULL) {
-    printf("\nUnable to load gpu.dll\n");
-    exit(1);
-  }
-  _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
-  _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
-  #endif
-
-  #endif
+PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
+                            _inum(0),_ilist(NULL),_newton(false) {
 }

 template <class numtyp, class acctyp>
-int PairGPUAtomT::bytes_per_atom() const {
-  int id_space=0;
-  if (_gpu_nbor)
-    id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+int PairGPUAnsT::bytes_per_atom() const {
+  int bytes=11*sizeof(acctyp);
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(acctyp);
   if (_charge)
-    bytes+=sizeof(numtyp);
+    bytes+=sizeof(acctyp);
   return bytes;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
-  _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
+bool PairGPUAnsT::alloc(const int inum) {
+  _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
   bool success=true;

   int ans_elements=4;
   if (_rot)
     ans_elements+=4;

   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
     cpuview=true;

-  // Allocate storage for CUDPP sort
-  #ifndef USE_OPENCL
-  #ifdef WINDLL
-  _win_sort_alloc(_max_atoms);
-  #else
-  if (_gpu_nbor) {
-    CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
-    if (CUDPP_SUCCESS != result)
-      return false;
-  }
-  #endif
-  #endif
-
   // --------------------------   Host allocations
-  // Get a host write only buffer
-  #ifdef GPU_CAST
-  success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  success=success && (host_type_cast.alloc(_max_atoms,*dev,
-                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #else
-  success=success && (host_x.alloc(_max_atoms*4,*dev,
-                                   UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #endif
   success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
   success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
-  // Buffer for casting only if different precisions
-  if (_charge)
-    success=success && (host_q.alloc(_max_atoms,*dev,
-                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  // Buffer for casting only if different precisions
-  if (_rot)
-    success=success && (host_quat.alloc(_max_atoms*4,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-
   // ---------------------------  Device allocations
-  _gpu_bytes=0;
   if (cpuview) {
-    #ifdef GPU_CAST
-    assert(0==1);
-    #else
-    dev_x.view(host_x);
-    #endif
     dev_engv.view(host_engv);
     dev_ans.view(host_ans);
-    if (_rot)
-      dev_quat.view(host_quat);
-    if (_charge)
-      dev_q.view(host_q);
   } else {
-    #ifdef GPU_CAST
-    success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
-    success=success && (UCL_SUCCESS==
-                        dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
-    success=success && (UCL_SUCCESS==
-                        dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
-    #else
-    success=success && (UCL_SUCCESS==
-                        dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
-    #endif
     success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
                                        UCL_WRITE_ONLY)==UCL_SUCCESS);
     success=success && (dev_ans.alloc(ans_elements*_max_local,
                                       *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    if (_charge) {
-      success=success && (dev_q.alloc(_max_atoms,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_q.row_bytes();
-    }
-    if (_rot) {
-      success=success && (dev_quat.alloc(_max_atoms*4,*dev,
-                                         UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_quat.row_bytes();
-    }
   }
-  if (_gpu_nbor) {
-    success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
-    if (_bonds) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-      _gpu_bytes+=dev_tag.row_bytes();
-    }
-  }
-
-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();

   _allocated=true;
   return success;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
-                        const bool bonds) {
+bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
+                       UCL_Device &devi) {
   clear();

   bool success=true;
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
-  _gpu_nbor=gpu_nbor;
-  _bonds=bonds;
   _charge=charge;
   _rot=rot;
   _other=_charge || _rot;
   dev=&devi;

   _e_fields=1;
   if (_charge)
     _e_fields++;
   _ev_fields=6+_e_fields;

   // Initialize atom and nbor data
   int ef_inum=inum;
   if (ef_inum==0)
     ef_inum=1000;
-  int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;

   // Initialize timers for the selected device
-  time_pos.init(*dev);
-  time_other.init(*dev);
   time_answer.init(*dev);
-  time_pos.zero();
-  time_other.zero();
   time_answer.zero();
   _time_cast=0.0;

-  #ifdef GPU_CAST
-  compile_kernels(*dev);
-  #endif
-
-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_inum);
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
+bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
   bool realloc=false;
   if (charge && _charge==false) {
     _charge=true;
     _e_fields++;
+    _ev_fields++;
     realloc=true;
   }
   if (rot && _rot==false) {
     _rot=true;
     realloc=true;
   }
   if (realloc) {
     _other=_charge || _rot;
     int inum=_max_local;
-    int nall=_max_atoms;
     clear_resize();
-    return alloc(inum,nall);
+    return alloc(inum);
   }
   return true;
 }

 template <class numtyp, class acctyp>
-void PairGPUAtomT::clear_resize() {
+void PairGPUAnsT::clear_resize() {
   if (!_allocated)
     return;
   _allocated=false;

-  dev_x.clear();
-  if (_charge) {
-    dev_q.clear();
-    host_q.clear();
-  }
-  if (_rot) {
-    dev_quat.clear();
-    host_quat.clear();
-  }
   dev_ans.clear();
   dev_engv.clear();
-  #ifndef GPU_CAST
-  host_x.clear();
-  #else
-  host_x_cast.clear();
-  host_type_cast.clear();
-  #endif
   host_ans.clear();
   host_engv.clear();
-  dev_cell_id.clear();
-  dev_particle_id.clear();
-  dev_tag.clear();
-  #ifdef GPU_CAST
-  dev_x_cast.clear();
-  dev_type_cast.clear();
-  #endif
-
-  #ifndef USE_OPENCL
-  #ifndef WINDLL
-  if (_gpu_nbor) cudppDestroyPlan(sort_plan);
-  #endif
-  #endif
 }

 template <class numtyp, class acctyp>
-void PairGPUAtomT::clear() {
+void PairGPUAnsT::clear() {
   _gpu_bytes=0;
   if (!_allocated)
     return;

   time_pos.clear();
   time_other.clear();
   time_answer.clear();
   clear_resize();
   _inum=0;
   _eflag=false;
   _vflag=false;
-
-  #ifdef GPU_CAST
-  if (_compiled) {
-    k_cast_x.clear();
-    delete atom_program;
-    _compiled=false;
-  }
-  #endif
 }

 template <class numtyp, class acctyp>
-double PairGPUAtomT::host_memory_usage() const {
+double PairGPUAnsT::host_memory_usage() const {
   int atom_bytes=4;
   if (_charge)
     atom_bytes+=1;
   if (_rot)
     atom_bytes+=4;
   int ans_bytes=atom_bytes+_ev_fields;
-  return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
-         sizeof(PairGPUAtom<numtyp,acctyp>);
+  return ans_bytes*(_max_local)*sizeof(acctyp)+
+         sizeof(PairGPUAns<numtyp,acctyp>);
 }

 template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom) {
   time_answer.start();
   _eflag=eflag;
   _vflag=vflag;
   _ef_atom=ef_atom;
   _vf_atom=vf_atom;

   int csize=_ev_fields;
   if (!eflag)
     csize-=_e_fields;
   if (!vflag)
     csize-=6;

   if (csize>0)
     ucl_copy(host_engv,dev_engv,_inum*csize,true);
   if (_rot)
     ucl_copy(host_ans,dev_ans,_inum*4*2,true);
   else
     ucl_copy(host_ans,dev_ans,_inum*4,true);
   time_answer.stop();
+  dev->add_answer_object(this);
 }

 template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom,
+                               int *ilist) {
   _ilist=ilist;
   copy_answers(eflag,vflag,ef_atom,vf_atom);
 }

 template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial) {
   if (_eflag==false && _vflag==false)
     return 0.0;

   double evdwl=0.0;
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[i][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   } else {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[ii][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   }

   evdwl*=0.5;
   return evdwl;
 }

 template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
                                   double *virial, double &ecoul) {
   if (_eflag==false && _vflag==false) {
     ecoul=0.0;
     return 0.0;
   }

   if (_charge==false)
     return energy_virial(eatom,vatom,virial);

   double evdwl=0.0;
   double _ecoul=0.0;
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
           _ecoul+=*ap;
           eatom[i]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
           _ecoul+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[i][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   } else {
     for (int i=0; i<_inum; i++) {
       acctyp *ap=host_engv.begin()+i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
           evdwl+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
           _ecoul+=*ap;
           eatom[ii]+=*ap*0.5;
           ap+=_inum;
         } else {
           evdwl+=*ap;
           ap+=_inum;
           _ecoul+=*ap;
           ap+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
             vatom[ii][j]+=*ap*0.5;
             virial[j]+=*ap;
             ap+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
             virial[j]+=*ap;
             ap+=_inum;
           }
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
   }

   evdwl*=0.5;
   ecoul+=_ecoul*0.5;
   return evdwl;
 }

 template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
+void PairGPUAnsT::get_answers(double **f, double **tor) {
   _x_avail=false;
   _q_avail=false;
   _quat_avail=false;
   acctyp *ap=host_ans.begin();
   if (_gpu_nbor) {
     for (int i=0; i<_inum; i++) {
       f[i][0]+=*ap;
       ap++;
       f[i][1]+=*ap;
       ap++;
       f[i][2]+=*ap;
       ap+=2;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         tor[i][0]+=*ap;
         ap++;
         tor[i][1]+=*ap;
         ap++;
         tor[i][2]+=*ap;
         ap+=2;
       }
     }
   } else {
     for (int i=0; i<_inum; i++) {
       int ii=_ilist[i];
       f[ii][0]+=*ap;
       ap++;
       f[ii][1]+=*ap;
       ap++;
       f[ii][2]+=*ap;
       ap+=2;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         int ii=_ilist[i];
         tor[ii][0]+=*ap;
         ap++;
         tor[ii][1]+=*ap;
         ap++;
         tor[ii][2]+=*ap;
         ap+=2;
       }
     }
   }
 }

-// Sort arrays for neighbor list calculation
-template <class numtyp, class acctyp>
-void PairGPUAtomT::sort_neighbor(const int num_atoms) {
-  #ifndef USE_OPENCL
-  #ifdef WINDLL
-  _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
-            (int *)dev_particle_id.begin());
-  #else
-  CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
-                                 (int *)dev_particle_id.begin(),
-                                 8*sizeof(unsigned), num_atoms);
-  if (CUDPP_SUCCESS != result) {
-    printf("Error in cudppSort\n");
-    NVD_GERYON_EXIT;
-  }
-  #endif
-  #endif
-}
-
-#ifdef GPU_CAST
-#ifdef USE_OPENCL
-#include "pair_gpu_atom_cl.h"
-#else
-#include "pair_gpu_atom_ptx.h"
-#endif
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
-  atom_program=new UCL_Program(dev);
-  atom_program->load_string(pair_gpu_atom_kernel,"");
-  k_cast_x.set_function(*atom_program,"kernel_cast_x");
-  _compiled=true;
-}
-
-#endif
-
-template class PairGPUAtom<PRECISION,ACC_PRECISION>;
+template class PairGPUAns<PRECISION,ACC_PRECISION>;
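A note on the strided layout that energy_virial() above relies on: dev_engv/host_engv store each of the _ev_fields energy/virial fields contiguously for all _inum atoms, which is why the loops start at host_engv.begin()+i and advance with ap+=_inum per field. A minimal sketch of the equivalent indexed read (the helper function itself is hypothetical, for illustration only):

    // host_engv[field*inum + i] == accumulated value of `field` for atom i.
    template <class acctyp>
    acctyp engv_field(const acctyp *host_engv, const int inum,
                      const int field, const int i) {
      return host_engv[field*inum + i];
    }
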
diff --git a/lib/gpu/pair_gpu_ans.h b/lib/gpu/pair_gpu_ans.h
new file mode 100644
index 000000000..a93ed6fcd
--- /dev/null
+++ b/lib/gpu/pair_gpu_ans.h
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_ANS_H
+#define PAIR_GPU_ANS_H
+
+#include <math.h>
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+
+#include "geryon/ocl_timer.h"
+#include "geryon/ocl_mat.h"
+using namespace ucl_opencl;
+
+#else
+
+#include "cudpp.h"
+#include "geryon/nvd_timer.h"
+#include "geryon/nvd_mat.h"
+using namespace ucl_cudadr;
+
+#endif
+
+#include "pair_gpu_precision.h"
+
+template <class numtyp, class acctyp>
+class PairGPUAns {
+ public:
+  PairGPUAns();
+  ~PairGPUAns() { clear(); }
+
+  /// Current number of local atoms stored
+  inline int inum() const { return _inum; }
+
+  /// Set number of local atoms for future copy operations
+  inline void inum(const int n) { _inum=n; }
+
+  /// Memory usage per atom in this class
+  int bytes_per_atom() const;
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param rot True if atom storage needs quaternions
+    * \param gpu_nbor True if neighboring will be performed on device **/
+  bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
+
+  /// Check if we have enough device storage and realloc if not
+  inline bool resize(const int inum, bool &success) {
+    _inum=inum;
+    if (inum>_max_local) {
+      clear_resize();
+      success = success && alloc(inum);
+      return true;
+    }
+    return false;
+  }
+
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param rot True if atom storage needs quaternions
+    * \param gpu_nbor True if neighboring will be performed on device **/
+  bool add_fields(const bool charge, const bool rot);
+
+  /// Free all memory on host and device needed to realloc for more atoms
+  void clear_resize();
+
+  /// Free all memory on host and device
+  void clear();
+
+  /// Return the total amount of host memory used by class in bytes
+  double host_memory_usage() const;
+
+  /// Add copy times to timers
+  inline void acc_timers() {
+    time_answer.add_to_total();
+  }
+
+  /// Add copy times to timers
+  inline void zero_timers() {
+    time_answer.zero();
+  }
+
+  /// Return the total time for host/device data transfer
+  inline double transfer_time() {
+    return time_answer.total_seconds();
+  }
+
+  /// Return the total time for data cast/pack
+  inline double cast_time() { return _time_cast; }
+
+  /// Return number of bytes used on device
+  inline double gpu_bytes() { return _gpu_bytes; }
+
+  // -------------------------COPY FROM GPU -------------------------------
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom);
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom, int *ilist);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial);
+
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial,
+                       double &ecoul);
+
+  /// Add forces and torques from the GPU into a LAMMPS pointer
+  void get_answers(double **f, double **tor);
+
+  // ------------------------------ DATA ----------------------------------
+
+  /// Force and possibly torque
+  UCL_D_Vec<acctyp> dev_ans;
+  /// Energy and virial per-atom storage
+  UCL_D_Vec<acctyp> dev_engv;
+
+  /// Force and possibly torque data on host
+  UCL_H_Vec<acctyp> host_ans;
+  /// Energy/virial data on host
+  UCL_H_Vec<acctyp> host_engv;
+
+  /// Device timers
+  UCL_Timer time_answer;
+
+  /// Geryon device
+  UCL_Device *dev;
+
+ private:
+  bool alloc(const int inum);
+
+  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
+  int _max_local, _inum, _e_fields, _ev_fields;
+  int *_ilist;
+  double _time_cast;
+
+  double _gpu_bytes;
+
+  bool _newton;
+};
+
+#endif
+
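A minimal sketch of the per-timestep call sequence the new class supports, pieced together from the declarations above. The driver function and its arguments are hypothetical stand-ins for what a pair style would pass in, and init() is assumed to have been called once beforehand:

    // Hypothetical driver; assumes ans.init(inum, charge, rot, dev) ran earlier
    // and that rot is disabled (so no torque array is needed).
    template <class numtyp, class acctyp>
    double run_step(PairGPUAns<numtyp,acctyp> &ans, int inum, int *ilist,
                    bool eflag, bool vflag, double **f, double *eatom,
                    double **vatom, double *virial, double &ecoul) {
      bool success=true;
      ans.resize(inum,success);   // keeps ~10% slack; reallocs only on growth
      ans.inum(inum);
      // ... launch force kernels that fill dev_ans and dev_engv ...
      ans.copy_answers(eflag,vflag,/*ef_atom=*/false,/*vf_atom=*/false,ilist);
      double evdwl=ans.energy_virial(eatom,vatom,virial,ecoul);
      ans.get_answers(f,/*tor=*/NULL);   // tor unused when rot is false
      return evdwl;
    }
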
diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp
index 46c9066e5..812a7c82d 100644
--- a/lib/gpu/pair_gpu_atom.cpp
+++ b/lib/gpu/pair_gpu_atom.cpp
@@ -1,605 +1,319 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov

    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.

    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */

 #include "pair_gpu_atom.h"

 #define PairGPUAtomT PairGPUAtom<numtyp,acctyp>

 #ifdef WINDLL
 #include <windows.h>
 typedef bool (*__win_sort_alloc)(const int max_atoms);
 typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin,
                            int *particle_begin);
 __win_sort_alloc _win_sort_alloc;
 __win_sort _win_sort;
 #endif

 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL),
-                              _newton(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false) {
   #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
   sort_config.algorithm = CUDPP_SORT_RADIX;
   sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;

   #ifdef WINDLL
   HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll"));
   if (hinstLib == NULL) {
     printf("\nUnable to load gpu.dll\n");
     exit(1);
   }
   _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc");
   _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort");
   #endif

   #endif
 }

 template <class numtyp, class acctyp>
 int PairGPUAtomT::bytes_per_atom() const {
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+  int bytes=4*sizeof(numtyp)+id_space;
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
+bool PairGPUAtomT::alloc(const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
   bool success=true;

-  int ans_elements=4;
-  if (_rot)
-    ans_elements+=4;
-
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
     cpuview=true;

   // Allocate storage for CUDPP sort
   #ifndef USE_OPENCL
   #ifdef WINDLL
   _win_sort_alloc(_max_atoms);
   #else
   if (_gpu_nbor) {
     CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
     if (CUDPP_SUCCESS != result)
       return false;
   }
   #endif
   #endif

   // --------------------------   Host allocations
   // Get a host write only buffer
   #ifdef GPU_CAST
   success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   success=success && (host_type_cast.alloc(_max_atoms,*dev,
                                            UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #else
   success=success && (host_x.alloc(_max_atoms*4,*dev,
                                    UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   #endif
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_rot)
     success=success && (host_quat.alloc(_max_atoms*4,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);

   // ---------------------------  Device allocations
   _gpu_bytes=0;
   if (cpuview) {
     #ifdef GPU_CAST
     assert(0==1);
     #else
     dev_x.view(host_x);
     #endif
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
     if (_rot)
       dev_quat.view(host_quat);
     if (_charge)
       dev_q.view(host_q);
   } else {
     #ifdef GPU_CAST
     success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
     success=success && (UCL_SUCCESS==
                         dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
     success=success && (UCL_SUCCESS==
                         dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
     _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
     #else
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
     #endif
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
       _gpu_bytes+=dev_q.row_bytes();
     }
     if (_rot) {
       success=success && (dev_quat.alloc(_max_atoms*4,*dev,
                                          UCL_READ_ONLY)==UCL_SUCCESS);
       _gpu_bytes+=dev_quat.row_bytes();
     }
   }
   if (_gpu_nbor) {
     success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
     if (_bonds) {
       success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       _gpu_bytes+=dev_tag.row_bytes();
     }
   }

-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  _gpu_bytes+=dev_x.row_bytes();

   _allocated=true;
   return success;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
+bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
+                        UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();

   bool success=true;
   _x_avail=false;
   _q_avail=false;
   _quat_avail=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
   _rot=rot;
   _other=_charge || _rot;
   dev=&devi;

-  _e_fields=1;
-  if (_charge)
-    _e_fields++;
-  _ev_fields=6+_e_fields;
-
   // Initialize atom and nbor data
-  int ef_inum=inum;
-  if (ef_inum==0)
-    ef_inum=1000;
   int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
+  if (ef_nall==0)
+    ef_nall=2000;

   // Initialize timers for the selected device
   time_pos.init(*dev);
   time_other.init(*dev);
-  time_answer.init(*dev);
   time_pos.zero();
   time_other.zero();
-  time_answer.zero();
   _time_cast=0.0;

   #ifdef GPU_CAST
   compile_kernels(*dev);
   #endif

-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_nall);
 }

 template <class numtyp, class acctyp>
 bool PairGPUAtomT::add_fields(const bool charge, const bool rot) {
   bool realloc=false;
   if (charge && _charge==false) {
     _charge=true;
-    _e_fields++;
     realloc=true;
   }
   if (rot && _rot==false) {
     _rot=true;
     realloc=true;
   }
   if (realloc) {
     _other=_charge || _rot;
-    int inum=_max_local;
-    int nall=_max_atoms;
+    int max_atoms=_max_atoms;
     clear_resize();
-    return alloc(inum,nall);
+    return alloc(max_atoms);
   }
   return true;
 }

 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear_resize() {
   if (!_allocated)
     return;
   _allocated=false;

   dev_x.clear();
   if (_charge) {
     dev_q.clear();
     host_q.clear();
   }
   if (_rot) {
     dev_quat.clear();
     host_quat.clear();
   }
-  dev_ans.clear();
-  dev_engv.clear();
   #ifndef GPU_CAST
   host_x.clear();
   #else
   host_x_cast.clear();
   host_type_cast.clear();
   #endif
-  host_ans.clear();
-  host_engv.clear();
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
   #ifdef GPU_CAST
   dev_x_cast.clear();
   dev_type_cast.clear();
   #endif

   #ifndef USE_OPENCL
   #ifndef WINDLL
   if (_gpu_nbor) cudppDestroyPlan(sort_plan);
   #endif
   #endif
 }

 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
   _gpu_bytes=0;
   if (!_allocated)
     return;

   time_pos.clear();
   time_other.clear();
-  time_answer.clear();
   clear_resize();
-  _inum=0;
-  _eflag=false;
-  _vflag=false;

   #ifdef GPU_CAST
   if (_compiled) {
     k_cast_x.clear();
     delete atom_program;
     _compiled=false;
   }
   #endif
 }

 template <class numtyp, class acctyp>
 double PairGPUAtomT::host_memory_usage() const {
   int atom_bytes=4;
   if (_charge)
     atom_bytes+=1;
   if (_rot)
     atom_bytes+=4;
-  int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }

-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
-  time_answer.start();
-  _eflag=eflag;
-  _vflag=vflag;
-  _ef_atom=ef_atom;
-  _vf_atom=vf_atom;
-
-  int csize=_ev_fields;
-  if (!eflag)
-    csize-=_e_fields;
-  if (!vflag)
-    csize-=6;
-
-  if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
-  if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
-  else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
-  time_answer.stop();
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
-  _ilist=ilist;
-  copy_answers(eflag,vflag,ef_atom,vf_atom);
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
-  if (_eflag==false && _vflag==false)
-    return 0.0;
-
-  double evdwl=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
-    return 0.0;
-  }
-
-  if (_charge==false)
-    return energy_virial(eatom,vatom,virial);
-
-  double evdwl=0.0;
-  double _ecoul=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
-  _x_avail=false;
-  _q_avail=false;
-  _quat_avail=false;
-  acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
-      }
-    }
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
-      }
-    }
-  }
-}
-
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
   #ifndef USE_OPENCL
   #ifdef WINDLL
   _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(),
             (int *)dev_particle_id.begin());
   #else
   CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
                                  (int *)dev_particle_id.begin(),
                                  8*sizeof(unsigned), num_atoms);
   if (CUDPP_SUCCESS != result) {
     printf("Error in cudppSort\n");
     NVD_GERYON_EXIT;
   }
   #endif
   #endif
 }

 #ifdef GPU_CAST
 #ifdef USE_OPENCL
 #include "pair_gpu_atom_cl.h"
 #else
 #include "pair_gpu_atom_ptx.h"
 #endif

 template <class numtyp, class acctyp>
 void PairGPUAtomT::compile_kernels(UCL_Device &dev) {
   atom_program=new UCL_Program(dev);
   atom_program->load_string(pair_gpu_atom_kernel,"");
   k_cast_x.set_function(*atom_program,"kernel_cast_x");
   _compiled=true;
 }

 #endif

 template class PairGPUAtom<PRECISION,ACC_PRECISION>;
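With the answer data split out, per-atom inputs (positions, types, charges, quaternions) now scale with nall (local+ghost) in PairGPUAtom, while per-atom outputs scale with inum (local only) in PairGPUAns. A hedged sketch of how a style might resize the two objects together under the new signatures, ans.resize() from pair_gpu_ans.h above and atom.resize() from pair_gpu_atom.h below; the glue function itself is hypothetical:

    // Hypothetical glue code; both resize() calls are declared in this patch.
    template <class numtyp, class acctyp>
    bool resize_gpu_buffers(PairGPUAtom<numtyp,acctyp> &atom,
                            PairGPUAns<numtyp,acctyp> &ans,
                            const int inum, const int nall) {
      bool success=true;
      atom.resize(nall,success);   // local+ghost input buffers
      ans.resize(inum,success);    // local-only force/energy buffers
      return success;
    }
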
PairGPUAtom<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 562ca0846..c4c6b1586 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -1,447 +1,404 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PAIR_GPU_ATOM_H #define PAIR_GPU_ATOM_H #include <math.h> #include "mpi.h" #ifdef USE_OPENCL -#include "geryon/ocl_device.h" #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" using namespace ucl_opencl; #else #include "cudpp.h" -#include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" #include "geryon/nvd_kernel.h" using namespace ucl_cudadr; #endif -#ifndef int2 -struct int2 { int x; int y; }; -#endif - #include "pair_gpu_precision.h" template <class numtyp, class acctyp> class PairGPUAtom { public: PairGPUAtom(); ~PairGPUAtom() { clear(); } /// Maximum number of atoms that can be stored with current allocation inline int max_atoms() const { return _max_atoms; } /// Current number of local+ghost atoms stored inline int nall() const { return _nall; } - /// Current number of local atoms stored - inline int inum() const { return _inum; } /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - /// Set number of local atoms for future copy operations - inline void inum(const int n) { _inum=n; } /// Memory usage per atom in this class int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor True if neighboring will be performed on device **/ - bool init(const int inum, const int nall, const bool charge, const bool rot, + bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); /// Check if we have enough device storage and realloc if not - inline bool resize(const int inum, const int nall, bool &success) { - _inum=inum; + inline bool resize(const int nall, bool &success) { _nall=nall; - if (inum>_max_local || nall>_max_atoms) { + if (nall>_max_atoms) { clear_resize(); - success = success && alloc(inum,nall); + success = success && alloc(nall); return true; } return false; } /// If already initialized by another LAMMPS style, add fields as necessary /** \param rot True if atom storage needs quaternions * \param gpu_nbor True if neighboring will be performed on device **/ bool add_fields(const bool charge, const bool rot); - /// True if charge data is available for kernels - bool charge_avail() const { return _charge; } - /// Only free matrices of length inum or nall for resizing void clear_resize(); /// Free all memory on host and device void clear(); /// Return the total amount of host 
   /// Return the total amount of host memory used by class in bytes
   double host_memory_usage() const;

   /// Sort arrays for neighbor list calculation on device
   void sort_neighbor(const int num_atoms);

   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    time_answer.add_to_total();
     if (_other)
       time_other.add_to_total();
   }

   /// Add copy times to timers
   inline void zero_timers() {
     time_pos.zero();
-    time_answer.zero();
     if (_other)
       time_other.zero();
   }

   /// Return the total time for host/device data transfer
   inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
+    double total=time_pos.total_seconds();
     if (_other)
       total+=time_other.total_seconds();
     return total;
   }

   /// Return the total time for data cast/pack
   inline double cast_time() { return _time_cast; }

   /// Pack LAMMPS atom type constants into matrix and copy to device
   template <class dev_typ, class t1>
   inline void type_pack1(const int n, const int m_size,
                          UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                          t1 **one) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii]=static_cast<numtyp>(one[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }

   /// Pack LAMMPS atom type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void type_pack2(const int n, const int m_size,
                          UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                          t1 **one, t2 **two) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*2]=static_cast<numtyp>(one[i][j]);
         buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }

   /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3>
   inline void type_pack4(const int n, const int m_size,
                          UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                          t1 **one, t2 **two, t3 **three) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*4]=static_cast<numtyp>(one[i][j]);
         buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
         buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }

   /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3, class t4>
   inline void type_pack4(const int n, const int m_size,
                          UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                          t1 **one, t2 **two, t3 **three, t4 **four) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
         buffer[ii*4]=static_cast<numtyp>(one[i][j]);
         buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
         buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
         buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
         ii++;
       }
       ii+=m_size-n;
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
     ucl_copy(dev_v,view,false);
   }

   /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                          UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
     for (int i=0; i<n; i++) {
       buffer[i*2]=static_cast<numtyp>(one[i][i]);
       buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
     }
     UCL_H_Vec<dev_typ> view;
     view.view((dev_typ*)buffer.begin(),n,*dev);
     ucl_copy(dev_v,view,false);
   }
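
The type_pack* helpers above flatten an n x n coefficient table into a row-padded m_size x m_size buffer so device kernels can index it with a fixed stride. A self-contained sketch of just the padding arithmetic (plain arrays; the names here are illustrative only):

#include <cstdio>

int main() {
  const int n=2, m_size=4;              // 2 atom types, padded row width 4
  double one[n][n]={{1.0,2.0},{3.0,4.0}};
  double buffer[m_size*m_size]={0.0};

  int ii=0;
  for (int i=0; i<n; i++) {
    for (int j=0; j<n; j++)
      buffer[ii++]=one[i][j];
    ii+=m_size-n;                       // skip the padding at end of row
  }
  // lookup for the pair (i=1, j=0) uses the padded row stride:
  printf("coeff(1,0)=%g\n",buffer[1*m_size+0]);   // prints 3
  return 0;
}
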
   // -------------------------COPY TO GPU ----------------------------------

+  /// Signal that we need to transfer atom data for next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; }
+
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
     if (_x_avail==false) {
       double t=MPI_Wtime();
       #ifdef GPU_CAST
       memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
       memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
       #else
       numtyp *_write_loc=host_x.begin();
       for (int i=0; i<_nall; i++) {
         *_write_loc=host_ptr[i][0];
         _write_loc++;
         *_write_loc=host_ptr[i][1];
         _write_loc++;
         *_write_loc=host_ptr[i][2];
         _write_loc++;
         *_write_loc=host_type[i];
         _write_loc++;
       }
       #endif
       _time_cast+=MPI_Wtime()-t;
     }
   }

   /// Copy positions and types to device asynchronously
   /** Copies nall() elements **/
   inline void add_x_data(double **host_ptr, int *host_type) {
     time_pos.start();
     if (_x_avail==false) {
       #ifdef GPU_CAST
       ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
       ucl_copy(dev_type_cast,host_type_cast,_nall,true);
       int block_size=64;
       int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
       k_cast_x.set_size(GX,block_size);
       k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
                    &_nall);
       #else
       ucl_copy(dev_x,host_x,_nall*4,true);
       #endif
       _x_avail=true;
     }
     time_pos.stop();
   }

   /// Calls cast_x_data and add_x_data and times the routines
   inline void cast_copy_x(double **host_ptr, int *host_type) {
     cast_x_data(host_ptr,host_type);
     add_x_data(host_ptr,host_type);
   }

   // Cast charges to write buffer
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
     if (_q_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
           host_q.view((numtyp*)host_ptr,_nall,*dev);
           dev_q.view(host_q);
         } else
           for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
       } else {
         if (sizeof(numtyp)==sizeof(double))
           memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
         else
           for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
       }
       _time_cast+=MPI_Wtime()-t;
     }
   }

   // Copy charges to device asynchronously
   inline void add_q_data() {
     if (_q_avail==false) {
       ucl_copy(dev_q,host_q,_nall,true);
       _q_avail=true;
     }
   }

   // Cast quaternions to write buffer
   template<class cpytyp>
   inline void cast_quat_data(cpytyp *host_ptr) {
     if (_quat_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
           host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
           dev_quat.view(host_quat);
         } else
           for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
       } else {
         if (sizeof(numtyp)==sizeof(double))
           memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
         else
           for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
       }
       _time_cast+=MPI_Wtime()-t;
     }
   }

   // Copy quaternions to device
   /** Copies nall()*4 elements **/
   inline void add_quat_data() {
     if (_quat_avail==false) {
       ucl_copy(dev_quat,host_quat,_nall*4,true);
       _quat_avail=true;
     }
   }

   /// Copy data other than pos and type to device
   inline void add_other_data() {
     if (_other) {
       time_other.start();
       if (_charge)
         add_q_data();
       if (_rot)
         add_quat_data();
       time_other.stop();
     }
   }

   /// Return number of bytes used on device
   inline double gpu_bytes() { return _gpu_bytes; }
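
The cast_*/add_* pairs above all follow the same dirty-flag discipline: work is skipped while the matching *_avail flag is true, and the new data_unavail() resets all flags once per timestep. A toy version of that pattern, with printf standing in for the actual ucl_copy() traffic:

#include <cstdio>

struct LazyBuffer {
  bool avail=false;          // true once the current data is on the device
  void invalidate() { avail=false; }
  void upload(const double *src, int n) {
    if (avail) return;       // skip redundant host->device traffic
    printf("uploading %d values\n",n);   // stand-in for ucl_copy(...)
    avail=true;
  }
};

int main() {
  double x[3]={0.0,1.0,2.0};
  LazyBuffer buf;
  buf.upload(x,3);           // copies
  buf.upload(x,3);           // skipped: nothing changed
  buf.invalidate();          // new timestep: positions moved
  buf.upload(x,3);           // copies again
  return 0;
}
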
-  // -------------------------COPY FROM GPU -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
-
   // ------------------------------ DATA ----------------------------------

   /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
   UCL_D_Vec<numtyp> dev_x;
   /// Charges
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;

   #ifdef GPU_CAST
   UCL_D_Vec<double> dev_x_cast;
   UCL_D_Vec<int> dev_type_cast;
   UCL_H_Vec<double> host_x_cast;
   UCL_H_Vec<int> host_type_cast;
   #endif

   /// Buffer for moving positions to device
   UCL_H_Vec<numtyp> host_x;
   /// Buffer for moving charge data to GPU
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;

   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<unsigned> dev_cell_id;
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<int> dev_particle_id;
   /// Atom tag information for device nbor builds
   UCL_D_Vec<int> dev_tag;

   /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_other;

   /// Geryon device
   UCL_Device *dev;

  private:
   #ifdef GPU_CAST
   UCL_Program *atom_program;
   UCL_Kernel k_cast_x;
   void compile_kernels(UCL_Device &dev);
   #endif

   bool _compiled;

   // True if data has been copied to device already
-  int _x_avail, _q_avail, _quat_avail;
+  bool _x_avail, _q_avail, _quat_avail;

-  bool alloc(const int inum, const int nall);
+  bool alloc(const int nall);

-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
   bool _gpu_nbor, _bonds;
-  int *_ilist;
   double _time_cast;
   double _gpu_bytes;

-  bool _newton;
-
   #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
   CUDPPHandle sort_plan;
   #endif
 };

 #endif
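
For contrast with the strided engv layout shown earlier, the removed host_ans buffer packs four accumulator slots per atom (fx, fy, fz plus one padding slot), which is why the deleted get_answers() advances the pointer by 2 after reading the z component. The same unpacking in isolation (hypothetical values, no LAMMPS types):

#include <cstdio>

int main() {
  const int inum=2;
  // fake device results: {fx,fy,fz,pad} per atom
  double ans[4*inum]={1,2,3,0, 4,5,6,0};
  double f[inum][3]={{0,0,0},{0,0,0}};

  const double *ap=ans;
  for (int i=0; i<inum; i++) {
    f[i][0]+=*ap; ap++;
    f[i][1]+=*ap; ap++;
    f[i][2]+=*ap; ap+=2;     // skip the padding slot, as in get_answers()
  }
  printf("f[1]=(%g,%g,%g)\n",f[1][0],f[1][1],f[1][2]);
  return 0;
}
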
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 30206d72e..718e9d9dd 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -1,289 +1,290 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov

    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.

    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */

 #include "pair_gpu_device.h"
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif

 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>

 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
                                   _last_device(0) {
 }

 template <class numtyp, class acctyp>
 PairGPUDeviceT::~PairGPUDevice() {
   clear_device();
 }

 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
                                  const int nthreads) {
   _nthreads=nthreads;
   #ifdef _OPENMP
   omp_set_num_threads(nthreads);
   #endif

   if (_device_init)
     return true;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
   _first_device=first_gpu;
   _last_device=last_gpu;
   _gpu_mode=gpu_mode;
   _particle_split=p_split;

   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
   MPI_Comm_size(_comm_world,&_world_size);
   // Get the rank/size within the replica
   MPI_Comm_rank(_comm_replica,&_replica_me);
   MPI_Comm_size(_comm_replica,&_replica_size);

   // Get the names of all nodes
   int name_length;
   char node_name[MPI_MAX_PROCESSOR_NAME];
   char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
   MPI_Get_processor_name(node_name,&name_length);
   MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                 MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
   std::string node_string=std::string(node_name);

   // Get the number of procs per node
   std::map<std::string,int> name_map;
   std::map<std::string,int>::iterator np;
   for (int i=0; i<_world_size; i++) {
     std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
     np=name_map.find(i_string);
     if (np==name_map.end())
       name_map[i_string]=1;
     else
       np->second++;
   }
   int procs_per_node=name_map.begin()->second;

   // Assign a unique id to each node
   int split_num=0, split_id=0;
   for (np=name_map.begin(); np!=name_map.end(); ++np) {
     if (np->first==node_string)
       split_id=split_num;
     split_num++;
   }

   // Set up a per node communicator and find rank within
   MPI_Comm node_comm;
   MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
   int node_rank;
   MPI_Comm_rank(node_comm,&node_rank);

   // set the device ID
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
   int my_gpu=node_rank/_procs_per_gpu;

   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
   MPI_Comm_rank(_comm_gpu,&_gpu_rank);

   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
     return false;
   gpu->set(my_gpu);
   return true;
 }
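
init_device() above maps each process to a device by splitting a per-node communicator and dividing node-local ranks among the usable GPUs with a ceil division. The mapping policy in isolation, without MPI (the counts here are made up):

#include <cmath>
#include <cstdio>

int main() {
  const int procs_per_node=5, first_gpu=0, last_gpu=1;
  const int ngpu=last_gpu-first_gpu+1;
  const int procs_per_gpu=
    static_cast<int>(ceil(static_cast<double>(procs_per_node)/ngpu));  // 3
  for (int node_rank=0; node_rank<procs_per_node; node_rank++)
    printf("node rank %d -> GPU %d\n",node_rank,node_rank/procs_per_gpu);
  // ranks 0-2 share GPU 0, ranks 3-4 share GPU 1
  return 0;
}
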
 template <class numtyp, class acctyp>
 bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
                           const int host_nlocal, const int nall,
                           PairGPUNbor *nbor, const int maxspecial,
                           const bool gpu_nbor, const int gpu_host,
                           const int max_nbors, const double cell_size,
                           const bool pre_cut) {
   if (!_device_init)
     return false;

   // Initial number of local particles
   int ef_nlocal=nlocal;
   if (_particle_split<1.0 && _particle_split>0.0)
     ef_nlocal=static_cast<int>(_particle_split*nlocal);

   if (_init_count==0) {
     // Initialize atom and nbor data
     if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
                    gpu_nbor && maxspecial>0))
       return false;
   } else
     atom.add_fields(charge,rot);

   if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut))
     return false;
   nbor->cell_size(cell_size);

   _init_count++;
   return true;
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::init_message(FILE *screen, const char *name,
                                   const int first_gpu, const int last_gpu) {
   #ifdef USE_OPENCL
   std::string fs="";
   #else
   std::string fs=toa(gpu->free_gigabytes())+"/";
   #endif

   if (_replica_me == 0 && screen) {
     fprintf(screen,"\n-------------------------------------");
     fprintf(screen,"-------------------------------------\n");
     fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
     fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
     #ifdef _OPENMP
     fprintf(screen,"- with %d threads per proc.\n",_nthreads);
     #endif
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n");

     for (int i=first_gpu; i<=last_gpu; i++) {
       std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                         toa(gpu->gigabytes(i))+" GB, "+
                         toa(gpu->clock_rate(i))+" GHZ (";
       if (sizeof(PRECISION)==4) {
         if (sizeof(ACC_PRECISION)==4)
           sname+="Single Precision)";
         else
           sname+="Mixed Precision)";
       } else
         sname+="Double Precision)";
       fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
     }

     fprintf(screen,"-------------------------------------");
     fprintf(screen,"-------------------------------------\n\n");
   }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
                                   const double avg_split,
                                   const double max_bytes, FILE *screen) {
   double single[5], times[5];

   single[0]=atom.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
   single[4]=atom.cast_time();

   MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);

   double my_max_bytes=max_bytes;
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              _comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);

   if (replica_me()==0)
     if (screen && times[3]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");

       if (procs_per_gpu()==1) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[4]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
         if (nbor.gpu_nbor())
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
       }
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
       fprintf(screen,"-------------------------------------");
       fprintf(screen,"--------------------------------\n\n");
     }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
     _init_count--;
     if (_init_count==0) {
       atom.clear();
       _nbor_shared.clear();
     }
   }
 }

 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear_device() {
   while (_init_count>0)
     clear();
   if (_device_init) {
     delete gpu;
     _device_init=false;
   }
 }
 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
   return atom.host_memory_usage()+4*sizeof(numtyp)+
          sizeof(PairGPUDevice<numtyp,acctyp>);
 }

 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

 bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                      const int last_gpu, const int gpu_mode,
                      const double particle_split, const int nthreads) {
   return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,
                                      gpu_mode,particle_split,nthreads);
 }

 void lmp_clear_device() {
   pair_gpu_device.clear_device();
 }

 double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom,
                       double *virial, double &ecoul) {
   if (pair_gpu_device.init_count()) {
     pair_gpu_device.stop_host_timer();
     pair_gpu_device.gpu->sync();
     double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
     pair_gpu_device.atom.get_answers(f,tor);
+    pair_gpu_device.atom.data_unavail();
     return evdw;
   }
   return 0.0;
 }
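
output_times() above averages per-process timers by MPI_SUM-reducing them to rank 0 and dividing by the replica size. The same reduction pattern as a free-standing MPI program (the timer value is faked; compile with mpicxx, run with mpirun):

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc,&argv);
  int me, size;
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  double single=0.1*(me+1);   // stand-in for a per-proc timer total
  double sum;
  MPI_Reduce(&single,&sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
  if (me==0)
    printf("Force calc: %.4f s.\n",sum/size);   // average over procs

  MPI_Finalize();
  return 0;
}
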
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 8f24e0231..1ef85c78a 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -1,156 +1,162 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov

    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.

    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */

 #ifndef PAIR_GPU_DEVICE_H
 #define PAIR_GPU_DEVICE_H

 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>

 template <class numtyp, class acctyp>
 class PairGPUDevice {
  public:
   PairGPUDevice();
   ~PairGPUDevice();

   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
     * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
   bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                    const int last_gpu, const int gpu_mode,
                    const double particle_split, const int nthreads);

   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
     * \param nlocal Total number of local particles to allocate memory for
     * \param host_nlocal Initial number of host particles to allocate memory for
     * \param nall Total number of local+ghost particles
     * \param gpu_nbor True if neighboring is performed on device
     * \param gpu_host 0 if host will not perform force calculations,
     *                 1 if gpu_nbor is true, and host needs a half nbor list,
     *                 2 if gpu_nbor is true, and host needs a full nbor list
     * \param max_nbors Initial number of rows in the neighbor matrix
     * \param cell_size cutoff+skin
     * \param pre_cut True if cutoff test will be performed in separate kernel
     *                than the force kernel **/
   bool init(const bool charge, const bool rot, const int nlocal,
             const int host_nlocal, const int nall, PairGPUNbor *nbor,
             const int maxspecial, const bool gpu_nbor, const int gpu_host,
             const int max_nbors, const double cell_size, const bool pre_cut);

   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name, const int first_gpu,
                     const int last_gpu);

   /// Output a message with timing information
   void output_times(UCL_Timer &time_pair, PairGPUNbor &nbor,
                     const double avg_split, const double max_bytes,
                     FILE *screen);

   /// Clear all memory on host and device associated with atom and nbor data
   void clear();

   /// Clear all memory on host and device
   void clear_device();

+  /// Add an answer object for putting forces, energies, etc. from GPU to LAMMPS
+  inline void add_ans_object(PairGPUAns *ans) { ans_queue.push(ans); }
+
   /// Start timer on host
   inline void start_host_timer() { _cpu_full=MPI_Wtime(); }

   /// Stop timer on host
   inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }

   /// Return host time
   inline double host_time() { return _cpu_full; }

   /// Return host memory usage in bytes
   double host_memory_usage() const;

   /// Return the number of procs sharing a device (size of device communicator)
   inline int procs_per_gpu() const { return _procs_per_gpu; }
   /// Return the number of threads per proc
   inline int num_threads() const { return _nthreads; }
   /// My rank within all processes
   inline int world_me() const { return _world_me; }
   /// Total number of processes
   inline int world_size() const { return _world_size; }
   /// MPI Barrier for world
   inline void world_barrier() { MPI_Barrier(_comm_world); }
   /// Return the replica MPI communicator
   inline MPI_Comm & replica() { return _comm_replica; }
   /// My rank within replica communicator
   inline int replica_me() const { return _replica_me; }
   /// Number of procs in replica communicator
   inline int replica_size() const { return _replica_size; }
   /// Return the per-GPU MPI communicator
   inline MPI_Comm & gpu_comm() { return _comm_gpu; }
   /// Return my rank in the device communicator
   inline int gpu_rank() const { return _gpu_rank; }
   /// MPI Barrier for gpu
   inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
   /// Return the 'mode' for acceleration: GPU_FORCE or GPU_NEIGH
   inline int gpu_mode() const { return _gpu_mode; }
   /// Index of first device used by a node
   inline int first_device() const { return _first_device; }
   /// Index of last device used by a node
   inline int last_device() const { return _last_device; }
   /// Particle split defined in fix
   inline double particle_split() const { return _particle_split; }
   /// Return the initialization count for the device
   inline int init_count() const { return _init_count; }

   // -------------------------- DEVICE DATA -------------------------

   /// Geryon Device
   UCL_Device *gpu;

   enum{GPU_FORCE, GPU_NEIGH};

   // --------------------------- ATOM DATA --------------------------

   /// Atom Data
   PairGPUAtom<numtyp,acctyp> atom;

   // --------------------------- NBOR DATA ----------------------------

   /// Neighbor Data
   PairGPUNborShared _nbor_shared;

  private:
+  std::queue<PairGPUAns *> ans_queue;
   int _init_count;
   bool _device_init;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size,
       _replica_me, _replica_size;
   int _gpu_mode, _first_device, _last_device, _nthreads;
   double _particle_split;
   double _cpu_full;

   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
     o.precision(2);
     o << in;
     return o.str();
   }
 };

 #endif
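
The new ans_queue and add_ans_object() hook (together with the pair_gpu_ans.h include) suggest a register-then-drain arrangement for per-style answer objects. pair_gpu_ans.h itself is not part of this diff, so the sketch below is only a guess at the shape, with AnswerObj as a stand-in for the real PairGPUAns:

#include <queue>
#include <cstdio>

struct AnswerObj {
  int id;
  void get_answers() { printf("draining answers for style %d\n",id); }
};

struct Device {
  std::queue<AnswerObj *> ans_queue;
  void add_ans_object(AnswerObj *ans) { ans_queue.push(ans); }
  void drain() {                      // e.g. after the force kernels finish
    while (!ans_queue.empty()) {
      ans_queue.front()->get_answers();
      ans_queue.pop();
    }
  }
};

int main() {
  AnswerObj a{0}, b{1};
  Device dev;
  dev.add_ans_object(&a);
  dev.add_ans_object(&b);
  dev.drain();
  return 0;
}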