diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia index 47e1fefbe..4bb2c3fef 100644 --- a/lib/gpu/Makefile.nvidia +++ b/lib/gpu/Makefile.nvidia @@ -1,75 +1,65 @@ #*************************************************************************** # Makefile # ------------------- # W. Michael Brown # # _________________________________________________________________________ # Build for the LAMMPS GPU Force Library # # _________________________________________________________________________ # # begin : Tue June 23 2009 # copyright : (C) 2009 by W. Michael Brown # email : wmbrown@sandia.gov # ***************************************************************************/ BIN_DIR = . OBJ_DIR = . AR = ar CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math CUDA_ARCH = -maxrregcount 128 #-arch=sm_13 CUDA_PREC = -D_SINGLE_SINGLE CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB) CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC) CUDA_LIB = $(OBJ_DIR)/libgpu.a # Headers for CUDA Stuff -NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h +NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h # Headers for Pair Stuff PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h +# Dependencies for the Texture Tar +TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \ + lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \ + gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu ALL_H = $(NVC_H) $(PAIR_H) EXECS = $(BIN_DIR)/nvc_get_devices -OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \ - $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \ - $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \ - $(OBJ_DIR)/pair_gpu_nbor.cu_o +OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \ + $(OBJ_DIR)/pair_tex_tar.cu_o all: $(CUDA_LIB) $(EXECS) $(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H) $(CUDA) -o $@ -c nvc_device.cu -$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H) - $(CUDA) -o $@ -c pair_gpu_atom.cu - $(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H) $(CUDA) -o $@ -c pair_gpu_nbor.cu -$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H) - $(CUDA) -o $@ -c lj_gpu_memory.cu - -$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h - $(CUDA) -o $@ -c lj_gpu.cu - -$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H) - $(CUDA) -o $@ -c gb_gpu_memory.cu - -$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h - $(CUDA) -o $@ -c gb_gpu.cu +$(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H) + $(CUDA) -o $@ -c pair_tex_tar.cu $(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o $(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o $(CUDA_LIB): $(OBJS) $(AR) -crusv $(CUDA_LIB) $(OBJS) clean: rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo veryclean: clean rm -rf *~ *.linkinfo diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu index cde56d569..174161331 100644 --- a/lib/gpu/gb_gpu.cu +++ b/lib/gpu/gb_gpu.cu @@ -1,518 +1,518 @@ /*************************************************************************** gb_gpu.cu ------------------- W. 
Michael Brown Gay-Berne anisotropic potential GPU calculation *** Force decomposition by Atom Version *** __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Jun 23 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ #include <iostream> #include <cassert> #include "nvc_macros.h" #include "nvc_timer.h" #include "nvc_device.h" -#include "gb_gpu_memory.h" +#include "gb_gpu_memory.cu" #include "gb_gpu_kernel.h" using namespace std; static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS]; #define GBMT GB_GPU_Memory<numtyp,acctyp> // --------------------------------------------------------------------------- // Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access // -- Only pack neighbors matching the specified inclusive range of forms // -- Only pack neighbors within cutoff // --------------------------------------------------------------------------- template<class numtyp> __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch, const int start, const int inum, const int *dev_ij, const int form_low, const int form_high, const int nall) { // ii indexes the two interacting particles in gi int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start; if (ii<inum) { int *nbor=dev_nbor+ii; int i=*nbor; nbor+=nbor_pitch; int numj=*nbor; nbor+=nbor_pitch; const int *list=dev_ij+*nbor; const int *list_end=list+numj; nbor+=nbor_pitch; int *nbor_newj=nbor; nbor+=nbor_pitch; numtyp ix=_x_<numtyp>(i,0); numtyp iy=_x_<numtyp>(i,1); numtyp iz=_x_<numtyp>(i,2); int itype=_x_<numtyp>(i,7); int newj=0; for ( ; list<list_end; list++) { int j=*list; if (j>=nall) j%=nall; int jtype=_x_<numtyp>(j,7); if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) { // Compute r12 numtyp rsq=_x_<numtyp>(j,0)-ix; rsq*=rsq; numtyp t=_x_<numtyp>(j,1)-iy; rsq+=t*t; t=_x_<numtyp>(j,2)-iz; rsq+=t*t; if (rsq< _cutsq_<numtyp>(itype,jtype)) { *nbor=j; nbor+=nbor_pitch; newj++; } } } *nbor_newj=newj; } } // --------------------------------------------------------------------------- // Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access // -- Only pack neighbors matching the specified inclusive range of forms // -- Only pack neighbors within cutoff // -- Fast version of routine that uses shared memory for LJ constants // --------------------------------------------------------------------------- template<class numtyp> __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch, const int start, const int inum, const int *dev_ij, const int form_low, const int form_high, const int nall) { int ii=threadIdx.x; __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { int itype=ii/MAX_SHARED_TYPES; int jtype=ii%MAX_SHARED_TYPES; cutsq[ii]=_cutsq_<numtyp>(itype,jtype); form[ii]=_form_(itype,jtype); }
ii+=INT_MUL(blockIdx.x,blockDim.x)+start; if (ii<inum) { int *nbor=dev_nbor+ii; int i=*nbor; nbor+=nbor_pitch; int numj=*nbor; nbor+=nbor_pitch; const int *list=dev_ij+*nbor; const int *list_end=list+numj; nbor+=nbor_pitch; int *nbor_newj=nbor; nbor+=nbor_pitch; numtyp ix=_x_<numtyp>(i,0); numtyp iy=_x_<numtyp>(i,1); numtyp iz=_x_<numtyp>(i,2); int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7)); int newj=0; for ( ; list<list_end; list++) { int j=*list; if (j>=nall) j%=nall; int mtype=itype+_x_<numtyp>(j,7); if (form[mtype]>=form_low && form[mtype]<=form_high) { // Compute r12; numtyp rsq=_x_<numtyp>(j,0)-ix; rsq*=rsq; numtyp t=_x_<numtyp>(j,1)-iy; rsq+=t*t; t=_x_<numtyp>(j,2)-iz; rsq+=t*t; if (rsq<cutsq[mtype]) { *nbor=j; nbor+=nbor_pitch; newj++; } } } *nbor_newj=newj; } } template<class numtyp, class acctyp> void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high) { if (gbm.shared_types) kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); else kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum, gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall()); } // --------------------------------------------------------------------------- // Convert something to a string // --------------------------------------------------------------------------- #include <sstream> template <class t> inline string gb_gpu_toa(const t& in) { ostringstream o; o.precision(2); o << in; return o.str(); } // --------------------------------------------------------------------------- // Return string with GPU info // --------------------------------------------------------------------------- string gb_gpu_name(const int id, const int max_nbors) { string name=GBMF[0].gpu.name(id)+", "+ gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+ gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+ gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+ gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id), max_nbors))+" Atoms"; return name; } // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma, const double upsilon, const double mu, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int max_nbors, const int thread, const int gpu_id) { assert(thread<MAX_GPU_THREADS); if (GBMF[thread].gpu.num_devices()==0) return 0; ij_size=IJ_SIZE; return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, host_lshape, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, max_nbors, false, gpu_id); } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void gb_gpu_clear(const int thread) { GBMF[thread].clear(); } // --------------------------------------------------------------------------- // copy atom positions, quaternions, and optionally types to device // 
--------------------------------------------------------------------------- template <class numtyp, class acctyp> inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x, double **host_quat, const int *host_type, const bool rebuild, cudaStream_t &stream) { atom.time_atom.start(); atom.reset_write_buffer(); // Rows 1-3 of dev_x are position; rows 4-7 are quaternion atom.add_atom_data(host_x[0],3); atom.add_atom_data(host_x[0]+1,3); atom.add_atom_data(host_x[0]+2,3); atom.add_atom_data(host_quat[0],4); atom.add_atom_data(host_quat[0]+1,4); atom.add_atom_data(host_quat[0]+2,4); atom.add_atom_data(host_quat[0]+3,4); int csize=7; // If a rebuild occurred, copy type data if (rebuild) { atom.add_atom_data(host_type); csize++; } atom.copy_atom_data(csize,stream); atom.time_atom.stop(); } void gb_gpu_atom(double **host_x, double **host_quat, const int *host_type, const bool rebuild, const int thread) { _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild, GBMF[thread].pair_stream); } // --------------------------------------------------------------------------- // Signal that we need to transfer a new neighbor list // --------------------------------------------------------------------------- template <class gbmtyp> int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, const int inum, int *ilist, const int *numj, const int *type, bool &success) { if (nall>gbm.max_atoms) { success=false; return 0; } success=true; gbm.nbor.time_nbor.start(); gbm.atom.nall(nall); gbm.atom.inum(inum); if (gbm.multiple_forms) { int p=0, acc=0; for (int i=0; i<inum; i++) { int itype=type[ilist[i]]; if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) { gbm.host_olist[p]=ilist[i]; gbm.nbor.host_ij[p]=numj[ilist[i]]; gbm.nbor.host_ij[p+inum]=acc; acc+=numj[ilist[i]]; p++; } } gbm.last_ellipse=p; for (int i=0; i<inum; i++) { int itype=type[ilist[i]]; if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) { gbm.host_olist[p]=ilist[i]; gbm.nbor.host_ij[p]=numj[ilist[i]]; gbm.nbor.host_ij[p+inum]=acc; acc+=numj[ilist[i]]; p++; } } gbm.nbor.ij_total=0; gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum); gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+ gbm.nbor.dev_nbor.row_size(), gbm.nbor.dev_nbor.row_size(),2,inum, gbm.pair_stream); } else { gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream); gbm.last_ellipse=inum; } gbm.nbor.time_nbor.stop(); if (gbm.multiple_forms) return gbm.host_olist.begin(); return ilist; } int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, int *ilist, const int *numj, const int *type, const int thread, bool &success) { return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type, success); } // --------------------------------------------------------------------------- // Copy a set of ij_size ij interactions to device and compute energies, // forces, and torques for those interactions // --------------------------------------------------------------------------- template <class gbmtyp> void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) { gbm.nbor.time_nbor.add_to_total(); // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed gbm.nbor.time_nbor.start(); gbm.nbor.add(num_ij,gbm.pair_stream); gbm.nbor.time_nbor.stop(); } void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) { _gb_gpu_nbors(GBMF[thread],num_ij,eflag); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques for all ij
interactions // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, const bool rebuild) { // Compute the block size and grid size to keep all cores busy const int BX=BLOCK_1D; int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX)); if (gbm.multiple_forms) { gbm.time_kernel.start(); if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/ static_cast<double>(BX))); pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.nall()); gbm.time_gayberne.stop(); if (gbm.last_ellipse==gbm.atom.inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); gbm.time_gayberne2.stop(); gbm.time_pair.start(); gbm.time_pair.stop(); return; } // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()- gbm.last_ellipse)/BX)); pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE, ELLIPSE_SPHERE); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); gbm.time_gayberne2.stop(); } else { gbm.atom.ans.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); gbm.time_gayberne2.stop(); } // ------------ LJ --------------- gbm.time_pair.start(); if (gbm.last_ellipse<gbm.atom.inum()) { if (gbm.shared_types) kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); else kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall()); } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>> (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), eflag, vflag, gbm.atom.inum(), gbm.atom.nall()); gbm.time_gayberne.stop(); } } void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, const int thread) { _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild); } // 
--------------------------------------------------------------------------- // Get energies, forces, and torques to host // --------------------------------------------------------------------------- template<class numtyp, class acctyp> double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial) { double evdw; gbm.atom.time_answer.start(); gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream); gbm.atom.time_atom.add_to_total(); gbm.nbor.time_nbor.add_to_total(); gbm.time_kernel.add_to_total(); gbm.time_gayberne.add_to_total(); if (gbm.multiple_forms) { gbm.time_kernel2.add_to_total(); gbm.time_gayberne2.add_to_total(); gbm.time_pair.add_to_total(); } // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); gbm.atom.add_forces(ilist,f); gbm.atom.add_torques(ilist,tor,gbm.last_ellipse); gbm.atom.time_answer.stop(); gbm.atom.time_answer.add_to_total(); return evdw; } double gb_gpu_forces(double **f, double **tor, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial, const int thread) { return _gb_gpu_forces<PRECISION,ACC_PRECISION> (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom, vflag_atom,eatom,vatom,virial); } void gb_gpu_time(const int i) { cout.precision(4); cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds() << " s.\n" << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() << " s.\n" << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+ GBMF[i].time_kernel2.total_seconds() << " s.\n" << "Force calc: " << GBMF[i].time_gayberne.total_seconds()+ GBMF[i].time_gayberne2.total_seconds()<< " s.\n"; if (GBMF[i].multiple_forms) cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n"; cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds() << " s.\n"; } int gb_gpu_num_devices() { return GBMF[0].gpu.num_devices(); } double gb_gpu_bytes() { return GBMF[0].host_memory_usage(); } diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index c918e0707..87bcecb3c 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -1,337 +1,337 @@ /*************************************************************************** gb_gpu_extra.h ------------------- W. Michael Brown Inline GPU kernel routines ala math_extra for the CPU. __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Jun 23 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. 
----------------------------------------------------------------------- */ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H #include "math.h" #include "stdio.h" #include "string.h" /* ---------------------------------------------------------------------- Atomic update of global memory ------------------------------------------------------------------------- */ /* template <class numtyp> __device__ inline void atomicAdd(numtyp *address, numtyp val); template <> __device__ inline void atomicAdd<float>(float *address, float val) { int i_val = __float_as_int(val); int tmp0 = 0; int tmp1; while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) { tmp0 = tmp1; i_val = __float_as_int(val + __int_as_float(tmp1)); } }*/ /* ---------------------------------------------------------------------- dot product of 2 vectors ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2) { return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]; } /* ---------------------------------------------------------------------- cross product of 2 vectors ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) { ans[0] = v1[1]*v2[2]-v1[2]*v2[1]; ans[1] = v1[2]*v2[0]-v1[0]*v2[2]; ans[2] = v1[0]*v2[1]-v1[1]*v2[0]; } /* ---------------------------------------------------------------------- determinant of a matrix ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ numtyp gpu_det3(const numtyp m[9]) { numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; return ans; } /* ---------------------------------------------------------------------- diagonal matrix times a full matrix ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9], numtyp ans[9]) { ans[0] = _well_<numtyp>(i,0)*m[0]; ans[1] = _well_<numtyp>(i,0)*m[1]; ans[2] = _well_<numtyp>(i,0)*m[2]; ans[3] = _well_<numtyp>(i,1)*m[3]; ans[4] = _well_<numtyp>(i,1)*m[4]; ans[5] = _well_<numtyp>(i,1)*m[5]; ans[6] = _well_<numtyp>(i,2)*m[6]; ans[7] = _well_<numtyp>(i,2)*m[7]; ans[8] = _well_<numtyp>(i,2)*m[8]; } /* ---------------------------------------------------------------------- diagonal matrix times a full matrix ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9], numtyp ans[9]) { ans[0] = _shape_<numtyp>(i,0)*m[0]; ans[1] = _shape_<numtyp>(i,0)*m[1]; ans[2] = _shape_<numtyp>(i,0)*m[2]; ans[3] = _shape_<numtyp>(i,1)*m[3]; ans[4] = _shape_<numtyp>(i,1)*m[4]; ans[5] = _shape_<numtyp>(i,1)*m[5]; ans[6] = _shape_<numtyp>(i,2)*m[6]; ans[7] = _shape_<numtyp>(i,2)*m[7]; ans[8] = _shape_<numtyp>(i,2)*m[8]; } /* ---------------------------------------------------------------------- add two matrices ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9]) { ans[0] = m[0]+m2[0]; ans[1] = m[1]+m2[1]; ans[2] = m[2]+m2[2]; ans[3] = m[3]+m2[3]; ans[4] = m[4]+m2[4]; ans[5] = m[5]+m2[5]; ans[6] = m[6]+m2[6]; 
ans[7] = m[7]+m2[7]; ans[8] = m[8]+m2[8]; } /* ---------------------------------------------------------------------- multiply the transpose of mat1 times mat2 ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9], numtyp ans[9]) { ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6]; ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7]; ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8]; ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6]; ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7]; ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8]; ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6]; ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7]; ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8]; } /* ---------------------------------------------------------------------- row vector times matrix ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans) { ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6]; ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7]; ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2]; } /* ---------------------------------------------------------------------- solve Ax = b or M ans = v use gaussian elimination & partial pivoting on matrix error_flag set to 2 if bad matrix inversion attempted ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, int *error_flag) { // create augmented matrix for pivoting numtyp aug[12], t; aug[3] = v[0]; aug[0] = m[0]; aug[1] = m[1]; aug[2] = m[2]; aug[7] = v[1]; aug[4] = m[3]; aug[5] = m[4]; aug[6] = m[5]; aug[11] = v[2]; aug[8] = m[6]; aug[9] = m[7]; aug[10] = m[8]; if (fabs(aug[4]) > fabs(aug[0])) { numtyp swapt; swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; } if (fabs(aug[8]) > fabs(aug[0])) { numtyp swapt; swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; } if (aug[0] != (numtyp)0.0) { if (0!=0) { numtyp swapt; swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt; swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt; swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt; swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt; } } else if (aug[4] != (numtyp)0.0) { if (1!=0) { numtyp swapt; swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt; swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt; swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt; swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt; } } else if (aug[8] != (numtyp)0.0) { if (2!=0) { numtyp swapt; swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt; swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt; swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt; swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt; } } else *error_flag=2; t = aug[4]/aug[0]; aug[5]-=t*aug[1]; aug[6]-=t*aug[2]; aug[7]-=t*aug[3]; t = aug[8]/aug[0]; aug[9]-=t*aug[1]; aug[10]-=t*aug[2]; aug[11]-=t*aug[3]; if (fabs(aug[9]) > fabs(aug[5])) { numtyp swapt; swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; } if (aug[5] != (numtyp)0.0) { if (1!=1) { numtyp swapt; swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt; swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt; swapt=aug[6]; 
aug[6]=aug[6]; aug[6]=swapt; swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt; } } else if (aug[9] != (numtyp)0.0) { if (2!=1) { numtyp swapt; swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt; swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt; swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt; swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt; } } t = aug[9]/aug[5]; aug[10]-=t*aug[6]; aug[11]-=t*aug[7]; if (aug[10] == (numtyp)0.0) *error_flag=2; ans[2] = aug[11]/aug[10]; t = (numtyp)0.0; t += aug[6]*ans[2]; ans[1] = (aug[7]-t) / aug[5]; t = (numtyp)0.0; t += aug[1]*ans[1]; t += aug[2]*ans[2]; ans[0] = (aug[3]-t) / aug[0]; } /* ---------------------------------------------------------------------- compute rotation matrix from quaternion conjugate quat = [w i j k] ------------------------------------------------------------------------- */ template <class numtyp> static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi, numtyp mat[9]) { numtyp qi3=_x_<numtyp>(qi,3); numtyp qi4=_x_<numtyp>(qi,4); numtyp qi5=_x_<numtyp>(qi,5); numtyp qi6=_x_<numtyp>(qi,6); numtyp w2 = qi3*qi3; numtyp i2 = qi4*qi4; numtyp j2 = qi5*qi5; numtyp k2 = qi6*qi6; - numtyp twoij = 2.0*qi4*qi5; - numtyp twoik = 2.0*qi4*qi6; - numtyp twojk = 2.0*qi5*qi6; - numtyp twoiw = 2.0*qi4*qi3; - numtyp twojw = 2.0*qi5*qi3; - numtyp twokw = 2.0*qi6*qi3; + numtyp twoij = (numtyp)2.0*qi4*qi5; + numtyp twoik = (numtyp)2.0*qi4*qi6; + numtyp twojk = (numtyp)2.0*qi5*qi6; + numtyp twoiw = (numtyp)2.0*qi4*qi3; + numtyp twojw = (numtyp)2.0*qi5*qi3; + numtyp twokw = (numtyp)2.0*qi6*qi3; mat[0] = w2+i2-j2-k2; mat[3] = twoij-twokw; mat[6] = twojw+twoik; mat[1] = twoij+twokw; mat[4] = w2-i2+j2-k2; mat[7] = twojk-twoiw; mat[2] = twoik-twojw; mat[5] = twojk+twoiw; mat[8] = w2-i2-j2+k2; } #endif diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h index 8ca047e0e..3d74d916f 100644 --- a/lib/gpu/gb_gpu_kernel.h +++ b/lib/gpu/gb_gpu_kernel.h @@ -1,861 +1,861 @@ /*************************************************************************** gb_gpu_kernel.cu ------------------- W. Michael Brown Routines that actually perform the force/torque computation *** Force Decomposition by Atom Version *** __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Jun 23 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. 
----------------------------------------------------------------------- */ #ifndef GB_GPU_KERNEL #define GB_GPU_KERNEL #include "gb_gpu_extra.h" template <class numtyp> static __inline__ __device__ void compute_eta_torque(numtyp m[9], numtyp m2[9], const int i, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = (numtyp)1.0/den; numtyp shapex=_shape_<numtyp>(i,0); numtyp shapey=_shape_<numtyp>(i,1); numtyp shapez=_shape_<numtyp>(i,2); ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; } template<class numtyp, class acctyp> __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj, const int *dev_nbor, const int nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x); if (ii<inum) { acctyp energy=(numtyp)0; acctyp fx=(numtyp)0; acctyp fy=(numtyp)0; acctyp fz=(numtyp)0; acctyp torx=(numtyp)0; acctyp tory=(numtyp)0; acctyp torz=(numtyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; const int *nbor=dev_nbor+ii; int i=*nbor; nbor+=nbor_pitch; nbor+=nbor_pitch; nbor+=nbor_pitch; int numj=*nbor; nbor+=nbor_pitch; const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj); numtyp ix=_x_<numtyp>(i,0); numtyp iy=_x_<numtyp>(i,1); numtyp iz=_x_<numtyp>(i,2); int itype=_x_<numtyp>(i,7); numtyp a1[9], b1[9], g1[9]; { numtyp t[9]; gpu_quat_to_mat_trans(i,a1); gpu_shape_times3(itype,a1,t); gpu_transpose_times3(a1,t,g1); gpu_well_times3(itype,a1,t); 
gpu_transpose_times3(a1,t,b1); } numtyp factor_lj; for ( ; nbor<nbor_end; nbor+=nbor_pitch) { int j=*nbor; if (j < nall) - factor_lj = 1.0; + factor_lj = (numtyp)1.0; else { factor_lj = sp_lj[j/nall]; j %= nall; } int jtype=_x_<numtyp>(j,7); // Compute r12 numtyp r12[3]; r12[0] = _x_<numtyp>(j,0)-ix; r12[1] = _x_<numtyp>(j,1)-iy; r12[2] = _x_<numtyp>(j,2)-iz; numtyp ir = gpu_dot3(r12,r12); ir = rsqrt(ir); numtyp r = (numtyp)1.0/ir; numtyp a2[9]; gpu_quat_to_mat_trans(j,a2); numtyp u_r, dUr[3], tUr[3], eta, teta[3]; { // Compute U_r, dUr, eta, and teta // Compute g12 numtyp g12[9]; { numtyp g2[9]; { gpu_shape_times3(jtype,a2,g12); gpu_transpose_times3(a2,g12,g2); gpu_plus3(g1,g2,g12); } { // Compute U_r and dUr // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); // -- replace r12 with r12 hat r12[0]*=ir; r12[1]*=ir; r12[2]*=ir; // -- kappa is now / r kappa[0]*=ir; kappa[1]*=ir; kappa[2]*=ir; // energy // compute u_r and dUr numtyp uslj_rsq; { // Compute distance of closest approach numtyp h12, sigma12; sigma12 = gpu_dot3(r12,kappa); sigma12 = rsqrt((numtyp)0.5*sigma12); h12 = r-sigma12; // -- kappa is now ok kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; numtyp sigma = _sigma_<numtyp>(itype,jtype); numtyp epsilon = _epsilon_<numtyp>(itype,jtype); numtyp varrho = sigma/(h12+gum[0]*sigma); numtyp varrho6 = varrho*varrho*varrho; varrho6*=varrho6; numtyp varrho12 = varrho6*varrho6; u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; temp1 = temp1*(numtyp)24.0*epsilon; uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; numtyp temp2 = gpu_dot3(kappa,r12); uslj_rsq = uslj_rsq*ir*ir; dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); } // torque for particle 1 { numtyp tempv[3], tempv2[3]; tempv[0] = -uslj_rsq*kappa[0]; tempv[1] = -uslj_rsq*kappa[1]; tempv[2] = -uslj_rsq*kappa[2]; gpu_row_times3(kappa,g1,tempv2); gpu_cross3(tempv,tempv2,tUr); } } } // Compute eta { eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype); numtyp det_g12 = gpu_det3(g12); eta = pow(eta/det_g12,gum[1]); } // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,itype,temp); numtyp temp1 = -eta*gum[1]; tempv[0] = temp1*temp[0]; tempv[1] = temp1*temp[1]; tempv[2] = temp1*temp[2]; gpu_cross3(a1,tempv,tempv2); teta[0] = tempv2[0]; teta[1] = tempv2[1]; teta[2] = tempv2[2]; tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; gpu_cross3(a1+3,tempv,tempv2); teta[0] += tempv2[0]; teta[1] += tempv2[1]; teta[2] += tempv2[2]; tempv[0] = temp1*temp[6]; tempv[1] = temp1*temp[7]; tempv[2] = temp1*temp[8]; gpu_cross3(a1+6,tempv,tempv2); teta[0] += tempv2[0]; teta[1] += tempv2[1]; teta[2] += tempv2[2]; } numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi // Compute b12 numtyp b2[9], b12[9]; { gpu_well_times3(jtype,a2,b12); gpu_transpose_times3(a2,b12,b2); gpu_plus3(b1,b2,b12); } // compute chi_12 r12[0]*=r; r12[1]*=r; r12[2]*=r; numtyp iota[3]; gpu_mldivide3(b12,r12,iota,err_flag); // -- iota is now iota/r iota[0]*=ir; iota[1]*=ir; iota[2]*=ir; r12[0]*=ir; r12[1]*=ir; r12[2]*=ir; chi = gpu_dot3(r12,iota); chi = pow(chi*(numtyp)2.0,gum[2]); // -- iota is now ok iota[0]*=r; iota[1]*=r; iota[2]*=r; numtyp temp1 = gpu_dot3(iota,r12); numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); dchi[0] = temp2*(iota[0]-temp1*r12[0]); dchi[1] = 
temp2*(iota[1]-temp1*r12[1]); dchi[2] = temp2*(iota[2]-temp1*r12[2]); // compute t_chi numtyp tempv[3]; gpu_row_times3(iota,b1,tempv); gpu_cross3(tempv,iota,tchi); temp1 = (numtyp)-4.0*ir*ir; tchi[0] *= temp1; tchi[1] *= temp1; tchi[2] *= temp1; } numtyp temp2 = factor_lj*eta*chi; if (eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; if (vflag) { r12[0]*=-r; r12[1]*=-r; r12[2]*=-r; numtyp ft=temp1*dchi[0]-temp2*dUr[0]; fx+=ft; virial[0]+=r12[0]*ft; ft=temp1*dchi[1]-temp2*dUr[1]; fy+=ft; virial[1]+=r12[1]*ft; virial[3]+=r12[0]*ft; ft=temp1*dchi[2]-temp2*dUr[2]; fz+=ft; virial[2]+=r12[2]*ft; virial[4]+=r12[0]*ft; virial[5]+=r12[1]*ft; } else { fx+=temp1*dchi[0]-temp2*dUr[0]; fy+=temp1*dchi[1]-temp2*dUr[1]; fz+=temp1*dchi[2]-temp2*dUr[2]; } // Torque on 1 temp1 = -u_r*eta*factor_lj; temp2 = -u_r*chi*factor_lj; numtyp temp3 = -chi*eta*factor_lj; torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1=virial[i]; ap1+=ans_pitch; } } *ap1=fx; ap1+=ans_pitch; *ap1=fy; ap1+=ans_pitch; *ap1=fz; ap1+=ans_pitch; *ap1=torx; ap1+=ans_pitch; *ap1=tory; ap1+=ans_pitch; *ap1=torz; } // if ii } template<class numtyp, class acctyp> __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj, const int *dev_nbor, const int nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x)+start; if (ii<inum) { acctyp energy=(numtyp)0; acctyp fx=(numtyp)0; acctyp fy=(numtyp)0; acctyp fz=(numtyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; const int *nbor=dev_nbor+ii; int i=*nbor; nbor+=nbor_pitch; nbor+=nbor_pitch; nbor+=nbor_pitch; int numj=*nbor; nbor+=nbor_pitch; const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj); numtyp ix=_x_<numtyp>(i,0); numtyp iy=_x_<numtyp>(i,1); numtyp iz=_x_<numtyp>(i,2); int itype=_x_<numtyp>(i,7); numtyp oner=_shape_<numtyp>(itype,0); numtyp one_well=_well_<numtyp>(itype,0); numtyp factor_lj; for ( ; nbor<nbor_end; nbor+=nbor_pitch) { int j=*nbor; if (j < nall) - factor_lj = 1.0; + factor_lj = (numtyp)1.0; else { factor_lj = sp_lj[j/nall]; j %= nall; } int jtype=_x_<numtyp>(j,7); // Compute r12 numtyp r12[3]; r12[0] = _x_<numtyp>(j,0)-ix; r12[1] = _x_<numtyp>(j,1)-iy; r12[2] = _x_<numtyp>(j,2)-iz; numtyp ir = gpu_dot3(r12,r12); ir = rsqrt(ir); numtyp r = (numtyp)1.0/ir; numtyp r12hat[3]; r12hat[0]=r12[0]*ir; r12hat[1]=r12[1]*ir; r12hat[2]=r12[2]*ir; numtyp a2[9]; gpu_quat_to_mat_trans(j,a2); numtyp u_r, dUr[3], eta; { // Compute U_r, dUr, eta, and teta // Compute g12 numtyp g12[9]; { { numtyp g2[9]; gpu_shape_times3(jtype,a2,g12); gpu_transpose_times3(a2,g12,g2); g12[0]=g2[0]+oner; g12[4]=g2[4]+oner; g12[8]=g2[8]+oner; g12[1]=g2[1]; g12[2]=g2[2]; g12[3]=g2[3]; g12[5]=g2[5]; g12[6]=g2[6]; g12[7]=g2[7]; } { // Compute U_r and dUr // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); // -- kappa is now / r kappa[0]*=ir; kappa[1]*=ir; kappa[2]*=ir; // energy // compute u_r and dUr numtyp uslj_rsq; { // Compute distance of closest approach numtyp h12, sigma12; sigma12 = gpu_dot3(r12hat,kappa); sigma12 = rsqrt((numtyp)0.5*sigma12); h12 = r-sigma12; // -- 
kappa is now ok kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; numtyp sigma = _sigma_<numtyp>(itype,jtype); numtyp epsilon = _epsilon_<numtyp>(itype,jtype); numtyp varrho = sigma/(h12+gum[0]*sigma); numtyp varrho6 = varrho*varrho*varrho; varrho6*=varrho6; numtyp varrho12 = varrho6*varrho6; u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; temp1 = temp1*(numtyp)24.0*epsilon; uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; numtyp temp2 = gpu_dot3(kappa,r12hat); uslj_rsq = uslj_rsq*ir*ir; dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]); dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]); dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]); } } } // Compute eta { eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype); numtyp det_g12 = gpu_det3(g12); eta = pow(eta/det_g12,gum[1]); } } numtyp chi, dchi[3]; { // Compute chi and dchi // Compute b12 numtyp b12[9]; { numtyp b2[9]; gpu_well_times3(jtype,a2,b12); gpu_transpose_times3(a2,b12,b2); b12[0]=b2[0]+one_well; b12[4]=b2[4]+one_well; b12[8]=b2[8]+one_well; b12[1]=b2[1]; b12[2]=b2[2]; b12[3]=b2[3]; b12[5]=b2[5]; b12[6]=b2[6]; b12[7]=b2[7]; } // compute chi_12 numtyp iota[3]; gpu_mldivide3(b12,r12,iota,err_flag); // -- iota is now iota/r iota[0]*=ir; iota[1]*=ir; iota[2]*=ir; chi = gpu_dot3(r12hat,iota); chi = pow(chi*(numtyp)2.0,gum[2]); // -- iota is now ok iota[0]*=r; iota[1]*=r; iota[2]*=r; numtyp temp1 = gpu_dot3(iota,r12hat); numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]); dchi[0] = temp2*(iota[0]-temp1*r12hat[0]); dchi[1] = temp2*(iota[1]-temp1*r12hat[1]); dchi[2] = temp2*(iota[2]-temp1*r12hat[2]); } numtyp temp2 = factor_lj*eta*chi; if (eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; if (vflag) { r12[0]*=-1; r12[1]*=-1; r12[2]*=-1; numtyp ft=temp1*dchi[0]-temp2*dUr[0]; fx+=ft; virial[0]+=r12[0]*ft; ft=temp1*dchi[1]-temp2*dUr[1]; fy+=ft; virial[1]+=r12[1]*ft; virial[3]+=r12[0]*ft; ft=temp1*dchi[2]-temp2*dUr[2]; fz+=ft; virial[2]+=r12[2]*ft; virial[4]+=r12[0]*ft; virial[5]+=r12[1]*ft; } else { fx+=temp1*dchi[0]-temp2*dUr[0]; fy+=temp1*dchi[1]-temp2*dUr[1]; fz+=temp1*dchi[2]-temp2*dUr[2]; } } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1=virial[i]; ap1+=ans_pitch; } } *ap1=fx; ap1+=ans_pitch; *ap1=fy; ap1+=ans_pitch; *ap1=fz; } // if ii } template<class numtyp, class acctyp> __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, const int *dev_ij, const int nbor_pitch, acctyp *ans, size_t ans_pitch, int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall) { __shared__ numtyp sp_lj[4]; // ii indexes the two interacting particles in gi int ii=threadIdx.x; if (ii<4) sp_lj[ii]=special_lj[ii]; ii+=INT_MUL(blockIdx.x,blockDim.x)+start; if (ii<inum) { acctyp energy=(numtyp)0; acctyp fx=(numtyp)0; acctyp fy=(numtyp)0; acctyp fz=(numtyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; const int *nbor=dev_nbor+ii; int i=*nbor; nbor+=nbor_pitch; int numj=*nbor; nbor+=nbor_pitch; const int *list=dev_ij+*nbor; const int *list_end=list+numj; numtyp ix=_x_<numtyp>(i,0); numtyp iy=_x_<numtyp>(i,1); numtyp iz=_x_<numtyp>(i,2); int itype=_x_<numtyp>(i,7); numtyp factor_lj; for ( ; list<list_end; list++) { int j=*list; if (j < nall) - factor_lj = 1.0; + factor_lj = (numtyp)1.0; else { factor_lj = sp_lj[j/nall]; j %= nall; } int 
jtype=_x_<numtyp>(j,7); // Compute r12 numtyp delx = ix-_x_<numtyp>(j,0); numtyp dely = iy-_x_<numtyp>(j,1); numtyp delz = iz-_x_<numtyp>(j,2); numtyp r2inv = delx*delx+dely*dely+delz*delz; if (r2inv<_cutsq_<numtyp>(itype,jtype) && _form_(itype,jtype)==SPHERE_SPHERE) { r2inv=(numtyp)1.0/r2inv; numtyp r6inv = r2inv*r2inv*r2inv; numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv- _lj1_<numtyp>(itype,jtype).y); force*=factor_lj; fx+=delx*force; fy+=dely*force; fz+=delz*force; if (eflag) { numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv- _lj3_<numtyp>(itype,jtype).y); energy+=factor_lj*(e-_offset_<numtyp>(1,1)); } if (vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1+=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1+=virial[i]; ap1+=ans_pitch; } } *ap1+=fx; ap1+=ans_pitch; *ap1+=fy; ap1+=ans_pitch; *ap1+=fz; } // if ii } template<class numtyp, class acctyp> __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, const int *dev_ij, const int nbor_pitch, acctyp *ans, size_t ans_pitch,int *err_flag, const bool eflag, const bool vflag, const int start, const int inum, const int nall){ // ii indexes the two interacting particles in gi int ii=threadIdx.x; __shared__ numtyp sp_lj[4]; __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (ii<4) sp_lj[ii]=special_lj[ii]; if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { int itype=ii/MAX_SHARED_TYPES; int jtype=ii%MAX_SHARED_TYPES; cutsq[ii]=_cutsq_<numtyp>(itype,jtype); form[ii]=_form_(itype,jtype); lj1[ii]=_lj1_<numtyp>(itype,jtype).x; lj2[ii]=_lj1_<numtyp>(itype,jtype).y; if (eflag) { lj3[ii]=_lj3_<numtyp>(itype,jtype).x; lj4[ii]=_lj3_<numtyp>(itype,jtype).y; offset[ii]=_offset_<numtyp>(itype,jtype); } } ii+=INT_MUL(blockIdx.x,blockDim.x)+start; if (ii<inum) { acctyp energy=(numtyp)0; acctyp fx=(numtyp)0; acctyp fy=(numtyp)0; acctyp fz=(numtyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; const int *nbor=dev_nbor+ii; int i=*nbor; nbor+=nbor_pitch; int numj=*nbor; nbor+=nbor_pitch; const int *list=dev_ij+*nbor; const int *list_end=list+numj; numtyp ix=_x_<numtyp>(i,0); numtyp iy=_x_<numtyp>(i,1); numtyp iz=_x_<numtyp>(i,2); int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7)); numtyp factor_lj; for ( ; list<list_end; list++) { int j=*list; if (j < nall) - factor_lj = 1.0; + factor_lj = (numtyp)1.0; else { factor_lj = sp_lj[j/nall]; j %= nall; } int mtype=itype+_x_<numtyp>(j,7); // Compute r12 numtyp delx = ix-_x_<numtyp>(j,0); numtyp dely = iy-_x_<numtyp>(j,1); numtyp delz = iz-_x_<numtyp>(j,2); numtyp r2inv = delx*delx+dely*dely+delz*delz; if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) { r2inv=(numtyp)1.0/r2inv; numtyp r6inv = r2inv*r2inv*r2inv; numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]); fx+=delx*force; fy+=dely*force; fz+=delz*force; if (eflag) { numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]); energy+=factor_lj*(e-offset[mtype]); } if (vflag) { virial[0] += delx*delx*force; virial[1] += 
dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor // Store answers acctyp *ap1=ans+ii; if (eflag) { *ap1+=energy; ap1+=ans_pitch; } if (vflag) { for (int i=0; i<6; i++) { *ap1+=virial[i]; ap1+=ans_pitch; } } *ap1+=fx; ap1+=ans_pitch; *ap1+=fy; ap1+=ans_pitch; *ap1+=fz; } // if ii } #endif diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu index 021cd85e9..aab5ea55a 100644 --- a/lib/gpu/gb_gpu_memory.cu +++ b/lib/gpu/gb_gpu_memory.cu @@ -1,146 +1,138 @@ /*************************************************************************** gb_gpu_memory.cu ------------------- W. Michael Brown Global variables for GPU Gayberne Library __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Thu Jun 25 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ #include "gb_gpu_memory.h" #define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp> template <class numtyp, class acctyp> GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() { this->atom.atom_fields(8); this->atom.ans_fields(13); this->nbor.packing(true); } template <class numtyp, class acctyp> GB_GPU_MemoryT::~GB_GPU_Memory() { clear(); } template <class numtyp, class acctyp> int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int max_nbors, const bool force_d, const int me) { if (this->allocated) clear(); LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon, host_lj1, host_lj2, host_lj3, host_lj4, host_offset, host_special_lj, max_nbors, me); host_form=h_form; // Initialize timers for the selected GPU time_kernel.init(); time_gayberne.init(); time_kernel2.init(); time_gayberne2.init(); // Use the write buffer from atom for data initialization NVC_HostT &host_write=this->atom.host_write; assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); // Allocate, cast and asynchronous memcpy of constant data gamma_upsilon_mu.safe_alloc(3); host_write[0]=static_cast<numtyp>(gamma); host_write[1]=static_cast<numtyp>(upsilon); host_write[2]=static_cast<numtyp>(mu); gamma_upsilon_mu.copy_from_host(host_write.begin()); - lshape.safe_alloc(ntypes); + lshape.safe_alloc(ntypes,lshape_get_texture<numtyp>()); lshape.cast_copy(host_lshape,host_write); lshape.copy_from_host(host_write.begin()); // Copy shape, well, sigma, epsilon, and cutsq onto GPU - shape.safe_alloc(ntypes,3); + shape.safe_alloc(ntypes,3,shape_get_texture<numtyp>()); shape.cast_copy(host_shape[0],host_write); - well.safe_alloc(ntypes,3); + well.safe_alloc(ntypes,3,well_get_texture<numtyp>()); 
well.cast_copy(host_well[0],host_write); // Copy LJ data onto GPU int lj_types=ntypes; if (lj_types<=MAX_SHARED_TYPES) lj_types=MAX_SHARED_TYPES; - form.safe_alloc(lj_types,lj_types); + form.safe_alloc(lj_types,lj_types,form_get_texture()); form.copy_2Dfrom_host(host_form[0],ntypes,ntypes); // See if we want fast GB-sphere or sphere-sphere calculations multiple_forms=false; for (int i=1; i<ntypes; i++) for (int j=i; j<ntypes; j++) if (host_form[i][j]!=ELLIPSE_ELLIPSE) multiple_forms=true; // Memory for ilist ordered by particle type host_olist.safe_alloc_rw(this->max_atoms); - // Bind constant data to textures - lshape_bind_texture<numtyp>(lshape); - shape_bind_texture<numtyp>(shape); - well_bind_texture<numtyp>(well); - form_bind_texture(form); - return this->nbor.host_ij.begin(); } template <class numtyp, class acctyp> void GB_GPU_MemoryT::clear() { if (!this->allocated) return; int err_flag; this->dev_error.copy_to_host(&err_flag); if (err_flag == 1) std::cerr << "COLLISION BUFFER OVERFLOW OCCURRED. INCREASE COLLISION_N " << "and RECOMPILE.\n"; else if (err_flag == 2) std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; LJ_GPU_MemoryT::clear(); - shape_unbind_texture<numtyp>(); - well_unbind_texture<numtyp>(); - form_unbind_texture(); + lshape.unbind(); shape.clear(); well.clear(); form.clear(); lshape.clear(); gamma_upsilon_mu.clear(); host_olist.clear(); } template <class numtyp, class acctyp> double GB_GPU_MemoryT::host_memory_usage() { return this->atom.host_memory_usage(this->max_atoms)+ this->nbor.host_memory_usage()+4*sizeof(numtyp)+ sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int); } template class GB_GPU_Memory<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 8886931c1..4d607ed14 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -1,74 +1,73 @@ /*************************************************************************** gb_gpu_memory.h ------------------- W. Michael Brown Global variables for GPU Gayberne Library __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Thu Jun 25 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License.
----------------------------------------------------------------------- */ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H #define MAX_GPU_THREADS 4 #include "lj_gpu_memory.h" -#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp,acctyp> enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; template <class numtyp, class acctyp> -class GB_GPU_Memory : public LJ_GPU_MemoryT { +class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> { public: GB_GPU_Memory(); ~GB_GPU_Memory(); int* init(const int ij_size, const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int max_nbors, const bool force_d, const int me); void clear(); double host_memory_usage(); // ---------------------------- DATA ---------------------------- // ilist with particles sorted by type NVC_HostI host_olist; // --------------- Const Data for Atoms NVC_ConstMatT shape, well; NVC_ConstMatI form; NVC_VecT lshape, gamma_upsilon_mu; // --------------- Timing Stuff NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2; // True if we want to use fast GB-sphere or sphere-sphere calculations bool multiple_forms; int **host_form; int last_ellipse; private: }; #endif diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu index ea5ba440c..94d826e14 100644 --- a/lib/gpu/lj_gpu.cu +++ b/lib/gpu/lj_gpu.cu @@ -1,234 +1,235 @@ /*************************************************************************** lj_gpu.cu ------------------- W. Michael Brown Lennard-Jones potential GPU calculation __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Aug 4 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License.
----------------------------------------------------------------------- */ #include <iostream> #include <cassert> #include "nvc_macros.h" #include "nvc_timer.h" #include "nvc_device.h" -#include "lj_gpu_memory.h" +#include "pair_gpu_texture.h" +#include "lj_gpu_memory.cu" #include "lj_gpu_kernel.h" using namespace std; static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF; #define LJMT LJ_GPU_Memory<numtyp,acctyp> // --------------------------------------------------------------------------- // Convert something to a string // --------------------------------------------------------------------------- #include <sstream> template <class t> inline string lj_gpu_toa(const t& in) { ostringstream o; o.precision(2); o << in; return o.str(); } // --------------------------------------------------------------------------- // Return string with GPU info // --------------------------------------------------------------------------- string lj_gpu_name(const int id, const int max_nbors) { string name=LJMF.gpu.name(id)+", "+ lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+ lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+ lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+ lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id), max_nbors))+" Atoms"; return name; } // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma, double **epsilon, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int max_nbors, const int gpu_id) { if (LJMF.gpu.num_devices()==0) return 0; ij_size=IJ_SIZE; return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id); } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void lj_gpu_clear() { LJMF.clear(); } // --------------------------------------------------------------------------- // copy atom positions and optionally types to device // --------------------------------------------------------------------------- template <class numtyp, class acctyp> inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x, const int *host_type, const bool rebuild, cudaStream_t &stream) { atom.time_atom.start(); atom.reset_write_buffer(); // First row of dev_x is x position, second is y, third is z atom.add_atom_data(host_x[0],3); atom.add_atom_data(host_x[0]+1,3); atom.add_atom_data(host_x[0]+2,3); int csize=3; // If a rebuild occured, copy type data if (rebuild) { atom.add_atom_data(host_type); csize++; } atom.copy_atom_data(csize,stream); atom.time_atom.stop(); } void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) { _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream); } // --------------------------------------------------------------------------- // Signal that we need to transfer a new neighbor list // --------------------------------------------------------------------------- template <class LJMTyp> bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum, int *ilist, const int *numj) { if (nall>ljm.max_atoms) return false; ljm.nbor.time_nbor.start(); ljm.atom.nall(nall); ljm.atom.inum(inum); 
ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream); ljm.nbor.time_nbor.stop(); return true; } bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, const int *numj) { return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj); } // --------------------------------------------------------------------------- // Copy a set of ij_size ij interactions to device and compute energies, // forces, and torques for those interactions // --------------------------------------------------------------------------- template <class LJMTyp> void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) { ljm.nbor.time_nbor.add_to_total(); // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed ljm.nbor.time_nbor.start(); ljm.nbor.add(num_ij,ljm.pair_stream); ljm.nbor.time_nbor.stop(); } void lj_gpu_nbors(const int num_ij) { _lj_gpu_nbors(LJMF,num_ij); } // --------------------------------------------------------------------------- // Calculate energies and forces for all ij interactions // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){ // Compute the block size and grid size to keep all cores busy const int BX=BLOCK_1D; int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX)); ljm.time_pair.start(); if (ljm.shared_types) kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>> (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, vflag, ljm.atom.inum(), ljm.atom.nall()); else kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>> (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, vflag, ljm.atom.inum(), ljm.atom.nall()); ljm.time_pair.stop(); } void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) { _lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild); } // --------------------------------------------------------------------------- // Get energies and forces to host // --------------------------------------------------------------------------- template<class numtyp, class acctyp> double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial) { double evdw; ljm.atom.time_answer.start(); ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream); ljm.atom.time_atom.add_to_total(); ljm.nbor.time_nbor.add_to_total(); ljm.time_pair.add_to_total(); // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial); ljm.atom.add_forces(ilist,f); ljm.atom.time_answer.stop(); ljm.atom.time_answer.add_to_total(); return evdw; } double lj_gpu_forces(double **f, const int *ilist, const bool eflag, const bool vflag, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial) { return _lj_gpu_forces<PRECISION,ACC_PRECISION> (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial); } void lj_gpu_time() { cout.precision(4); cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n"; cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n"; cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n"; cout << "Answer copy: " << 
LJMF.atom.time_answer.total_seconds() << " s.\n"; } int lj_gpu_num_devices() { return LJMF.gpu.num_devices(); } double lj_gpu_bytes() { return LJMF.host_memory_usage(); } diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu index 484e28e86..5e8bf4cdc 100644 --- a/lib/gpu/lj_gpu_memory.cu +++ b/lib/gpu/lj_gpu_memory.cu @@ -1,167 +1,152 @@ /*************************************************************************** lj_gpu_memory.cu ------------------- W. Michael Brown Global variables for GPU Lennard-Jones Library __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Aug 4 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ #include "lj_gpu_memory.h" #define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp> template <class numtyp, class acctyp> int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors); } template <class numtyp, class acctyp> int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) { int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/ bytes_per_atom(max_nbors)); if (matoms>MAX_ATOMS) matoms=MAX_ATOMS; return matoms; } template <class numtyp, class acctyp> int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes, double **host_cutsq, double **host_sigma, double **host_epsilon, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int max_nbors, const int me) { if (allocated) clear(); if (me>=gpu.num_devices()) return 0; gpu.set(me); if (gpu.revision()<1.0) return 0; // Initialize timers for the selected GPU time_pair.init(); // Initialize atom and nbor data max_atoms=get_max_atoms(gpu.bytes(),max_nbors); atom.init(max_atoms); nbor.init(ij_size,max_atoms,max_nbors); // Get a stream for computing pair potentials CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream)); // Use the write buffer from atom for data initialization NVC_HostT &host_write=atom.host_write; assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2); // Copy data for bonded interactions special_lj.safe_alloc(4); special_lj.cast_copy(host_special_lj,host_write); // Copy sigma, epsilon, and cutsq onto GPU - sigma.safe_alloc(ntypes,ntypes); + sigma.safe_alloc(ntypes,ntypes,sigma_get_texture<numtyp>()); sigma.cast_copy(host_sigma[0],host_write); - epsilon.safe_alloc(ntypes,ntypes); + epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture<numtyp>()); epsilon.cast_copy(host_epsilon[0],host_write); - cutsq.safe_alloc(ntypes,ntypes); + cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture<numtyp>()); cutsq.cast_copy(host_cutsq[0],host_write); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; if (lj_types<=MAX_SHARED_TYPES) { lj_types=MAX_SHARED_TYPES; shared_types=true; } - offset.safe_alloc(lj_types,lj_types); + 
offset.safe_alloc(lj_types,lj_types,offset_get_texture<numtyp>()); offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes); double *t1=host_lj1[0]; double *t2=host_lj2[0]; - for (int i=0; i<lj_types*lj_types; i++) { + for (int i=0; i<ntypes*ntypes; i++) { host_write[i*2]=t1[i]; host_write[i*2+1]=t2[i]; } - lj1.safe_alloc(lj_types,lj_types); - lj1.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()), + lj1.safe_alloc(lj_types,lj_types,lj1_get_texture<numtyp>()); + lj1.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()), ntypes,ntypes); t1=host_lj3[0]; t2=host_lj4[0]; - for (int i=0; i<lj_types*lj_types; i++) { + for (int i=0; i<ntypes*ntypes; i++) { host_write[i*2]=t1[i]; host_write[i*2+1]=t2[i]; } - lj3.safe_alloc(lj_types,lj_types); - lj3.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()), + lj3.safe_alloc(lj_types,lj_types,lj3_get_texture<numtyp>()); + lj3.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()), ntypes,ntypes); - // Bind constant data to textures - sigma_bind_texture<numtyp>(sigma); - epsilon_bind_texture<numtyp>(epsilon); - cutsq_bind_texture<numtyp>(cutsq); - offset_bind_texture<numtyp>(offset); - lj1_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj1); - lj3_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj3); - dev_error.safe_alloc(1); dev_error.zero(); allocated=true; return nbor.host_ij.begin(); } template <class numtyp, class acctyp> void LJ_GPU_MemoryT::clear() { if (!allocated) return; allocated=false; // Check for any pair style specific errors here int err_flag; dev_error.copy_to_host(&err_flag); atom.clear(); nbor.clear(); - sigma_unbind_texture<numtyp>(); - epsilon_unbind_texture<numtyp>(); - cutsq_unbind_texture<numtyp>(); - offset_unbind_texture<numtyp>(); - lj1_unbind_texture<typename cu_vec_traits<numtyp>::vec2>(); - lj3_unbind_texture<typename cu_vec_traits<numtyp>::vec2>(); - CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream)); dev_error.clear(); sigma.clear(); epsilon.clear(); special_lj.clear(); cutsq.clear(); offset.clear(); lj1.clear(); lj3.clear(); } template <class numtyp, class acctyp> double LJ_GPU_MemoryT::host_memory_usage() const { return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+ sizeof(LJ_GPU_Memory<numtyp,acctyp>); } template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h index d6f3e1cf1..82a87ea6a 100644 --- a/lib/gpu/lj_gpu_memory.h +++ b/lib/gpu/lj_gpu_memory.h @@ -1,88 +1,89 @@ /*************************************************************************** lj_gpu_memory.h ------------------- W. Michael Brown Global variables for GPU Lennard-Jones Library __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Aug 4 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. 
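A small host-side sketch (illustration only; it mirrors the lj1/lj3 packing in LJ_GPU_Memory::init()) of what the hunks above do: lj1 and lj2 are interleaved into one table of paired values so a single texture fetch returns both coefficients, and the loop now runs over ntypes*ntypes because only that many host values exist; iterating lj_types*lj_types would read past the host arrays whenever the device table is padded up to MAX_SHARED_TYPES.

// Hypothetical helper; types follow the patch (NVC_Host is the page-locked
// host write buffer, numtyp is float or double).
template <class numtyp>
void pack_lj12(NVC_Host<numtyp> &host_write,
               const double *t1, const double *t2, const int ntypes) {
  for (int i = 0; i < ntypes * ntypes; i++) {
    host_write[i * 2]     = static_cast<numtyp>(t1[i]);  // lj1 coefficient
    host_write[i * 2 + 1] = static_cast<numtyp>(t2[i]);  // lj2 coefficient
  }
  // The buffer is then reinterpreted as nvc_vec_traits<numtyp>::vec2*
  // (float2 or double2) and uploaded via lj1.copy_2Dfrom_host(..., ntypes, ntypes).
}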
----------------------------------------------------------------------- */ #ifndef LJ_GPU_MEMORY_H #define LJ_GPU_MEMORY_H #include "nvc_device.h" +#include "nvc_traits.h" #include "pair_gpu_atom.h" #include "pair_gpu_nbor.h" #define BLOCK_1D 64 #define MAX_SHARED_TYPES 8 #define PERCENT_GPU_MEMORY 0.7 template <class numtyp, class acctyp> class LJ_GPU_Memory { public: LJ_GPU_Memory() : allocated(false) {} ~LJ_GPU_Memory() { clear(); } /// Allocate memory on host and device int* init(const int ij_size, const int ntypes, double **host_cutsq, double **host_sigma, double **host_epsilon, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int max_nbors, const int me); /// Free any memory on host and device void clear(); /// Returns memory usage on GPU per atom int bytes_per_atom(const int max_nbors) const; /// Maximum number of atoms that can be stored on GPU int get_max_atoms(const size_t gpu_bytes, const int max_nbors); /// Total host memory used by library double host_memory_usage() const; // ------------------------- DATA ----------------------------- // Device Properties NVCDevice gpu; // Device Error Flag NVC_VecI dev_error; // Stream for asynchronous work cudaStream_t pair_stream; // Atom Data PairGPUAtom<numtyp,acctyp> atom; // Neighbor Data PairGPUNbor nbor; // --------------- Const Data for Atoms NVC_ConstMatT sigma, epsilon, cutsq, offset; - NVC_ConstMat< typename cu_vec_traits<numtyp>::vec2 > lj1, lj3; + NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3; NVC_VecT special_lj; size_t max_atoms; // Timing for pair calculation NVCTimer time_pair; // If atom type constants fit in shared memory, use fast kernels bool shared_types; protected: bool allocated; }; #endif diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h index a83178985..0ff5229c0 100644 --- a/lib/gpu/nvc_memory.h +++ b/lib/gpu/nvc_memory.h @@ -1,447 +1,475 @@ /*************************************************************************** nvc_memory.h ------------------- W. Michael Brown Routines for memory management on CUDA devices __________________________________________________________________________ This file is part of the NVC Library __________________________________________________________________________ begin : Thu Jun 25 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. 
----------------------------------------------------------------------- */ #ifndef NVC_MEMORY_H #define NVC_MEMORY_H #include <iostream> +#include "nvc_macros.h" #define NVC_HostT NVC_Host<numtyp> #define NVC_HostD NVC_Host<double> #define NVC_HostS NVC_Host<float> #define NVC_HostI NVC_Host<int> #define NVC_VecT NVC_Vec<numtyp> #define NVC_VecD NVC_Vec<double> #define NVC_VecS NVC_Vec<float> #define NVC_VecI NVC_Vec<int> #define NVC_VecI2 NVC_Vec<int2> #define NVC_VecU2 NVC_Vec<uint2> #define NVC_MatT NVC_Mat<numtyp> #define NVC_MatD NVC_Mat<double> #define NVC_MatS NVC_Mat<float> #define NVC_MatI NVC_Mat<int> #define NVC_ConstMatT NVC_ConstMat<numtyp> #define NVC_ConstMatD NVC_ConstMat<double> #define NVC_ConstMatS NVC_ConstMat<float> #define NVC_ConstMatI NVC_ConstMat<int> #define NVC_ConstMatD2 NVC_ConstMat<double2> namespace NVC { // Get a channel for float array template <class numtyp> inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) { channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); } // Get a channel for float2 array template <> inline void cuda_gb_get_channel<float2>(cudaChannelFormatDesc &channel) { channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); } // Get a channel for double array template <> inline void cuda_gb_get_channel<double>(cudaChannelFormatDesc &channel) { channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); } // Get a channel for double array template <> inline void cuda_gb_get_channel<double2>(cudaChannelFormatDesc &channel) { channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned); } // Get a channel for int array template <> inline void cuda_gb_get_channel<int>(cudaChannelFormatDesc &channel) { channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned); } } /// Page-locked Row Vector on Host template <class numtyp> class NVC_Host { public: NVC_Host() { _cols=0; } ~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); } // Allocate page-locked memory with fast write/slow read on host inline void safe_alloc_w(const size_t cols) { _cols=cols; _row_bytes=cols*sizeof(numtyp); CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes, cudaHostAllocWriteCombined)); _end=_array+cols; } // Allocate page-locked memory with fast read/write on host inline void safe_alloc_rw(const size_t cols) { _cols=cols; _row_bytes=cols*sizeof(numtyp); CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes)); _end=_array+cols; } /// Free any memory associated with device inline void clear() { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } } /// Set each element to zero inline void zero() { memset(_array,0,row_bytes()); } /// Set first n elements to zero inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); } inline numtyp * begin() { return _array; } inline const numtyp * begin() const { return _array; } inline numtyp * end() { return _end; } inline const numtyp * end() const { return _end; } inline size_t numel() const { return _cols; } inline size_t rows() const { return 1; } inline size_t cols() const { return _cols; } inline size_t row_size() const { return _cols; } inline size_t row_bytes() const { return _row_bytes; } inline numtyp & operator[](const int i) { return _array[i]; } inline const numtyp & operator[](const int i) const { return _array[i]; } /// Copy from device (numel is not bytes) inline void copy_from_device(const numtyp *device_p, size_t numel) { 
CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp), cudaMemcpyDeviceToHost)); } /// Copy to device (numel is not bytes) inline void copy_to_device(numtyp *device_p, size_t numel) { CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp), cudaMemcpyHostToDevice)); } /// Copy to 2D matrix on device (numel is not bytes) inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, const size_t rows, const size_t cols) { CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp), _array,cols*sizeof(numtyp), cols*sizeof(numtyp),rows, cudaMemcpyHostToDevice)); } /// Asynchronous copy from device (numel is not bytes) inline void copy_from_device(const numtyp *device_p, size_t numel, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp), cudaMemcpyDeviceToHost,stream)); } /// Asynchronous copy to device (numel is not bytes) inline void copy_to_device(numtyp *device_p, size_t numel, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp), cudaMemcpyHostToDevice,stream)); } /// Asynchronous copy to 2D matrix on device (numel is not bytes) inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size, const size_t rows, const size_t cols, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp), _array,cols*sizeof(numtyp), cols*sizeof(numtyp),rows, cudaMemcpyHostToDevice,stream)); } private: numtyp *_array, *_end; size_t _row_bytes, _row_size, _rows, _cols; }; /// Row vector on device template <class numtyp> class NVC_Vec { public: NVC_Vec() { _cols=0; } ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); } // Row vector on device inline void safe_alloc(const size_t cols) { _cols=cols; _row_bytes=cols*sizeof(numtyp); CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes)); _end=_array+cols; } + // Row vector on device (allocate and assign texture and bind) + inline void safe_alloc(const size_t cols, textureReference *t) + { safe_alloc(cols); assign_texture(t); bind(); } + /// Free any memory associated with device inline void clear() { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } } /// Set each element to zero inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); } inline numtyp * begin() { return _array; } inline const numtyp * begin() const { return _array; } inline numtyp * end() { return _end; } inline const numtyp * end() const { return _end; } inline size_t numel() const { return _cols; } inline size_t rows() const { return 1; } inline size_t cols() const { return _cols; } inline size_t row_size() const { return _cols; } inline size_t row_bytes() const { return _row_bytes; } /// Copy from host inline void copy_from_host(const numtyp *host_p) { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(), cudaMemcpyHostToDevice)); } /// Asynchronous copy from host inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(), cudaMemcpyHostToDevice, stream)); } /// Copy to host inline void copy_to_host(numtyp *host_p) { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(), cudaMemcpyDeviceToHost)); } /// Copy n elements to host inline void copy_to_host(numtyp *host_p, const int n) { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp), cudaMemcpyDeviceToHost)); } /// Cast and then copy to device template <class numtyp2> inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { for (int i=0; i<numel(); i++) 
host_write[i]=static_cast<numtyp>(buffer[i]); copy_from_host(host_write.begin()); } + /// Assign a texture to matrix + inline void assign_texture(textureReference *t) { _tex_ptr=t; } + /// Bind to texture - template <class texture> - inline void bind_texture(texture &texi, cudaChannelFormatDesc &channel) { - NVC::cuda_gb_get_channel<numtyp>(channel); - texi.addressMode[0] = cudaAddressModeClamp; - texi.addressMode[1] = cudaAddressModeClamp; - texi.filterMode = cudaFilterModePoint; - texi.normalized = false; - CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel)); + inline void bind() { + NVC::cuda_gb_get_channel<numtyp>(_channel); + (*_tex_ptr).addressMode[0] = cudaAddressModeClamp; + (*_tex_ptr).addressMode[1] = cudaAddressModeClamp; + (*_tex_ptr).filterMode = cudaFilterModePoint; + (*_tex_ptr).normalized = false; + CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel)); } + /// Unbind texture + inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); } + /// Output the vector (debugging) inline void print(std::ostream &out) { print (out, numel()); } // Output first n elements of vector inline void print(std::ostream &out, const int n) { numtyp *t=new numtyp[n]; copy_to_host(t,n); for (int i=0; i<n; i++) out << t[i] << " "; delete []t; } private: numtyp *_array, *_end; size_t _row_bytes, _row_size, _rows, _cols; + cudaChannelFormatDesc _channel; + textureReference *_tex_ptr; }; /// 2D Matrix on device (can have extra column storage to get correct alignment) template <class numtyp> class NVC_Mat { public: NVC_Mat() { _rows=0; } ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); } // Row major matrix on device // - Coalesced access using adjacent cols on same row // - NVC_Mat(row,col) given by array[row*row_size()+col] inline void safe_alloc(const size_t rows, const size_t cols) { _rows=rows; _cols=cols; CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch, cols*sizeof(numtyp),rows)); _row_size=_pitch/sizeof(numtyp); _end=_array+_row_size*cols; } /// Free any memory associated with device inline void clear() { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } } /// Set each element to zero inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); } inline numtyp * begin() { return _array; } inline const numtyp * begin() const { return _array; } inline numtyp * end() { return _end; } inline const numtyp * end() const { return _end; } inline size_t numel() const { return _cols*_rows; } inline size_t rows() const { return _rows; } inline size_t cols() const { return _cols; } inline size_t row_size() const { return _row_size; } inline size_t row_bytes() const { return _pitch; } /// Copy from host (elements not bytes) inline void copy_from_host(const numtyp *host_p, const size_t numel) { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp), cudaMemcpyHostToDevice)); } /// Asynchronous copy from host (elements not bytes) inline void copy_from_host(const numtyp *host_p, const size_t numel, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp), cudaMemcpyHostToDevice, stream)); } /// Asynchronous Copy from Host /** \note Used when the number of columns/rows allocated on host smaller than * on device **/ inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, const size_t cols, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp), cols*sizeof(numtyp), rows, cudaMemcpyHostToDevice,stream)); } private: numtyp *_array, *_end; size_t 
_pitch, _row_size, _rows, _cols; }; /// Const 2D Matrix on device (requires texture binding) template <class numtyp> class NVC_ConstMat { public: NVC_ConstMat() { _rows=0; } ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); } - + + /// Assign a texture to matrix + inline void assign_texture(textureReference *t) { _tex_ptr=t; } + /// Row major matrix on device inline void safe_alloc(const size_t rows, const size_t cols) { _rows=rows; _cols=cols; NVC::cuda_gb_get_channel<numtyp>(_channel); CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows)); } + /// Row major matrix on device (Allocate and bind texture) + inline void safe_alloc(const size_t rows, const size_t cols, + textureReference *t) + { safe_alloc(rows,cols); assign_texture(t); bind(); } + /// Bind to texture - template <class texture> - inline void bind_texture(texture &texi) { - texi.addressMode[0] = cudaAddressModeClamp; - texi.addressMode[1] = cudaAddressModeClamp; - texi.filterMode = cudaFilterModePoint; - texi.normalized = false; - CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel)); + inline void bind() { + (*_tex_ptr).addressMode[0] = cudaAddressModeClamp; + (*_tex_ptr).addressMode[1] = cudaAddressModeClamp; + (*_tex_ptr).filterMode = cudaFilterModePoint; + (*_tex_ptr).normalized = false; + CUDA_SAFE_CALL(cudaBindTextureToArray(_tex_ptr,_array,&_channel)); } - /// Free any memory associated with device - inline void clear() - { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } } + /// Unbind texture + inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); } + + /// Free any memory associated with device and unbind + inline void clear() { + if (_rows>0) { + _rows=0; + CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); + CUDA_SAFE_CALL(cudaFreeArray(_array)); + } + } inline size_t numel() const { return _cols*_rows; } inline size_t rows() const { return _rows; } inline size_t cols() const { return _cols; } inline size_t row_size() const { return _cols; } inline size_t row_bytes() const { return _cols*sizeof(numtyp); } /// Copy from Host inline void copy_from_host(const numtyp *host_p) { CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p, numel()*sizeof(numtyp), cudaMemcpyHostToDevice)); } /// Copy from Host /** \note Used when the number of columns/rows allocated on host smaller than * on device **/ inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, const size_t cols) { CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p, cols*sizeof(numtyp), cols*sizeof(numtyp), rows, cudaMemcpyHostToDevice)); } /// Asynchronous Copy from Host inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p, numel()*sizeof(numtyp), cudaMemcpyHostToDevice,stream)); } /// Asynchronous Copy from Host /** \note Used when the number of columns/rows allocated on host smaller than * on device **/ inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows, const size_t cols, cudaStream_t &stream) { CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p, cols*sizeof(numtyp), cols*sizeof(numtyp), rows, cudaMemcpyHostToDevice,stream)); } /// Cast buffer to numtyp in host_write and copy to array template <class numtyp2> inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) { int n=numel(); for (int i=0; i<n; i++) { host_write[i]=static_cast<numtyp>(*buffer); buffer++; } copy_from_host(host_write.begin()); } /// Cast buffer to numtyp in host_write and copy to array /** \note 
Used when the number of columns/rows allocated on host smaller than * on device **/ template <class numtyp2> inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write, const size_t rows, const size_t cols) { int n=rows*cols; for (int i=0; i<n; i++) { host_write[i]=static_cast<numtyp>(*buffer); buffer++; } copy_2Dfrom_host(host_write.begin(),rows,cols); } /// Cast buffer to numtyp in host_write and copy to array asynchronously template <class numtyp2> inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write, cudaStream_t &stream) { int n=numel(); for (int i=0; i<n; i++) { host_write[i]=static_cast<numtyp>(*buffer); buffer++; } copy_from_host(host_write.begin(),stream); } private: size_t _rows, _cols; cudaArray *_array; cudaChannelFormatDesc _channel; + textureReference *_tex_ptr; }; #endif diff --git a/lib/gpu/nvc_traits.h b/lib/gpu/nvc_traits.h new file mode 100644 index 000000000..eee92b702 --- /dev/null +++ b/lib/gpu/nvc_traits.h @@ -0,0 +1,31 @@ +/*************************************************************************** + nvc_texture_traits.h + ------------------- + W. Michael Brown + + Tricks for templating textures + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Jun 23 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#ifndef NVC_TEXTURE_TRAITS_H +#define NVC_TEXTURE_TRAITS_H + +template <class numtyp> class nvc_vec_traits; +template <> class nvc_vec_traits<float> { public: typedef float2 vec2; }; +template <> class nvc_vec_traits<double> { public: typedef double2 vec2; }; + +#endif diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu index 94617a5c1..88b24e741 100644 --- a/lib/gpu/pair_gpu_atom.cu +++ b/lib/gpu/pair_gpu_atom.cu @@ -1,173 +1,174 @@ /*************************************************************************** pair_gpu_atom.cu ------------------- W. Michael Brown Memory routines for moving atom and force data between host and gpu __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Aug 4 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. 
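The new nvc_traits.h above carries the whole templating trick: a tiny trait that lets templated code name "the paired type that belongs with numtyp" without hard-coding float2 or double2, and it is how lj1 and lj3 get their element type (NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 >) in lj_gpu_memory.h. A minimal usage sketch with a hypothetical helper:

#include "nvc_traits.h"

// Yields a float2 when numtyp is float and a double2 when numtyp is double.
template <class numtyp>
inline typename nvc_vec_traits<numtyp>::vec2 make_vec2(const numtyp a,
                                                       const numtyp b) {
  typename nvc_vec_traits<numtyp>::vec2 v;
  v.x = a;
  v.y = b;
  return v;
}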
----------------------------------------------------------------------- */ +#include "pair_gpu_texture.h" #include "pair_gpu_atom.h" + #define PairGPUAtomT PairGPUAtom<numtyp,acctyp> template <class numtyp, class acctyp> int PairGPUAtomT::bytes_per_atom() const { return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp); } template <class numtyp, class acctyp> void PairGPUAtomT::init(const int max_atoms) { if (allocated) clear(); _max_atoms=max_atoms; // Initialize timers for the selected GPU time_atom.init(); time_answer.init(); // Device matrices for atom and force data - dev_x.safe_alloc(atom_fields(),max_atoms); - x_bind_texture<numtyp>(dev_x); + dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>()); ans.safe_alloc(ans_fields(),max_atoms); // Get a host write only buffer host_write.safe_alloc_w(max_atoms*4); // Get a host read/write buffer host_read.safe_alloc_rw(ans.row_size()*ans_fields()); allocated=true; } template <class numtyp, class acctyp> void PairGPUAtomT::clear() { if (!allocated) return; allocated=false; - x_unbind_texture<numtyp>(); + dev_x.unbind(); ans.clear(); host_write.clear(); host_read.clear(); dev_x.clear(); } template <class numtyp, class acctyp> double PairGPUAtomT::host_memory_usage(const int max_atoms) const { return max_atoms*atom_fields()*sizeof(numtyp)+ ans_fields()*(max_atoms)*sizeof(acctyp)+ sizeof(PairGPUAtom<numtyp,acctyp>); } template <class numtyp, class acctyp> void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, cudaStream_t &s) { _eflag=eflag; _vflag=vflag; int csize=ans_fields(); if (!eflag) csize--; if (!vflag) csize-=6; host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s); } template <class numtyp, class acctyp> double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial) { double evdwl=0.0; int gap=ans.row_size()-_inum; acctyp *ap=host_read.begin(); if (_eflag) { if (eflag_atom) { for (int i=0; i<_inum; i++) { evdwl+=*ap; eatom[ilist[i]]+=*ap*0.5; ap++; } } else for (int i=0; i<_inum; i++) { evdwl+=*ap; ap++; } ap+=gap; evdwl*=0.5; } _read_loc=ap; gap=ans.row_size(); if (_vflag) { if (vflag_atom) { for (int ii=0; ii<_inum; ii++) { int i=ilist[ii]; ap=_read_loc+ii; for (int j=0; j<6; j++) { vatom[i][j]+=*ap*0.5; virial[j]+=*ap; ap+=gap; } } } else { for (int ii=0; ii<_inum; ii++) { ap=_read_loc+ii; for (int j=0; j<6; j++) { virial[j]+=*ap; ap+=gap; } } } for (int j=0; j<6; j++) virial[j]*=0.5; _read_loc+=gap*6; } return evdwl; } template <class numtyp, class acctyp> void PairGPUAtomT::add_forces(const int *ilist, double **f) { int gap=ans.row_size(); for (int ii=0; ii<_inum; ii++) { acctyp *ap=_read_loc+ii; int i=ilist[ii]; f[i][0]+=*ap; ap+=gap; f[i][1]+=*ap; ap+=gap; f[i][2]+=*ap; } } template <class numtyp, class acctyp> void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) { int gap=ans.row_size(); _read_loc+=gap*3; for (int ii=0; ii<n; ii++) { acctyp *ap=_read_loc+ii; int i=ilist[ii]; tor[i][0]+=*ap; ap+=gap; tor[i][1]+=*ap; ap+=gap; tor[i][2]+=*ap; } } template class PairGPUAtom<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 18acafea3..01f450ffc 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -1,153 +1,153 @@ /*************************************************************************** pair_gpu_atom.h ------------------- W. 
Michael Brown Memory routines for moving atom and force data between host and gpu __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Aug 4 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ #ifndef PAIR_GPU_ATOM_H #define PAIR_GPU_ATOM_H // PRECISION - Precision for rsq, energy, force, and torque calculation // ACC_PRECISION - Precision for accumulation of energies, forces, and torques #ifdef _SINGLE_DOUBLE #define PRECISION float #define ACC_PRECISION double #define MAX_ATOMS 65536 #endif #ifdef _DOUBLE_DOUBLE #define PRECISION double #define ACC_PRECISION double #define MAX_ATOMS 32768 #endif #ifndef PRECISION #define PRECISION float -#define ACC_PRECISION double +#define ACC_PRECISION float #define MAX_ATOMS 65536 #endif #include "nvc_timer.h" -#include "pair_gpu_texture.h" +#include "nvc_memory.h" template <class numtyp, class acctyp> class PairGPUAtom { public: PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {} ~PairGPUAtom() { clear(); } // Accessors inline int atom_fields() const { return _atom_fields; } inline int ans_fields() const { return _ans_fields; } inline int max_atoms() const { return _max_atoms; } inline int nall() const { return _nall; } inline int inum() const { return _inum; } /// Set number of atoms for future copy operations inline void nall(const int n) { _nall=n; } /// Set number of inum for future copy operations inline void inum(const int n) { _inum=n; } /// Set the number of atom fields (x, y, z, type, etc) inline void atom_fields(const int n) { _atom_fields=n; } /// Set the number of answer fields (energy, virial, force, etc.) 
inline void ans_fields(const int n) { _ans_fields=n; } /// Memory usage per atom in this class /** \note atom_fields and ans_fields should be set for correct answer **/ int bytes_per_atom() const; /// Must be called once to allocate host and device memory /** \note atom_fields and ans_fields should be set first if not default **/ void init(const int max_atoms); /// Free all memory on host and device void clear(); /// Return the total amount of host memory used by class double host_memory_usage(const int max_atoms) const; // -------------------------COPY TO GPU ---------------------------------- /// Reset the write buffer pointer (Start copying new atom data) inline void reset_write_buffer() { _write_loc=host_write.begin(); } /// Add a row to write buffer with unit stride /** Copies nall() elements **/ template<class cpytyp> inline void add_atom_data(const cpytyp *host_ptr) { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } } /// Add a row to write buffer with non-unit stride /** Copies nall() elements **/ template<class cpytyp> inline void add_atom_data(const cpytyp *hostptr, const int stride) { int t=_nall*stride; for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; } } /// Copy num_rows x nall() write buffer to x in GPU /** num_rows<=atom_fields() **/ inline void copy_atom_data(const int num_rows, cudaStream_t &stream) { dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); } // -------------------------COPY FROM GPU ------------------------------- /// Copy answers from GPU into read buffer void copy_answers(const bool eflag, const bool vflag, cudaStream_t &s); /// Copy energy and virial data into LAMMPS memory double energy_virial(const int *ilist, const bool eflag_atom, const bool vflag_atom, double *eatom, double **vatom, double *virial); /// Add forces from the GPU into a LAMMPS pointer void add_forces(const int *ilist, double **f); /// Add torques from the GPU into a LAMMPS pointer void add_torques(const int *ilist, double **tor, const int n); // ------------------------------ DATA ---------------------------------- // atom_fields() x n (example: rows 1-3 position, 4 is type) NVC_ConstMatT dev_x; // ans_fields() x n Storage for Forces, etc. // example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force // example: if (!eflag) 1-3 is force NVC_Mat<acctyp> ans; // Buffer for moving floating point data to GPU NVC_HostT host_write; // Buffer for moving floating point data to CPU NVC_Host<acctyp> host_read; // Timing Stuff NVCTimer time_atom, time_answer; private: bool allocated, _eflag, _vflag; int _atom_fields, _ans_fields; int _max_atoms, _nall, _inum; numtyp * _write_loc; acctyp * _read_loc; }; #endif diff --git a/lib/gpu/pair_gpu_texture.h b/lib/gpu/pair_gpu_texture.h index 76bac7136..e647dda8b 100644 --- a/lib/gpu/pair_gpu_texture.h +++ b/lib/gpu/pair_gpu_texture.h @@ -1,300 +1,316 @@ /*************************************************************************** pair_gpu_texture.h ------------------- W. Michael Brown Tricks for templating textures __________________________________________________________________________ This file is part of the LAMMPS GPU Library __________________________________________________________________________ begin : Tue Jun 23 2009 copyright : (C) 2009 by W. Michael Brown email : wmbrown@sandia.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. 
Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. ----------------------------------------------------------------------- */ +#include "nvc_traits.h" #include "nvc_memory.h" #ifndef PAIR_GPU_TEXTURE_H #define PAIR_GPU_TEXTURE_H #ifdef _SINGLE_DOUBLE #define GB_GPU_DOUBLE #endif #ifdef _DOUBLE_DOUBLE #define GB_GPU_DOUBLE #endif -template <class numtyp> class cu_vec_traits; -template <> class cu_vec_traits<float> { public: typedef float2 vec2; }; -template <> class cu_vec_traits<double> { public: typedef double2 vec2; }; - // ------------------------------- x ------------------------------------ static texture<float, 2, cudaReadModeElementType> x_float_tex; static texture<int2, 2, cudaReadModeElementType> x_double_tex; -template <class numtyp> inline void x_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(x_float_tex); } - -template <> inline void x_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(x_double_tex); } -template <class numtyp> inline void x_unbind_texture() - { cudaUnbindTexture(x_float_tex); } -template <> inline void x_unbind_texture<double>() - { cudaUnbindTexture(x_double_tex); } +template <class numtyp> inline textureReference * x_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"x_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * x_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"x_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _x_(const int i, const int j) { return tex2D(x_float_tex,i,j); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _x_<double>(const int i,const int j) { int2 t=tex2D(x_double_tex,i,j); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- form ------------------------------------ static texture<int, 2, cudaReadModeElementType> form_tex; -inline void form_bind_texture(NVC_ConstMatI &mat) - { mat.bind_texture(form_tex); } -inline void form_unbind_texture() - { cudaUnbindTexture(form_tex); } +inline textureReference * form_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"form_tex"); + return const_cast<textureReference *>(ptr); +} static __inline__ __device__ int _form_(const int i, const int j) { return tex2D(form_tex,i,j); } // ------------------------------- lshape ------------------------------------ static texture<float, 1, cudaReadModeElementType> lshape_float_tex; static texture<int2, 1, cudaReadModeElementType> lshape_double_tex; -static cudaChannelFormatDesc channel_lshape; -template <class numtyp> inline void lshape_bind_texture(NVC_VecT &vec) - { vec.bind_texture(lshape_float_tex,channel_lshape); } -template <> inline void lshape_bind_texture<double>(NVC_VecD &vec) - { vec.bind_texture(lshape_double_tex,channel_lshape); } -template <class numtyp> inline void lshape_unbind_texture() - { cudaUnbindTexture(lshape_float_tex); } -template <> inline void lshape_unbind_texture<double>() - { cudaUnbindTexture(lshape_double_tex); } +template <class numtyp> inline textureReference * lshape_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"lshape_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * lshape_get_texture<double>() { + const 
textureReference *ptr; + cudaGetTextureReference(&ptr,"lshape_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _lshape_(const int i) { return tex1Dfetch(lshape_float_tex,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _lshape_<double>(const int i) { int2 t=tex1Dfetch(lshape_double_tex,i); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- shape ------------------------------------ static texture<float, 2, cudaReadModeElementType> shape_float_tex; static texture<int2, 2, cudaReadModeElementType> shape_double_tex; -template <class numtyp> inline void shape_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(shape_float_tex); } -template <> inline void shape_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(shape_double_tex); } -template <class numtyp> inline void shape_unbind_texture() - { cudaUnbindTexture(shape_float_tex); } -template <> inline void shape_unbind_texture<double>() - { cudaUnbindTexture(shape_double_tex); } +template <class numtyp> inline textureReference * shape_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"shape_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * shape_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"shape_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _shape_(const int i, const int j) { return tex2D(shape_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _shape_<double>(const int i, const int j) { int2 t=tex2D(shape_double_tex,j,i); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- well ------------------------------------ static texture<float, 2, cudaReadModeElementType> well_float_tex; static texture<int2, 2, cudaReadModeElementType> well_double_tex; -template <class numtyp> inline void well_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(well_float_tex); } -template <> inline void well_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(well_double_tex); } -template <class numtyp> inline void well_unbind_texture() - { cudaUnbindTexture(well_float_tex); } -template <> inline void well_unbind_texture<double>() - { cudaUnbindTexture(well_double_tex); } +template <class numtyp> inline textureReference * well_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"well_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * well_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"well_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _well_(const int i, const int j) { return tex2D(well_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _well_<double>(const int i,const int j) { int2 t=tex2D(well_double_tex,j,i); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- sigma ------------------------------------ static texture<float, 2, cudaReadModeElementType> sigma_float_tex; static texture<int2, 2, cudaReadModeElementType> sigma_double_tex; -template <class numtyp> inline void sigma_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(sigma_float_tex); } -template <> inline void 
sigma_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(sigma_double_tex); } -template <class numtyp> inline void sigma_unbind_texture() - { cudaUnbindTexture(sigma_float_tex); } -template <> inline void sigma_unbind_texture<double>() - { cudaUnbindTexture(sigma_double_tex); } +template <class numtyp> inline textureReference * sigma_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"sigma_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * sigma_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"sigma_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _sigma_(const int i, const int j) { return tex2D(sigma_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _sigma_<double>(const int i,const int j) { int2 t=tex2D(sigma_double_tex,j,i); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- epsilon ------------------------------------ static texture<float, 2, cudaReadModeElementType> epsilon_float_tex; static texture<int2, 2, cudaReadModeElementType> epsilon_double_tex; -template <class numtyp> inline void epsilon_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(epsilon_float_tex); } -template <> inline void epsilon_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(epsilon_double_tex); } -template <class numtyp> inline void epsilon_unbind_texture() - { cudaUnbindTexture(epsilon_float_tex); } -template <> inline void epsilon_unbind_texture<double>() - { cudaUnbindTexture(epsilon_double_tex); } +template <class numtyp> inline textureReference * epsilon_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"epsilon_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * epsilon_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"epsilon_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _epsilon_(const int i, const int j) { return tex2D(epsilon_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _epsilon_<double>(const int i,const int j) { int2 t=tex2D(epsilon_double_tex,j,i); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- cutsq ------------------------------------ static texture<float, 2, cudaReadModeElementType> cutsq_float_tex; static texture<int2, 2, cudaReadModeElementType> cutsq_double_tex; -template <class numtyp> inline void cutsq_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(cutsq_float_tex); } -template <> inline void cutsq_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(cutsq_double_tex); } -template <class numtyp> inline void cutsq_unbind_texture() - { cudaUnbindTexture(cutsq_float_tex); } -template <> inline void cutsq_unbind_texture<double>() - { cudaUnbindTexture(cutsq_double_tex); } +template <class numtyp> inline textureReference * cutsq_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"cutsq_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * cutsq_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"cutsq_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _cutsq_(const 
int i, const int j) { return tex2D(cutsq_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _cutsq_<double>(const int i,const int j) { int2 t=tex2D(cutsq_double_tex,j,i); return __hiloint2double(t.y, t.x); } #endif // ------------------------------- lj1 ------------------------------------ static texture<float2, 2, cudaReadModeElementType> lj1_float_tex; static texture<int4, 2, cudaReadModeElementType> lj1_double_tex; -template <class numtyp> inline void lj1_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(lj1_float_tex); } -template <> inline void lj1_bind_texture<double2>(NVC_ConstMatD2 &mat) - { mat.bind_texture(lj1_double_tex); } -template <class numtyp> inline void lj1_unbind_texture() - { cudaUnbindTexture(lj1_float_tex); } -template <> inline void lj1_unbind_texture<double2>() - { cudaUnbindTexture(lj1_double_tex); } +template <class numtyp> inline textureReference * lj1_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"lj1_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * lj1_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"lj1_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ -typename cu_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) { +typename nvc_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) { return tex2D(lj1_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double2 _lj1_<double>(const int i,const int j) { int4 t=tex2D(lj1_double_tex,j,i); double2 ans; ans.x=__hiloint2double(t.y, t.x); ans.y=__hiloint2double(t.w, t.z); return ans; } #endif // ------------------------------- lj3 ------------------------------------ static texture<float2, 2, cudaReadModeElementType> lj3_float_tex; static texture<int4, 2, cudaReadModeElementType> lj3_double_tex; -template <class numtyp> inline void lj3_bind_texture(NVC_ConstMatT &mat) - { mat.bind_texture(lj3_float_tex); } -template <> inline void lj3_bind_texture<double2>(NVC_ConstMatD2 &mat) - { mat.bind_texture(lj3_double_tex); } -template <class numtyp> inline void lj3_unbind_texture() - { cudaUnbindTexture(lj3_float_tex); } -template <> inline void lj3_unbind_texture<double2>() - { cudaUnbindTexture(lj3_double_tex); } +template <class numtyp> inline textureReference * lj3_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"lj3_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * lj3_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"lj3_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ -typename cu_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) { +typename nvc_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) { return tex2D(lj3_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double2 _lj3_<double>(const int i,const int j) { int4 t=tex2D(lj3_double_tex,j,i); double2 ans; ans.x=__hiloint2double(t.y, t.x); ans.y=__hiloint2double(t.w, t.z); return ans; } #endif // ------------------------------- offset ------------------------------------ static texture<float, 2, cudaReadModeElementType> offset_float_tex; static texture<int2, 2, cudaReadModeElementType> offset_double_tex; -template <class numtyp> inline void offset_bind_texture(NVC_ConstMatT &mat) - { 
mat.bind_texture(offset_float_tex); } -template <> inline void offset_bind_texture<double>(NVC_ConstMatD &mat) - { mat.bind_texture(offset_double_tex); } -template <class numtyp> inline void offset_unbind_texture() - { cudaUnbindTexture(offset_float_tex); } -template <> inline void offset_unbind_texture<double>() - { cudaUnbindTexture(offset_double_tex); } +template <class numtyp> inline textureReference * offset_get_texture() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"offset_float_tex"); + return const_cast<textureReference *>(ptr); +} +template <> inline textureReference * offset_get_texture<double>() { + const textureReference *ptr; + cudaGetTextureReference(&ptr,"offset_double_tex"); + return const_cast<textureReference *>(ptr); +} template <class numtyp> static __inline__ __device__ numtyp _offset_(const int i, const int j) { return tex2D(offset_float_tex,j,i); } #ifdef GB_GPU_DOUBLE template <> static __inline__ __device__ double _offset_<double>(const int i,const int j) { int2 t=tex2D(offset_double_tex,j,i); return __hiloint2double(t.y, t.x); } #endif #endif diff --git a/lib/gpu/pair_tex_tar.cu b/lib/gpu/pair_tex_tar.cu new file mode 100644 index 000000000..d0b07177e --- /dev/null +++ b/lib/gpu/pair_tex_tar.cu @@ -0,0 +1,28 @@ +/*************************************************************************** + pair_tex_tar.cu + ------------------- + W. Michael Brown + + "Tar" of header and source files that need texture reference definitions + within file scope. + + __________________________________________________________________________ + This file is part of the LAMMPS GPU Library + __________________________________________________________________________ + + begin : Tue Jun 23 2009 + copyright : (C) 2009 by W. Michael Brown + email : wmbrown@sandia.gov + ***************************************************************************/ + +/* ----------------------------------------------------------------------- + Copyright (2009) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + ----------------------------------------------------------------------- */ + +#include "pair_gpu_atom.cu" +#include "lj_gpu.cu" +#include "gb_gpu.cu" +
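pair_tex_tar.cu above closes the loop on the texture refactor: the texture references declared in pair_gpu_texture.h are file-scope statics, so every translation unit that includes the header gets its own copies, and cudaGetTextureReference() resolves a reference by name within the module that registered it. Compiling pair_gpu_atom.cu, lj_gpu.cu, and gb_gpu.cu through this single "tar" file keeps the host-side bind and the kernels' tex2D()/tex1Dfetch() reads pointing at the same reference. A minimal standalone sketch of the get-by-name/bind pattern used throughout pair_gpu_texture.h (hypothetical names, no error checking, runtime API of that CUDA generation):

static texture<float, 2, cudaReadModeElementType> demo_tex;

inline textureReference * demo_get_texture() {
  const textureReference *ptr;
  cudaGetTextureReference(&ptr, "demo_tex");      // string lookup of the file-scope reference
  return const_cast<textureReference *>(ptr);
}

inline void demo_bind(cudaArray *array, const cudaChannelFormatDesc &channel) {
  textureReference *t = demo_get_texture();
  t->addressMode[0] = cudaAddressModeClamp;
  t->addressMode[1] = cudaAddressModeClamp;
  t->filterMode     = cudaFilterModePoint;
  t->normalized     = false;
  cudaBindTextureToArray(t, array, &channel);     // same reference the kernel's tex2D() uses
}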