diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index c142a4867..33b73568b 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -1,153 +1,153 @@ /*************************************************************************** - lal_buck.cpp + buck.cpp ------------------- - W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the lj/cut pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "buck_cl.h" #else #include "buck_ptx.h" #endif #include "lal_buck.h" #include <cassert> using namespace LAMMPS_AL; #define BuckT Buck<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> device; template <class numtyp, class acctyp> BuckT::Buck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { } template <class numtyp, class acctyp> BuckT::~Buck() { clear(); } template <class numtyp, class acctyp> int BuckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template <class numtyp, class acctyp> int BuckT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,buck); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; // Allocate a host write buffer for data initialization UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types; i++) host_write[i]=0.0; coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2,host_cutsq); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); UCL_H_Vec<double> dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); ucl_copy(sp_lj,dview,false); _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; } template <class numtyp, class acctyp> void BuckT::clear() { if (!_allocated) return; _allocated=false; coeff1.clear(); coeff2.clear(); sp_lj.clear(); this->clear_atomic(); } template <class numtyp, class acctyp> double BuckT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Buck<numtyp,acctyp>); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void BuckT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } template class Buck<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 011a116a6..1281fef64 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -1,195 +1,195 @@ // ************************************************************************** -// lj.cu +// buck.cu // ------------------- -// W. Michael Brown (ORNL) +// Trung Dac Nguyen (ORNL) // -// Device code for acceleration of the lj/cut pair style +// Device code for acceleration of the buck pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : -// email : brownw@ornl.gov +// email : nguyentd@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_aux_fun1.h" texture<float4> pos_tex; #ifndef _DOUBLE_DOUBLE ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } #endif #endif __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, __global numtyp4* coeff2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; int itype=ix.w; numtyp factor_lj; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp r2inv = delx*delx+dely*dely+delz*delz; int mtype=itype*lj_types+jtype; if (r2inv<coeff1[mtype].w) { numtyp r=ucl_sqrt(r2inv); numtyp rexp = ucl_exp(-r*coeff1[mtype].x); r2inv=ucl_recip(r2inv); numtyp r6inv = r2inv*r2inv*r2inv; numtyp force = r2inv*(coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv); force*=factor_lj; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, __global numtyp4* coeff2_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { coeff1[tid]=coeff1_in[tid]; if (eflag>0) coeff2[tid]=coeff2_in[tid]; } acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; __syncthreads(); if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); numtyp factor_lj; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp r2inv = delx*delx+dely*dely+delz*delz; if (r2inv<coeff1[mtype].w) { numtyp r=ucl_sqrt(r2inv); numtyp rexp = ucl_exp(-r*coeff1[mtype].x); r2inv=ucl_recip(r2inv); numtyp r6inv = r2inv*r2inv*r2inv; numtyp force = r2inv*(coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv); force*=factor_lj; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index b60df32fd..b0b6347c4 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -1,80 +1,80 @@ /*************************************************************************** - lal_buck.h + buck.h ------------------- - W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the buck pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_BUCK_H #define LAL_BUCK_H #include "lal_base_atomic.h" namespace LAMMPS_AL { template <class numtyp, class acctyp> class Buck : public BaseAtomic<numtyp, acctyp> { public: Buck(); ~Buck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; // --------------------------- TYPE DATA -------------------------- /// coeff1.x = rhoinv, coeff1.y = buck1, coeff1.z = buck2, coeff1.w = cutsq UCL_D_Vec<numtyp4> coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = offset UCL_D_Vec<numtyp4> coeff2; /// Special LJ values UCL_D_Vec<numtyp> sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _lj_types; private: bool _allocated; void loop(const bool _eflag, const bool _vflag); }; } #endif diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index 86ca89cc4..42dbfb3e7 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -1,166 +1,166 @@ /*************************************************************************** - lal_buck_coul.cpp + buck_coul.cpp ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the buck/coul/cut pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "buck_coul_cl.h" #else #include "buck_coul_ptx.h" #endif #include "lal_buck_coul.h" #include <cassert> using namespace LAMMPS_AL; #define BuckCoulT BuckCoul<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> device; template <class numtyp, class acctyp> BuckCoulT::BuckCoul() : BaseCharge<numtyp,acctyp>(), _allocated(false) { } template <class numtyp, class acctyp> BuckCoulT::~BuckCoul() { clear(); } template <class numtyp, class acctyp> int BuckCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template <class numtyp, class acctyp> int BuckCoulT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,buck_coul); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; // Allocate a host write buffer for data initialization UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types; i++) host_write[i]=0.0; coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq, host_cut_ljsq, host_cut_coulsq); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; host_write[i+4]=host_special_coul[i]; } ucl_copy(sp_lj,host_write,8,false); _qqrd2e = qqrd2e; _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; } template <class numtyp, class acctyp> void BuckCoulT::clear() { if (!_allocated) return; _allocated=false; coeff1.clear(); coeff2.clear(); sp_lj.clear(); this->clear_atomic(); } template <class numtyp, class acctyp> double BuckCoulT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(BuckCoul<numtyp,acctyp>); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void BuckCoulT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } template class BuckCoul<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 39e3a11cc..df4a824b4 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -1,244 +1,244 @@ // ************************************************************************** -// lal_buck_coul.cu +// buck_coul.cu // ------------------- -// Trung Dac Nguyen, W. Michael Brown (ORNL) +// Trung Dac Nguyen (ORNL) // // Device code for acceleration of the buck/coul/cut pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : -// email : brownw@ornl.gov nguyentd@ornl.gov +// email : nguyentd@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_aux_fun1.h" texture<float4> pos_tex; texture<float> q_tex; #ifndef _DOUBLE_DOUBLE ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } ucl_inline float fetch_q(const int& i, const float *q) { return tex1Dfetch(q_tex, i); } #endif #endif __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, __global numtyp4* coeff2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, __global numtyp *q_ , __global numtyp4 *cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; sp_lj[4]=sp_lj_in[4]; sp_lj[5]=sp_lj_in[5]; sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp qtmp=fetch_q(i,q_); int itype=ix.w; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; numtyp factor_lj, factor_coul; factor_lj = sp_lj[sbmask(j)]; factor_coul = sp_lj[sbmask(j)+4]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; int mtype=itype*lj_types+jtype; if (rsq<cutsq[mtype].x) { numtyp r2inv=ucl_recip(rsq); numtyp forcecoul, forcebuck, force, r6inv; numtyp rexp = (numtyp)0.0; if (rsq < cutsq[mtype].y) { // buckingham numtyp r=ucl_sqrt(rsq); rexp = ucl_exp(-r*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; forcebuck = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv)*factor_lj; } else forcebuck = (numtyp)0.0; if (rsq < coeff2[mtype].z) // coul forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul; else forcecoul = (numtyp)0.0; force = (forcebuck + forcecoul) * r2inv; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, vflag,ans,engv); } // if ii } __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, __global numtyp4* coeff2_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, __global numtyp *q_, __global numtyp4 *_cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { coeff1[tid]=coeff1_in[tid]; cutsq[tid]=_cutsq[tid]; if (eflag>0) coeff2[tid]=coeff2_in[tid]; } acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; __syncthreads(); if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp qtmp=fetch_q(i,q_); int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; numtyp factor_lj, factor_coul; factor_lj = sp_lj[sbmask(j)]; factor_coul = sp_lj[sbmask(j)+4]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutsq[mtype].x) { numtyp r2inv=ucl_recip(rsq); numtyp forcecoul, forcebuck, force, r6inv; numtyp rexp = (numtyp)0.0; if (rsq < cutsq[mtype].y) { // buckingham numtyp r=ucl_sqrt(rsq); rexp = ucl_exp(-r*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; forcebuck = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv)*factor_lj; } else forcebuck = (numtyp)0.0; if (rsq < cutsq[mtype].z) // coul forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul; else forcecoul = (numtyp)0.0; force = (forcebuck + forcecoul) * r2inv; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, vflag,ans,engv); } // if ii } diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index 44ec43988..e4bf59107 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -1,86 +1,86 @@ /*************************************************************************** - lal_buck_coul.h + buck_coul.h ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the buck/coul/cut pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_BUCK_COUL_H #define LAL_BUCK_COUL_H #include "lal_base_charge.h" namespace LAMMPS_AL { template <class numtyp, class acctyp> class BuckCoul : public BaseCharge<numtyp, acctyp> { public: BuckCoul(); ~BuckCoul(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; // --------------------------- TYPE DATA -------------------------- /// coeff1.x = rhoinv, coeff1.y = buck1, coeff1.z = buck2 UCL_D_Vec<numtyp4> coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = offset UCL_D_Vec<numtyp4> coeff2; /// cutsq.x = cutsq, cutsq.y = cutsq_lj, cutsq.z = cutsq_coul UCL_D_Vec<numtyp4> cutsq; /// Special LJ values UCL_D_Vec<numtyp> sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _lj_types; numtyp _qqrd2e; private: bool _allocated; void loop(const bool _eflag, const bool _vflag); }; } #endif diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index 9755f852f..ac3e6b891 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -1,131 +1,131 @@ /*************************************************************************** - lal_buck_coul_ext.cpp + buck_coul_ext.cpp ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck/coul/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck_coul.h" using namespace std; using namespace LAMMPS_AL; static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { BUCKCMF.clear(); gpu_mode=BUCKCMF.device->gpu_mode(); double gpu_split=BUCKCMF.device->particle_split(); int first_gpu=BUCKCMF.device->first_device(); int last_gpu=BUCKCMF.device->last_device(); int world_me=BUCKCMF.device->world_me(); int gpu_rank=BUCKCMF.device->gpu_rank(); int procs_per_gpu=BUCKCMF.device->procs_per_gpu(); BUCKCMF.device->init_message(screen,"buck",first_gpu,last_gpu); bool message=false; if (BUCKCMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKCMF.estimate_gpu_overhead(); return init_ok; } void buckc_gpu_clear() { BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BUCKCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void buckc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BUCKCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double buckc_gpu_bytes() { return BUCKCMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 681983a81..c47f46d1b 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -1,171 +1,171 @@ /*************************************************************************** - lal_buck_coul_long.cpp + buck_coul_long.cpp ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the buck/coul/long pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "buck_coul_long_cl.h" #else #include "buck_coul_long_ptx.h" #endif #include "lal_buck_coul_long.h" #include <cassert> using namespace LAMMPS_AL; #define BuckCoulLongT BuckCoulLong<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> device; template <class numtyp, class acctyp> BuckCoulLongT::BuckCoulLong() : BaseCharge<numtyp,acctyp>(), _allocated(false) { } template <class numtyp, class acctyp> BuckCoulLongT::~BuckCoulLongT() { clear(); } template <class numtyp, class acctyp> int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template <class numtyp, class acctyp> int BuckCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,buck_coul_long); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; // Allocate a host write buffer for data initialization UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types; i++) host_write[i]=0.0; coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2,host_cut_ljsq); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, host_offset); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; host_write[i+4]=host_special_coul[i]; } ucl_copy(sp_lj,host_write,8,false); _cut_coulsq=host_cut_coulsq; _qqrd2e=qqrd2e; _g_ewald=g_ewald; _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; } template <class numtyp, class acctyp> void BuckCoulLongT::clear() { if (!_allocated) return; _allocated=false; coeff1.clear(); coeff2.clear(); sp_lj.clear(); this->clear_atomic(); } template <class numtyp, class acctyp> double BuckCoulLongT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(BuckCoulLong<numtyp,acctyp>); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); } template class BuckCoulLong<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index 0884741f3..95c13dc26 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -1,258 +1,258 @@ // ************************************************************************** -// lal_buck_coul_long.cu +// buck_coul_long.cu // ------------------- -// Trung Dac Nguyen, W. Michael Brown (ORNL) +// Trung Dac Nguyen (ORNL) // // Device code for acceleration of the buck/coul/long pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : -// email : brownw@ornl.gov nguyentd@ornl.gov +// email : nguyentd@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_aux_fun1.h" texture<float4> pos_tex; texture<float> q_tex; #ifndef _DOUBLE_DOUBLE ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } ucl_inline float fetch_q(const int& i, const float *q) { return tex1Dfetch(q_tex, i); } #endif #endif __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, __global numtyp4* coeff2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, __global numtyp *q_, __global numtyp *cutsq, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; sp_lj[4]=sp_lj_in[4]; sp_lj[5]=sp_lj_in[5]; sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp qtmp=fetch_q(i,q_); int itype=ix.w; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; numtyp factor_lj, factor_coul; factor_lj = sp_lj[sbmask(j)]; factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; int mtype=itype*lj_types+jtype; if (rsq<cutsq[mtype]) { numtyp r2inv=ucl_recip(rsq); numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc; numtyp rexp = (numtyp)0.0; if (rsq < coeff1[mtype].w) { // cut_ljsq numtyp r=ucl_sqrt(rsq); rexp = ucl_exp(-r*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; force_lj = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv)*factor_lj; } else force_lj = (numtyp)0.0; if (rsq < cut_coulsq) { numtyp r = ucl_rsqrt(r2inv); numtyp grij = g_ewald * r; numtyp expm2 = ucl_exp(-grij*grij); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); } else forcecoul = (numtyp)0.0; force = (force_lj + forcecoul) * r2inv; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, vflag,ans,engv); } // if ii } __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, __global numtyp4* coeff2_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, __global numtyp *q_, __global numtyp *cutsq, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { coeff1[tid]=coeff1_in[tid]; if (eflag>0) coeff2[tid]=coeff2_in[tid]; } acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; __syncthreads(); if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp qtmp=fetch_q(i,q_); int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; numtyp factor_lj, factor_coul; factor_lj = sp_lj[sbmask(j)]; factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutsq[mtype]) { numtyp r2inv=ucl_recip(rsq); numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc; numtyp rexp = (numtyp)0.0; if (rsq < coeff1[mtype].w) { numtyp r=ucl_sqrt(rsq); rexp = ucl_exp(-r*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; force_lj = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv)*factor_lj; } else force_lj = (numtyp)0.0; if (rsq < cut_coulsq) { numtyp r = ucl_rsqrt(r2inv); numtyp grij = g_ewald * r; numtyp expm2 = ucl_exp(-grij*grij); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); } else forcecoul = (numtyp)0.0; force = (force_lj + forcecoul) * r2inv; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, vflag,ans,engv); } // if ii } diff --git a/lib/gpu/lal_buck_coul_long.h b/lib/gpu/lal_buck_coul_long.h index 098dece75..dc59d7ad4 100644 --- a/lib/gpu/lal_buck_coul_long.h +++ b/lib/gpu/lal_buck_coul_long.h @@ -1,86 +1,86 @@ /*************************************************************************** - lal_buck_coul_long.h + buck_coul_long.h ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the buck/coul/long pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_BUCK_COUL_LONG_H #define LAL_BUCK_COUL_LONG_H #include "lal_base_charge.h" namespace LAMMPS_AL { template <class numtyp, class acctyp> class BuckCoulLong : public BaseCharge<numtyp, acctyp> { public: BuckCoulLong(); ~BuckCoulLong(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; // --------------------------- TYPE DATA -------------------------- /// coeff1.x = rhoinv, coeff1.y = buck1, coeff1.z = buck2, coeff1.w = cutsq_lj UCL_D_Vec<numtyp4> coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = offset UCL_D_Vec<numtyp4> coeff2; /// cutsq UCL_D_Vec<numtyp> cutsq; /// Special LJ values [0-3] and Special Coul values [4-7] UCL_D_Vec<numtyp> sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; private: bool _allocated; void loop(const bool _eflag, const bool _vflag); }; } #endif diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index 2c3d3e5a9..d9328a921 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -1,130 +1,130 @@ /*************************************************************************** - lal_buck_coul_long_ext.cpp + buck_coul_long_ext.cpp ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck_coul_long.h" using namespace std; using namespace LAMMPS_AL; static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { BUCKCLMF.clear(); gpu_mode=BUCKCLMF.device->gpu_mode(); double gpu_split=BUCKCLMF.device->particle_split(); int first_gpu=BUCKCLMF.device->first_device(); int last_gpu=BUCKCLMF.device->last_device(); int world_me=BUCKCLMF.device->world_me(); int gpu_rank=BUCKCLMF.device->gpu_rank(); int procs_per_gpu=BUCKCLMF.device->procs_per_gpu(); BUCKCLMF.device->init_message(screen,"buck/coul/long",first_gpu,last_gpu); bool message=false; if (BUCKCLMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKCLMF.estimate_gpu_overhead(); return init_ok; } void buckcl_gpu_clear() { BUCKCLMF.clear(); } int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BUCKCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BUCKCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double buckcl_gpu_bytes() { return BUCKCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 1c4aaa222..9f7f725ae 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** - lal_buck_ext.cpp + buck_ext.cpp ------------------- - W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck.h" using namespace std; using namespace LAMMPS_AL; static Buck<PRECISION,ACC_PRECISION> BUCKMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BUCKMF.clear(); gpu_mode=BUCKMF.device->gpu_mode(); double gpu_split=BUCKMF.device->particle_split(); int first_gpu=BUCKMF.device->first_device(); int last_gpu=BUCKMF.device->last_device(); int world_me=BUCKMF.device->world_me(); int gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); BUCKMF.device->init_message(screen,"buck",first_gpu,last_gpu); bool message=false; if (BUCKMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKMF.estimate_gpu_overhead(); return init_ok; } void buck_gpu_clear() { BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void buck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { BUCKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double buck_gpu_bytes() { return BUCKMF.host_memory_usage(); } diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index d9d027935..971725764 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -1,582 +1,578 @@ /*************************************************************************** - lal_eam.cpp + eam.cpp ------------------- - W. Michael Brown, Trung Dac Nguyen (ORNL) + Trung Dac Nguyen, W. Michael Brown (ORNL) Class for acceleration of the eam pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "eam_cl.h" #else #include "eam_ptx.h" #endif #include "lal_eam.h" #include <cassert> using namespace LAMMPS_AL; #define EAMT EAM<numtyp, acctyp> #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) extern Device<PRECISION,ACC_PRECISION> device; template <class numtyp, class acctyp> EAMT::EAM() : BaseAtomic<numtyp,acctyp>(), _compiled_energy(false), _allocated(false) { } template <class numtyp, class acctyp> EAMT::~EAM() { clear(); } template <class numtyp, class acctyp> -int EAMT::init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, int *host_type2frho, - double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, - double rdr, double rdrho, int nrhor, int nrho, - int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) +int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, + int **host_type2z2r, int *host_type2frho, + double ***host_rhor_spline, double ***host_z2r_spline, + double ***host_frho_spline, double rdr, double rdrho, int nrhor, + int nrho, int nz2r, int nfrho, int nr, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen) { int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,eam); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,eam); if (success!=0) return success; // allocate fp bool cpuview=false; if (this->ucl_device->device_type()==UCL_CPU) cpuview=true; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10); host_fp.alloc(_max_fp_size,*(this->ucl_device)); if (cpuview) dev_fp.view(host_fp); else dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY); k_energy.set_function(*(this->pair_program),"kernel_energy"); k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(dev_fp,1); _compiled_energy = true; // Initialize timers for selected GPU time_pair2.init(*(this->ucl_device)); time_pair2.zero(); time_fp1.init(*(this->ucl_device)); time_fp1.zero(); time_fp2.init(*(this->ucl_device)); time_fp2.zero(); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; _rdrho = rdrho; _nrhor=nrhor; _nrho=nrho; _nz2r=nz2r; _nfrho=nfrho; _nr=nr; UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types; i++) { dview_type[i].x=0; dview_type[i].y=0; } // pack type2rhor and type2z2r type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<ntypes; i++) { for (int j=0; j<ntypes; j++) { dview_type[i*lj_types+j].x=host_type2rhor[i][j]; dview_type[i*lj_types+j].y=host_type2z2r[i][j]; } } ucl_copy(type2rhor_z2r,dview_type,false); // pack type2frho UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_OPTIMIZED); type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<ntypes; i++) dview_type2frho[i]=host_type2frho[i]; ucl_copy(type2frho,dview_type2frho,false); - - // pack frho_spline UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nfrho; ix++) for (int iy=0; iy<nrho+1; iy++) { dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][0]; dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][1]; dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][2]; dview_frho_spline[ix*(nrho+1)+iy].w=0; } frho_spline1.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(frho_spline1,dview_frho_spline,false); frho_spline1_tex.get_texture(*(this->pair_program),"frho_sp1_tex"); frho_spline1_tex.bind_float(frho_spline1,4); for (int ix=0; ix<nfrho; ix++) for (int iy=0; iy<nrho+1; iy++) { dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][3]; dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][4]; dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][5]; dview_frho_spline[ix*(nrho+1)+iy].w=host_frho_spline[ix][iy][6]; } frho_spline2.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(frho_spline2,dview_frho_spline,false); frho_spline2_tex.get_texture(*(this->pair_program),"frho_sp2_tex"); frho_spline2_tex.bind_float(frho_spline2,4); // pack rhor_spline UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nrhor; ix++) for (int iy=0; iy<nr+1; iy++) { dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][0]; dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][1]; dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][2]; dview_rhor_spline[ix*(nr+1)+iy].w=(numtyp)0; } rhor_spline1.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(rhor_spline1,dview_rhor_spline,false); rhor_spline1_tex.get_texture(*(this->pair_program),"rhor_sp1_tex"); rhor_spline1_tex.bind_float(rhor_spline1,4); for (int ix=0; ix<nrhor; ix++) for (int iy=0; iy<nr+1; iy++) { dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][3]; dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][4]; dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][5]; dview_rhor_spline[ix*(nr+1)+iy].w=host_rhor_spline[ix][iy][6]; } rhor_spline2.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(rhor_spline2,dview_rhor_spline,false); rhor_spline2_tex.get_texture(*(this->pair_program),"rhor_sp2_tex"); rhor_spline2_tex.bind_float(rhor_spline2,4); // pack z2r_spline UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nz2r; ix++) for (int iy=0; iy<nr+1; iy++) { dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][0]; dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][1]; dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][2]; dview_z2r_spline[ix*(nr+1)+iy].w=(numtyp)0; } z2r_spline1.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline1,dview_z2r_spline,false); z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex"); z2r_spline1_tex.bind_float(z2r_spline1,4); for (int ix=0; ix<nz2r; ix++) for (int iy=0; iy<nr+1; iy++) { dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][3]; dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][4]; dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][5]; dview_z2r_spline[ix*(nr+1)+iy].w=host_z2r_spline[ix][iy][6]; } z2r_spline2.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline2,dview_z2r_spline,false); z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex"); z2r_spline2_tex.bind_float(z2r_spline2,4); _allocated=true; this->_max_bytes=type2rhor_z2r.row_bytes() + type2frho.row_bytes() + rhor_spline1.row_bytes() + rhor_spline2.row_bytes() + frho_spline1.row_bytes() + frho_spline2.row_bytes() + z2r_spline1.row_bytes() + z2r_spline2.row_bytes() + dev_fp.row_bytes(); return 0; } template <class numtyp, class acctyp> void EAMT::clear() { if (!_allocated) return; _allocated=false; type2rhor_z2r.clear(); type2frho.clear(); rhor_spline1.clear(); rhor_spline2.clear(); frho_spline1.clear(); frho_spline2.clear(); z2r_spline1.clear(); z2r_spline2.clear(); host_fp.clear(); dev_fp.clear(); time_pair2.clear(); time_fp1.clear(); time_fp2.clear(); if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); _compiled_energy=false; } this->clear_atomic(); } template <class numtyp, class acctyp> double EAMT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(EAM<numtyp,acctyp>); } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then compute atom energies/forces // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } // ------------------- Resize FP Array for EAM -------------------- if (nall>_max_fp_size) { dev_fp.clear(); host_fp.clear(); _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10); host_fp.alloc(_max_fp_size,*(this->ucl_device)); if (this->ucl_device->device_type()==UCL_CPU) dev_fp.view(host_fp); else dev_fp.alloc(_max_fp_size,*(this->ucl_device)); fp_tex.bind_float(dev_fp,1); } *fp_ptr=host_fp.begin(); // ---------------------------------------------------------------- if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style this->resize_atom(0,nall,success); this->zero_timers(); return; } int ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); host_start=inum; // ----------------------------------------------------------------- if (ago==0) { this->reset_nbors(nall, inum, ilist, numj, firstneigh, success); if (!success) return; } this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); loop(eflag,vflag); // copy fp from device to host for comm _nlocal=nlocal; time_fp1.start(); ucl_copy(host_fp,dev_fp,nlocal,false); time_fp1.stop(); } // --------------------------------------------------------------------------- // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- template <class numtyp, class acctyp> int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } // ------------------- Resize FP Array for EAM -------------------- if (nall>_max_fp_size) { dev_fp.clear(); host_fp.clear(); _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10); host_fp.alloc(_max_fp_size,*(this->ucl_device)); if (this->ucl_device->device_type()==UCL_CPU) dev_fp.view(host_fp); else dev_fp.alloc(_max_fp_size,*(this->ucl_device)); fp_tex.bind_float(dev_fp,1); } *fp_ptr=host_fp.begin(); // ----------------------------------------------------------------- if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style this->resize_atom(0,nall,success); this->zero_timers(); return NULL; } // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; } else { this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); } *ilist=this->nbor->host_ilist.begin(); *jnum=this->nbor->host_acc.begin(); loop(eflag,vflag); // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); ucl_copy(host_fp,dev_fp,inum_full,false); time_fp1.stop(); return this->nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { if (this->ans->inum()==0) return; this->hd_balancer.start_timer(); time_fp2.start(); this->add_fp_data(); time_fp2.stop(); loop2(eflag,vflag); if (ilist == NULL) this->ans->copy_answers(eflag,vflag,eatom,vatom); else this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_energy_fast.set_size(GX,BX); this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(), &type2frho.begin(), &rhor_spline2.begin(), &frho_spline1.begin(),&frho_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &dev_fp.begin(), &this->ans->dev_engv.begin(), &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(), &type2frho.begin(), &rhor_spline2.begin(), &frho_spline1.begin(),&frho_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &dev_fp.begin(), &this->ans->dev_engv.begin(), &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, &this->_threads_per_atom); } this->time_pair.stop(); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::loop2(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair2.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(), &type2rhor_z2r.begin(), &rhor_spline1.begin(), &z2r_spline1.begin(), &z2r_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(), &type2rhor_z2r.begin(), &rhor_spline1.begin(), &z2r_spline1.begin(), &z2r_spline2.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } this->time_pair2.stop(); } template class EAM<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index e705695bb..84417c251 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -1,493 +1,493 @@ // ************************************************************************** -// lal_eam.cu +// eam.cu // ------------------- -// Trung Dac Nguyen, W. Michael Brown (ORNL) +// Trung Dac Nguyen, W. Michael Brown (ORNL) // // Device code for acceleration of the eam pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_aux_fun1.h" texture<float4> pos_tex; texture<float> fp_tex; texture<float4> rhor_sp1_tex; texture<float4> rhor_sp2_tex; texture<float4> frho_sp1_tex; texture<float4> frho_sp2_tex; texture<float4> z2r_sp1_tex; texture<float4> z2r_sp2_tex; #ifdef _DOUBLE_DOUBLE ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) { return rhor_spline1[i]; } ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) { return rhor_spline2[i]; } ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) { return frho_spline1[i]; } ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) { return frho_spline2[i]; } ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) { return z2r_spline1[i]; } ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) { return z2r_spline2[i]; } #endif #ifndef _DOUBLE_DOUBLE ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } ucl_inline float fetch_q(const int& i, const float *fp) { return tex1Dfetch(fp_tex, i); } ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1) { return tex1Dfetch(rhor_sp1_tex, i); } ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2) { return tex1Dfetch(rhor_sp2_tex, i); } ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1) { return tex1Dfetch(frho_sp1_tex, i); } ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2) { return tex1Dfetch(frho_sp2_tex, i); } ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1) { return tex1Dfetch(z2r_sp1_tex, i); } ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2) { return tex1Dfetch(z2r_sp2_tex, i); } #endif #endif #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ eflag,vflag,engv,rdrho,nrho,i) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=rho; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset < s) \ red_acc[tid] += red_acc[tid+s]; \ } \ rho=red_acc[tid]; \ } \ if (offset==0) { \ numtyp p = rho*rdrho + (numtyp)1.0; \ int m=p; \ m = MAX(1,MIN(m,nrho-1)); \ p -= m; \ p = MIN(p,(numtyp)1.0); \ int index = type2frho[itype]*(nrho+1)+m; \ numtyp4 coeff = fetch_frho_sp1(index, frho_spline1); \ numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \ fp_[i]=fp; \ if (eflag>0) { \ coeff = fetch_frho_sp2(index, frho_spline2); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ engv[ii]=(acctyp)2.0*energy; \ } \ } #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ } \ } \ f.x=red_acc[0][tid]; \ f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ if (vflag>0) { \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ } \ } \ for (int r=0; r<6; r++) \ virial[r]=red_acc[r][tid]; \ } \ } \ if (offset==0) { \ if (eflag>0) { \ engv[ii]+=energy; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ engv[ii]=virial[i]; \ engv+=inum; \ } \ } \ ans[ii]=f; \ } __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r, __global int *type2frho, __global numtyp4 *rhor_spline2, __global numtyp4 *frho_spline1, __global numtyp4 *frho_spline2, __global int *dev_nbor, __global int *dev_packed, __global numtyp *fp_, __global acctyp *engv, const int eflag, const int inum, const int nbor_pitch, const int ntypes, const numtyp cutforcesq, const numtyp rdr, const numtyp rdrho, const int nrho, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); acctyp rho = (acctyp)0; acctyp energy = (acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; int itype=ix.w; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutforcesq) { numtyp p = ucl_sqrt(rsq)*rdr + (numtyp)1.0; int m=p; m = MIN(m,nr-1); p -= m; p = MIN(p,(numtyp)1.0); int mtype = jtype*ntypes+itype; int index = type2rhor_z2r[mtype].x*(nr+1)+m; numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2); rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; } } // for nbor store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, eflag,vflag,engv,rdrho,nrho,i); } // if ii } __kernel void kernel_energy_fast(__global numtyp4 *x_, __global int2 *type2rhor_z2r_in, __global int *type2frho_in, __global numtyp4 *rhor_spline2, __global numtyp4 *frho_spline1, __global numtyp4 *frho_spline2, __global int *dev_nbor, __global int *dev_packed, __global numtyp *fp_, __global acctyp *engv, const int eflag, const int inum, const int nbor_pitch, const int ntypes, const numtyp cutforcesq, const numtyp rdr, const numtyp rdrho, const int nrho, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local int type2frho[MAX_SHARED_TYPES]; if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { type2rhor_z2r[tid]=type2rhor_z2r_in[tid]; } if (tid<MAX_SHARED_TYPES) { type2frho[tid]=type2frho_in[tid]; } acctyp rho = (acctyp)0; acctyp energy = (acctyp)0; __syncthreads(); if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; int itype=ix.w; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutforcesq) { numtyp p = ucl_sqrt(rsq)*rdr + (numtyp)1.0; int m=p; m = MIN(m,nr-1); p -= m; p = MIN(p,(numtyp)1.0); int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w); int mtype = jtype+itype; int index = type2rhor_z2r[mtype].x*(nr+1)+m; numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2); rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; } } // for nbor store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, eflag,vflag,engv,rdrho,nrho,i); } // if ii } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_, __global int2 *type2rhor_z2r, __global numtyp4 *rhor_spline1, __global numtyp4 *z2r_spline1, __global numtyp4 *z2r_spline2, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int ntypes, const numtyp cutforcesq, const numtyp rdr, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp ifp=fetch_q(i,fp_); //fp_[i]; int itype=ix.w; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutforcesq) { numtyp r = ucl_sqrt(rsq); numtyp p = r*rdr + (numtyp)1.0; int m=p; m = MIN(m,nr-1); p -= m; p = MIN(p,(numtyp)1.0); int mtype,index; numtyp4 coeff; mtype = itype*ntypes+jtype; index = type2rhor_z2r[mtype].x*(nr+1)+m; coeff = fetch_rhor_sp1(index, rhor_spline1); numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z; mtype = jtype*ntypes+itype; index = type2rhor_z2r[mtype].x*(nr+1)+m; coeff = fetch_rhor_sp1(index, rhor_spline1); numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z; mtype = itype*ntypes+jtype; index = type2rhor_z2r[mtype].y*(nr+1)+m; coeff = fetch_z2r_sp1(index, z2r_spline1); numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z; coeff = fetch_z2r_sp2(index, z2r_spline2); numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; numtyp recip = ucl_recip(r); numtyp phi = z2*recip; numtyp phip = z2p*recip - phi*recip; numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip; numtyp force = -psip*recip; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { energy += phi; } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_, __global int2 *type2rhor_z2r_in, __global numtyp4 *rhor_spline1, __global numtyp4 *z2r_spline1, __global numtyp4 *z2r_spline2, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const numtyp cutforcesq, const numtyp rdr, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { type2rhor_z2r[tid]=type2rhor_z2r_in[tid]; } acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; __syncthreads(); if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp ifp=fetch_q(i,fp_); //fp_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jw=jx.w; int jtype=fast_mul((int)MAX_SHARED_TYPES,jw); // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutforcesq) { numtyp r = ucl_sqrt(rsq); numtyp p = r*rdr + (numtyp)1.0; int m=p; m = MIN(m,nr-1); p -= m; p = MIN(p,(numtyp)1.0); numtyp4 coeff; int mtype,index; mtype = itype+jw; index = type2rhor_z2r[mtype].x*(nr+1)+m; coeff = fetch_rhor_sp1(index, rhor_spline1); numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z; mtype = jtype+iw; index = type2rhor_z2r[mtype].x*(nr+1)+m; coeff = fetch_rhor_sp1(index, rhor_spline1); numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z; mtype = itype+jw; index = type2rhor_z2r[mtype].y*(nr+1)+m; coeff = fetch_z2r_sp1(index, z2r_spline1); numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z; coeff = fetch_z2r_sp2(index, z2r_spline2); numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; numtyp recip = ucl_recip(r); numtyp phi = z2*recip; numtyp phip = z2p*recip - phi*recip; numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip; numtyp force = -psip*recip; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { energy += phi; } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index 26d9c6fa3..9a7baf6f1 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -1,146 +1,146 @@ /*************************************************************************** - lal_eam.h + eam.h ------------------- - W. Michael Brown, Trung Dac Nguyen (ORNL) + Trung Dac Nguyen, W. Michael Brown (ORNL) Class for acceleration of the eam pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_EAM_H #define LAL_EAM_H #include "lal_precision.h" #include "lal_base_atomic.h" namespace LAMMPS_AL { template <class numtyp, class acctyp> class EAM : public BaseAtomic<numtyp, acctyp> { public: EAM(); ~EAM(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); // Copy charges to device asynchronously inline void add_fp_data() { int nghost=this->atom->nall()-_nlocal; if (nghost>0) { UCL_H_Vec<numtyp> host_view; UCL_D_Vec<numtyp> dev_view; host_view.view_offset(_nlocal,host_fp); dev_view.view_offset(_nlocal,dev_fp); ucl_copy(dev_view,host_view,nghost,true); } } /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; /// Pair loop with host neighboring void compute(const int f_ago, const int inum_full, const int, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, int &inum, void **fp_ptr); /// Pair loop with host neighboring void compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy, k_energy_fast; // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; UCL_Texture rhor_spline1_tex, rhor_spline2_tex; UCL_Texture frho_spline1_tex, frho_spline2_tex; UCL_Texture z2r_spline1_tex, z2r_spline2_tex; // --------------------------- DEVICE DATA -------------------------- /// Device Timers UCL_Timer time_pair2, time_fp1, time_fp2; // --------------------------- TYPE DATA -------------------------- UCL_D_Vec<int2> type2rhor_z2r; UCL_D_Vec<int> type2frho; UCL_D_Vec<numtyp4> z2r_spline1, z2r_spline2; UCL_D_Vec<numtyp4> frho_spline1, frho_spline2; UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2; numtyp _cutforcesq,_rdr,_rdrho; int _nfrho,_nrhor,_nrho,_nz2r,_nr; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _ntypes; int _max_fp_size; /// True of energy kernels are compiled bool _compiled_energy; /// Per-atom arrays UCL_H_Vec<numtyp> host_fp; UCL_D_Vec<numtyp> dev_fp; protected: bool _allocated; int _nlocal; void loop(const bool _eflag, const bool _vflag); void loop2(const bool _eflag, const bool _vflag); }; } #endif diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 9fab47451..d8f6b3cee 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -1,147 +1,147 @@ -// ************************************************************************** -// lal_eam_ext.cpp -// ------------------- -// W. Michael Brown, Trung Dac Nguyen (ORNL) -// -// Class for acceleration of the eam pair style -// -// __________________________________________________________________________ -// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) -// __________________________________________________________________________ -// -// begin : -// email : brownw@ornl.gov nguyentd@ornl.gov -// ***************************************************************************/ +/*************************************************************************** + eam_ext.cpp + ------------------- + Trung Dac Nguyen, W. Michael Brown (ORNL) + + Functions for LAMMPS access to buck acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov nguyentd@ornl.gov + ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_eam.h" using namespace std; using namespace LAMMPS_AL; static EAM<PRECISION,ACC_PRECISION> EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); double gpu_split=EAMMF.device->particle_split(); int first_gpu=EAMMF.device->first_device(); int last_gpu=EAMMF.device->last_device(); int world_me=EAMMF.device->world_me(); int gpu_rank=EAMMF.device->gpu_rank(); int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; fp_size=sizeof(PRECISION); EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; if (EAMMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) EAMMF.estimate_gpu_overhead(); return init_ok; } void eam_gpu_clear() { EAMMF.clear(); } int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); } void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { EAMMF.compute(ago,inum_full,nlocal,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, fp_ptr); } void eam_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { EAMMF.compute2(ilist, eflag, vflag, eatom, vatom); } double eam_gpu_bytes() { return EAMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index c75bd089b..5ab94ae81 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -1,150 +1,150 @@ /*************************************************************************** - lal_yukawa.cpp + yukawa.cpp ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the yukawa pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "yukawa_cl.h" #else #include "yukawa_ptx.h" #endif #include "lal_yukawa.h" #include <cassert> using namespace LAMMPS_AL; #define YukawaT Yukawa<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> device; template <class numtyp, class acctyp> YukawaT::Yukawa() : BaseAtomic<numtyp,acctyp>(), _allocated(false) { } template <class numtyp, class acctyp> YukawaT::~Yukawa() { clear(); } template <class numtyp, class acctyp> int YukawaT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template <class numtyp, class acctyp> int YukawaT::init(const int ntypes, double **host_cutsq, double kappa, double **host_a, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,yukawa); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; // Allocate a host write buffer for data initialization UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types; i++) host_write[i]=0.0; coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset, host_cutsq); UCL_H_Vec<double> dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); ucl_copy(sp_lj,dview,false); _kappa = kappa; _allocated=true; this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); return 0; } template <class numtyp, class acctyp> void YukawaT::clear() { if (!_allocated) return; _allocated=false; coeff.clear(); sp_lj.clear(); this->clear_atomic(); } template <class numtyp, class acctyp> double YukawaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Yukawa<numtyp,acctyp>); } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void YukawaT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa, &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } template class Yukawa<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index 39c571455..e2fa11aa3 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -1,189 +1,189 @@ // ************************************************************************** -// yukawa.cu +// yukawa.cu // ------------------- -// Trung Dac Nguyen, W. Michael Brown (ORNL) +// Trung Dac Nguyen (ORNL) // // Device code for acceleration of the yukawa pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : -// email : brownw@ornl.gov nguyentd@ornl.gov +// email : nguyentd@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_aux_fun1.h" texture<float4> pos_tex; #ifndef _DOUBLE_DOUBLE ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } #endif #endif __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff, const numtyp kappa, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; int itype=ix.w; numtyp factor_lj; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; int mtype=itype*lj_types+jtype; if (rsq<coeff[mtype].z) { numtyp r2inv = (numtyp)1.0/rsq; numtyp r = ucl_rsqrt(r2inv); numtyp rinv = 1.0/r; numtyp screening = exp(-kappa*r); numtyp force = coeff[mtype].x*screening*(kappa + rinv)*r2inv; force*=factor_lj; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in, const numtyp kappa, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { coeff[tid]=coeff_in[tid]; } acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; __syncthreads(); if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); numtyp factor_lj; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; int mtype=itype+jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<coeff[mtype].z) { numtyp r2inv = (numtyp)1.0/rsq; numtyp r = ucl_rsqrt(r2inv); numtyp rinv = 1.0/r; numtyp screening = exp(-kappa*r); numtyp force = coeff[mtype].x*screening*(kappa + rinv)*r2inv; force*=factor_lj; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h index 257bd9da5..720dc903d 100644 --- a/lib/gpu/lal_yukawa.h +++ b/lib/gpu/lal_yukawa.h @@ -1,80 +1,80 @@ /*************************************************************************** - lal_yukawa.h + yukawa.h ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Class for acceleration of the yukawa pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_YUKAWA_H #define LAL_YUKAWA_H #include "lal_base_atomic.h" namespace LAMMPS_AL { template <class numtyp, class acctyp> class Yukawa : public BaseAtomic<numtyp, acctyp> { public: Yukawa(); ~Yukawa(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double kappa, double **host_a, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; // --------------------------- TYPE DATA -------------------------- /// coeff.x = a, coeff.y = offset, coeff.z = cutsq UCL_D_Vec<numtyp4> coeff; /// Special LJ values UCL_D_Vec<numtyp> sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _lj_types; /// kappa numtyp _kappa; private: bool _allocated; void loop(const bool _eflag, const bool _vflag); }; } #endif diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 1ba2d0902..36f390ab9 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** - lal_yukawa_ext.cpp + yukawa_ext.cpp ------------------- - Trung Dac Nguyen, W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Functions for LAMMPS access to yukawa acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : - email : brownw@ornl.gov nguyentd@ornl.gov + email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_yukawa.h" using namespace std; using namespace LAMMPS_AL; static Yukawa<PRECISION,ACC_PRECISION> YKMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, double **host_a, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { YKMF.clear(); gpu_mode=YKMF.device->gpu_mode(); double gpu_split=YKMF.device->particle_split(); int first_gpu=YKMF.device->first_device(); int last_gpu=YKMF.device->last_device(); int world_me=YKMF.device->world_me(); int gpu_rank=YKMF.device->gpu_rank(); int procs_per_gpu=YKMF.device->procs_per_gpu(); YKMF.device->init_message(screen,"yukawa",first_gpu,last_gpu); bool message=false; if (YKMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) YKMF.estimate_gpu_overhead(); return init_ok; } void yukawa_gpu_clear() { YKMF.clear(); } int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { YKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double yukawa_gpu_bytes() { return YKMF.host_memory_usage(); } diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp index ceaf1090b..d4e00811c 100644 --- a/src/GPU/pair_buck_coul_cut_gpu.cpp +++ b/src/GPU/pair_buck_coul_cut_gpu.cpp @@ -1,257 +1,260 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) + Contributing authors: Trung Dac Nguyen (ORNL) ------------------------------------------------------------------------- */ #include "lmptype.h" #include "math.h" #include "stdio.h" #include "stdlib.h" #include "pair_buck_coul_cut_gpu.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "force.h" #include "neighbor.h" #include "neigh_list.h" #include "integrate.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "universe.h" #include "update.h" #include "domain.h" #include "string.h" #include "gpu_extra.h" // External functions from cuda library for atom decomposition int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e); + double **host_buck1, double **host_buck2, double **host_a, + double **host_c, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); void buckc_gpu_clear(); -int ** buckc_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); void buckc_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double buckc_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ PairBuckCoulCutGPU::PairBuckCoulCutGPU(LAMMPS *lmp) : PairBuckCoulCut(lmp), - gpu_mode(GPU_FORCE) + gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairBuckCoulCutGPU::~PairBuckCoulCutGPU() { buckc_gpu_clear(); } /* ---------------------------------------------------------------------- */ void PairBuckCoulCutGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; int nall = atom->nlocal + atom->nghost; int inum, host_start; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; firstneigh = buckc_gpu_compute_n(neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, - &ilist, &numneigh, cpu_time, success, - atom->q, domain->boxlo, domain->prd); + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success, + atom->q, domain->boxlo, domain->prd); } else { inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; buckc_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, atom->q, - atom->nlocal, domain->boxlo, domain->prd); + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); if (host_start<inum) { cpu_time = MPI_Wtime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairBuckCoulCutGPU::init_style() { if (force->newton_pair) - error->all(FLERR,"Cannot use newton pair with buck/gpu pair style"); + error->all(FLERR, + "Cannot use newton pair with buck/coul/cut/gpu pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int success = buckc_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, - a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, - force->special_coul, force->qqrd2e); + a, c, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } } /* ---------------------------------------------------------------------- */ double PairBuckCoulCutGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + buckc_gpu_bytes(); } /* ---------------------------------------------------------------------- */ -void PairBuckCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, - int *ilist, int *numneigh, int **firstneigh) { +void PairBuckCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, + int **firstneigh) { int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,qtmp,delx,dely,delz,evdwl,ecoul,fpair; double rsq,r2inv,r6inv,forcecoul,forcebuck,factor_coul,factor_lj; double r,rexp; int *jlist; evdwl = ecoul = 0.0; double **x = atom->x; double **f = atom->f; double *q = atom->q; int *type = atom->type; double *special_coul = force->special_coul; double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; qtmp = q[i]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; factor_coul = special_coul[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsq[itype][jtype]) { r2inv = 1.0/rsq; r = sqrt(rsq); - if (rsq < cut_coulsq[itype][jtype]) - forcecoul = qqrd2e * qtmp*q[j]/r; - else forcecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - r6inv = r2inv*r2inv*r2inv; - rexp = exp(-r*rhoinv[itype][jtype]); - forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv; - } else forcebuck = 0.0; + if (rsq < cut_coulsq[itype][jtype]) + forcecoul = qqrd2e * qtmp*q[j]/r; + else forcecoul = 0.0; + if (rsq < cut_ljsq[itype][jtype]) { + r6inv = r2inv*r2inv*r2inv; + rexp = exp(-r*rhoinv[itype][jtype]); + forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv; + } else forcebuck = 0.0; + fpair = (factor_coul*forcecoul + factor_lj*forcebuck) * r2inv; - + f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) { if (rsq < cut_coulsq[itype][jtype]) ecoul = factor_coul * qqrd2e * qtmp*q[j]/r; else ecoul = 0.0; if (rsq < cut_ljsq[itype][jtype]) { evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv - offset[itype][jtype]; evdwl *= factor_lj; } else evdwl = 0.0; } if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); } } } } diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp index 92d54181a..a5f640111 100644 --- a/src/GPU/pair_buck_coul_long_gpu.cpp +++ b/src/GPU/pair_buck_coul_long_gpu.cpp @@ -1,288 +1,291 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Trung Dac Nguyen, Mike Brown (ORNL) + Contributing author: Trung Dac Nguyen (ORNL) ------------------------------------------------------------------------- */ #include "lmptype.h" #include "math.h" #include "stdio.h" #include "stdlib.h" #include "pair_buck_coul_long_gpu.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "force.h" #include "neighbor.h" #include "neigh_list.h" #include "integrate.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "universe.h" #include "update.h" #include "domain.h" #include "string.h" #include "kspace.h" #include "gpu_extra.h" #define EWALD_F 1.12837917 #define EWALD_P 0.3275911 #define A1 0.254829592 #define A2 -0.284496736 #define A3 1.421413741 #define A4 -1.453152027 #define A5 1.061405429 // External functions from cuda library for atom decomposition int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); + double **host_buck1, double **host_buck2, double **host_a, + double **host_c, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void buckcl_gpu_clear(); -int** buckcl_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double buckcl_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ PairBuckCoulLongGPU::PairBuckCoulLongGPU(LAMMPS *lmp) : PairBuckCoulLong(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairBuckCoulLongGPU::~PairBuckCoulLongGPU() { buckcl_gpu_clear(); } /* ---------------------------------------------------------------------- */ void PairBuckCoulLongGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; int nall = atom->nlocal + atom->nghost; int inum, host_start; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; firstneigh = buckcl_gpu_compute_n(neighbor->ago, inum, nall, atom->x, - atom->type, domain->sublo, domain->subhi, - atom->tag, atom->nspecial, atom->special, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, - domain->prd); + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; buckcl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, atom->q, - atom->nlocal, domain->boxlo, domain->prd); + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); if (host_start<inum) { cpu_time = MPI_Wtime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairBuckCoulLongGPU::init_style() { if (!atom->q_flag) - error->all(FLERR,"Pair style buck/coul/long/gpu requires atom attribute q"); + error->all(FLERR, + "Pair style buck/coul/long/gpu requires atom attribute q"); if (force->newton_pair) - error->all(FLERR,"Cannot use newton pair with buck/could/cut/gpu pair style"); + error->all(FLERR, + "Cannot use newton pair with buck/coul/long/gpu pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; cut_coulsq = cut_coul * cut_coul; // insure use of KSpace long-range solver, set g_ewald if (force->kspace == NULL) error->all(FLERR,"Pair style is incompatible with KSpace style"); g_ewald = force->kspace->g_ewald; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int success = buckcl_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, - a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, - force->special_coul, force->qqrd2e, g_ewald); + a, c, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + g_ewald); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } } /* ---------------------------------------------------------------------- */ double PairBuckCoulLongGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + buckcl_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairBuckCoulLongGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,rexp,r2inv,r6inv,forcecoul,forcebuck,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; int *jlist; double rsq; evdwl = ecoul = 0.0; double **x = atom->x; double **f = atom->f; double *q = atom->q; int *type = atom->type; double *special_coul = force->special_coul; double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; qtmp = q[i]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; factor_coul = special_coul[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsq[itype][jtype]) { r2inv = 1.0/rsq; if (rsq < cut_coulsq) { grij = g_ewald * r; expm2 = exp(-grij*grij); t = 1.0 / (1.0 + EWALD_P*grij); erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; prefactor = qqrd2e * qtmp*q[j]/r; forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; } else forcecoul = 0.0; if (rsq < cut_ljsq[itype][jtype]) { r6inv = r2inv*r2inv*r2inv; - rexp = exp(-r*rhoinv[itype][jtype]); - forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv; + rexp = exp(-r*rhoinv[itype][jtype]); + forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv; } else forcebuck = 0.0; fpair = (forcecoul + factor_lj*forcebuck) * r2inv; f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) { if (rsq < cut_coulsq) { - ecoul = prefactor*erfc; + ecoul = prefactor*erfc; if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; } else ecoul = 0.0; if (rsq < cut_ljsq[itype][jtype]) { evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv - offset[itype][jtype]; evdwl *= factor_lj; } else evdwl = 0.0; } if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); } } } } diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp index c277f9b02..67982202c 100644 --- a/src/GPU/pair_buck_gpu.cpp +++ b/src/GPU/pair_buck_gpu.cpp @@ -1,229 +1,230 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Mike Brown (SNL) + Contributing author: Trung Dac Nguyen (ORNL) ------------------------------------------------------------------------- */ #include "lmptype.h" #include "math.h" #include "stdio.h" #include "stdlib.h" #include "pair_buck_gpu.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "force.h" #include "neighbor.h" #include "neigh_list.h" #include "integrate.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "universe.h" #include "update.h" #include "domain.h" #include "string.h" #include "gpu_extra.h" // External functions from cuda library for atom decomposition int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void buck_gpu_clear(); -int ** buck_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success); +int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success); void buck_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double buck_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ PairBuckGPU::PairBuckGPU(LAMMPS *lmp) : PairBuck(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairBuckGPU::~PairBuckGPU() { buck_gpu_clear(); } /* ---------------------------------------------------------------------- */ void PairBuckGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; int nall = atom->nlocal + atom->nghost; int inum, host_start; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; firstneigh = buck_gpu_compute_n(neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, - &ilist, &numneigh, cpu_time, success); + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success); } else { inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; buck_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); if (host_start<inum) { cpu_time = MPI_Wtime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairBuckGPU::init_style() { if (force->newton_pair) error->all(FLERR,"Cannot use newton pair with buck/gpu pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int success = buck_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, - a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + a, c, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } } /* ---------------------------------------------------------------------- */ double PairBuckGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + buck_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairBuckGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcebuck,factor_lj; double r,rexp; int *jlist; double **x = atom->x; double **f = atom->f; int *type = atom->type; double *special_lj = force->special_lj; // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsq[itype][jtype]) { r2inv = 1.0/rsq; r6inv = r2inv*r2inv*r2inv; r = sqrt(rsq); rexp = exp(-r*rhoinv[itype][jtype]); forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv; fpair = factor_lj*forcebuck*r2inv; f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) { evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv - offset[itype][jtype]; evdwl *= factor_lj; } if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); } } } } diff --git a/src/GPU/pair_cg_cmm_coul_msm.cpp b/src/GPU/pair_cg_cmm_coul_msm.cpp index 5321945eb..791dfc9c4 100644 --- a/src/GPU/pair_cg_cmm_coul_msm.cpp +++ b/src/GPU/pair_cg_cmm_coul_msm.cpp @@ -1,290 +1,291 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- CMM coarse grained MD potentials. Coulomb with MSM version. Contributing author: Mike Brown <brownw@ornl.gov> ------------------------------------------------------------------------- */ #include "string.h" #include "pair_cg_cmm_coul_msm.h" #include "memory.h" #include "atom.h" #include "force.h" #include "kspace.h" using namespace LAMMPS_NS; enum {C3=0,C4=1}; /* ---------------------------------------------------------------------- */ PairCGCMMCoulMSM::PairCGCMMCoulMSM(LAMMPS *lmp) : PairCMMCommon(lmp) { respa_enable = 0; single_enable = 0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- */ PairCGCMMCoulMSM::~PairCGCMMCoulMSM() { if (allocated_coul) { memory->destroy(cut_lj); memory->destroy(cut_ljsq); memory->destroy(cut_coul); memory->destroy(cut_coulsq); allocated_coul=0; } } /* ---------------------------------------------------------------------- */ void PairCGCMMCoulMSM::allocate() { PairCMMCommon::allocate(); allocated_coul = 1; int n = atom->ntypes; memory->create(cut_lj,n+1,n+1,"paircg:cut_lj"); memory->create(cut_ljsq,n+1,n+1,"paircg:cut_ljsq"); memory->create(cut_coul,n+1,n+1,"paircg:cut_coul"); memory->create(cut_coulsq,n+1,n+1,"paircg:cut_coulsq"); } /* ---------------------------------------------------------------------- global settings ------------------------------------------------------------------------- */ void PairCGCMMCoulMSM::settings(int narg, char **arg) { // strip off smoothing type and send args to parent if (narg < 1) error->all(FLERR,"Illegal pair_style command"); if (strcmp(arg[0],"C3") == 0) _smooth = C3; else if (strcmp(arg[0],"C4") == 0) _smooth = C4; else error->all(FLERR,"Illegal pair_style command"); PairCMMCommon::settings(narg-1,&arg[1]); } /* ---------------------------------------------------------------------- */ void PairCGCMMCoulMSM::init_style() { if (!atom->q_flag) error->all(FLERR,"Pair style cg/cut/coul/msm requires atom attribute q"); PairCMMCommon::init_style(); _ia=-1.0/cut_coul_global; _ia2=_ia*_ia; _ia3=_ia2*_ia; cut_respa = NULL; } /* ---------------------------------------------------------------------- */ double PairCGCMMCoulMSM::init_one(int i, int j) { double mycut = PairCMMCommon::init_one(i,j); return mycut; } /* ---------------------------------------------------------------------- */ void PairCGCMMCoulMSM::compute(int eflag, int vflag) { int i,j,ii,jj,inum,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; int *ilist,*jlist,*numneigh,**firstneigh; double rsq; if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; double **x = atom->x; double **f = atom->f; double *q = atom->q; int *type = atom->type; int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; double *special_coul = force->special_coul; double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; int newton_pair = force->newton_pair; inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; // loop over neighbors of my atoms for (ii = 0; ii < inum; ii++) { i = ilist[ii]; qtmp = q[i]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; factor_coul = special_coul[sbmask(j)]; j &= NEIGHMASK; const double delx = xtmp - x[j][0]; const double dely = ytmp - x[j][1]; const double delz = ztmp - x[j][2]; const double rsq = delx*delx + dely*dely + delz*delz; const int jtype = type[j]; double evdwl = 0.0; double ecoul = 0.0; double fpair = 0.0; if (rsq < cutsq[itype][jtype]) { const double r2inv = 1.0/rsq; const int cgt=cg_type[itype][jtype]; double forcelj = 0.0; double forcecoul = 0.0; if (rsq < cut_ljsq[itype][jtype]) { forcelj=factor_lj; if (eflag) evdwl=factor_lj; if (cgt == CG_LJ12_4) { const double r4inv=r2inv*r2inv; forcelj *= r4inv*(lj1[itype][jtype]*r4inv*r4inv - lj2[itype][jtype]); if (eflag) { evdwl *= r4inv*(lj3[itype][jtype]*r4inv*r4inv - lj4[itype][jtype]) - offset[itype][jtype]; } } else if (cgt == CG_LJ9_6) { const double r3inv = r2inv*sqrt(r2inv); const double r6inv = r3inv*r3inv; forcelj *= r6inv*(lj1[itype][jtype]*r3inv - lj2[itype][jtype]); if (eflag) { evdwl *= r6inv*(lj3[itype][jtype]*r3inv - lj4[itype][jtype]) - offset[itype][jtype]; } } else { const double r6inv = r2inv*r2inv*r2inv; forcelj *= r6inv*(lj1[itype][jtype]*r6inv - lj2[itype][jtype]); if (eflag) { evdwl *= r6inv*(lj3[itype][jtype]*r6inv - lj4[itype][jtype]) - offset[itype][jtype]; } } } if (rsq < cut_coulsq_global) { const double ir = 1.0/sqrt(rsq); const double prefactor = qqrd2e * qtmp*q[j]; const double r2_ia2 = rsq*_ia2; const double r4_ia4 = r2_ia2*r2_ia2; if (_smooth==C3) { forcecoul = prefactor*(_ia3*(-4.375+5.25*r2_ia2-1.875*r4_ia4)- ir/rsq); if (eflag) ecoul = prefactor*(ir+_ia*(2.1875-2.1875*r2_ia2+ 1.3125*r4_ia4-0.3125*r2_ia2*r4_ia4)); } else { const double r6_ia6 = r2_ia2*r4_ia4; forcecoul = prefactor*(_ia3*(-6.5625+11.8125*r2_ia2-8.4375*r4_ia4+ 2.1875*r6_ia6)-ir/rsq); if (eflag) ecoul = prefactor*(ir+_ia*(2.4609375-3.28125*r2_ia2+ 2.953125*r4_ia4-1.40625*r6_ia6+ 0.2734375*r4_ia4*r4_ia4)); } if (factor_coul < 1.0) { forcecoul -= (1.0-factor_coul)*prefactor*ir; if (eflag) ecoul -= (1.0-factor_coul)*prefactor*ir; } } fpair = forcecoul + forcelj * r2inv; f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (newton_pair || j < nlocal) { f[j][0] -= delx*fpair; f[j][1] -= dely*fpair; f[j][2] -= delz*fpair; } if (evflag) ev_tally(i,j,nlocal,newton_pair, evdwl,ecoul,fpair,delx,dely,delz); } } } if (vflag_fdotr) virial_fdotr_compute(); } /* ---------------------------------------------------------------------- */ void PairCGCMMCoulMSM::write_restart(FILE *fp) { write_restart_settings(fp); PairCMMCommon::write_restart(fp); } /* ---------------------------------------------------------------------- */ void PairCGCMMCoulMSM::read_restart(FILE *fp) { read_restart_settings(fp); allocate(); PairCMMCommon::read_restart(fp); } /* ---------------------------------------------------------------------- */ double PairCGCMMCoulMSM::memory_usage() { double bytes=PairCMMCommon::memory_usage(); int n = atom->ntypes; // cut_coul/cut_coulsq/cut_ljsq bytes += (n+1)*(n+1)*sizeof(double)*4; return bytes; } /* ---------------------------------------------------------------------- */ void *PairCGCMMCoulMSM::extract(char *str) { if (strcmp(str,"cut_coul") == 0) return (void *) &cut_coul_global; return NULL; } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp index ffccd8516..2cb80a34d 100644 --- a/src/GPU/pair_eam_alloy_gpu.cpp +++ b/src/GPU/pair_eam_alloy_gpu.cpp @@ -1,323 +1,323 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) + Contributing authors: Trung Dac Nguyen (ORNL), W. Michael Brown (ORNL) ------------------------------------------------------------------------- */ #include "stdio.h" #include "stdlib.h" #include "string.h" #include "pair_eam_alloy_gpu.h" #include "atom.h" #include "comm.h" #include "memory.h" #include "error.h" using namespace LAMMPS_NS; #define MAXLINE 1024 /* ---------------------------------------------------------------------- */ PairEAMAlloyGPU::PairEAMAlloyGPU(LAMMPS *lmp) : PairEAMGPU(lmp) { one_coeff = 1; } /* ---------------------------------------------------------------------- set coeffs for one or more type pairs read DYNAMO setfl file ------------------------------------------------------------------------- */ void PairEAMAlloyGPU::coeff(int narg, char **arg) { int i,j; if (!allocated) allocate(); if (narg != 3 + atom->ntypes) error->all(FLERR,"Incorrect args for pair coefficients"); // insure I,J args are * * if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) error->all(FLERR,"Incorrect args for pair coefficients"); // read EAM setfl file if (setfl) { for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i]; delete [] setfl->elements; delete [] setfl->mass; memory->destroy(setfl->frho); memory->destroy(setfl->rhor); memory->destroy(setfl->z2r); delete setfl; } setfl = new Setfl(); read_file(arg[2]); // read args that map atom types to elements in potential file // map[i] = which element the Ith atom type is, -1 if NULL for (i = 3; i < narg; i++) { if (strcmp(arg[i],"NULL") == 0) { map[i-2] = -1; continue; } for (j = 0; j < setfl->nelements; j++) if (strcmp(arg[i],setfl->elements[j]) == 0) break; if (j < setfl->nelements) map[i-2] = j; else error->all(FLERR,"No matching element in EAM potential file"); } // clear setflag since coeff() called once with I,J = * * int n = atom->ntypes; for (i = 1; i <= n; i++) for (j = i; j <= n; j++) setflag[i][j] = 0; // set setflag i,j for type pairs where both are mapped to elements // set mass of atom type if i = j int count = 0; for (i = 1; i <= n; i++) { for (j = i; j <= n; j++) { if (map[i] >= 0 && map[j] >= 0) { setflag[i][j] = 1; if (i == j) atom->set_mass(i,setfl->mass[map[i]]); count++; } } } if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); } /* ---------------------------------------------------------------------- read a multi-element DYNAMO setfl file ------------------------------------------------------------------------- */ void PairEAMAlloyGPU::read_file(char *filename) { Setfl *file = setfl; // open potential file int me = comm->me; FILE *fptr; char line[MAXLINE]; if (me == 0) { fptr = fopen(filename,"r"); if (fptr == NULL) { char str[128]; sprintf(str,"Cannot open EAM potential file %s",filename); error->one(FLERR,str); } } // read and broadcast header // extract element names from nelements line int n; if (me == 0) { fgets(line,MAXLINE,fptr); fgets(line,MAXLINE,fptr); fgets(line,MAXLINE,fptr); fgets(line,MAXLINE,fptr); n = strlen(line) + 1; } MPI_Bcast(&n,1,MPI_INT,0,world); MPI_Bcast(line,n,MPI_CHAR,0,world); sscanf(line,"%d",&file->nelements); int nwords = atom->count_words(line); if (nwords != file->nelements + 1) error->all(FLERR,"Incorrect element names in EAM potential file"); char **words = new char*[file->nelements+1]; nwords = 0; strtok(line," \t\n\r\f"); while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue; file->elements = new char*[file->nelements]; for (int i = 0; i < file->nelements; i++) { n = strlen(words[i]) + 1; file->elements[i] = new char[n]; strcpy(file->elements[i],words[i]); } delete [] words; if (me == 0) { fgets(line,MAXLINE,fptr); sscanf(line,"%d %lg %d %lg %lg", &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); } MPI_Bcast(&file->nrho,1,MPI_INT,0,world); MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); MPI_Bcast(&file->nr,1,MPI_INT,0,world); MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); file->mass = new double[file->nelements]; memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho"); memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor"); memory->create(file->z2r,file->nelements,file->nelements,file->nr+1, "pair:z2r"); int i,j,tmp; for (i = 0; i < file->nelements; i++) { if (me == 0) { fgets(line,MAXLINE,fptr); sscanf(line,"%d %lg",&tmp,&file->mass[i]); } MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]); MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world); } for (i = 0; i < file->nelements; i++) for (j = 0; j <= i; j++) { if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); } // close the potential file if (me == 0) fclose(fptr); } /* ---------------------------------------------------------------------- copy read-in setfl potential to standard array format ------------------------------------------------------------------------- */ void PairEAMAlloyGPU::file2array() { int i,j,m,n; int ntypes = atom->ntypes; // set function params directly from setfl file nrho = setfl->nrho; nr = setfl->nr; drho = setfl->drho; dr = setfl->dr; // ------------------------------------------------------------------ // setup frho arrays // ------------------------------------------------------------------ // allocate frho arrays // nfrho = # of setfl elements + 1 for zero array nfrho = setfl->nelements + 1; memory->destroy(frho); memory->create(frho,nfrho,nrho+1,"pair:frho"); // copy each element's frho to global frho for (i = 0; i < setfl->nelements; i++) for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m]; // add extra frho of zeroes for non-EAM types to point to (pair hybrid) // this is necessary b/c fp is still computed for non-EAM atoms for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to // if atom type doesn't point to element (non-EAM atom in pair hybrid) // then map it to last frho array of zeroes for (i = 1; i <= ntypes; i++) if (map[i] >= 0) type2frho[i] = map[i]; else type2frho[i] = nfrho-1; // ------------------------------------------------------------------ // setup rhor arrays // ------------------------------------------------------------------ // allocate rhor arrays // nrhor = # of setfl elements nrhor = setfl->nelements; memory->destroy(rhor); memory->create(rhor,nrhor,nr+1,"pair:rhor"); // copy each element's rhor to global rhor for (i = 0; i < setfl->nelements; i++) for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m]; // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to // for setfl files, I,J mapping only depends on I // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used for (i = 1; i <= ntypes; i++) for (j = 1; j <= ntypes; j++) type2rhor[i][j] = map[i]; // ------------------------------------------------------------------ // setup z2r arrays // ------------------------------------------------------------------ // allocate z2r arrays // nz2r = N*(N+1)/2 where N = # of setfl elements nz2r = setfl->nelements * (setfl->nelements+1) / 2; memory->destroy(z2r); memory->create(z2r,nz2r,nr+1,"pair:z2r"); // copy each element pair z2r to global z2r, only for I >= J n = 0; for (i = 0; i < setfl->nelements; i++) for (j = 0; j <= i; j++) { for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m]; n++; } // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to // set of z2r arrays only fill lower triangular Nelement matrix // value = n = sum over rows of lower-triangular matrix until reach irow,icol // swap indices when irow < icol to stay lower triangular // if map = -1 (non-EAM atom in pair hybrid): // type2z2r is not used by non-opt // but set type2z2r to 0 since accessed by opt int irow,icol; for (i = 1; i <= ntypes; i++) { for (j = 1; j <= ntypes; j++) { irow = map[i]; icol = map[j]; if (irow == -1 || icol == -1) { type2z2r[i][j] = 0; continue; } if (irow < icol) { irow = map[j]; icol = map[i]; } n = 0; for (m = 0; m < irow; m++) n += m + 1; n += icol; type2z2r[i][j] = n; } } } diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp index 382302a8a..1b52920e3 100644 --- a/src/GPU/pair_eam_fs_gpu.cpp +++ b/src/GPU/pair_eam_fs_gpu.cpp @@ -1,332 +1,332 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) + Contributing authors: Trung Dac Nguyen (ORNL), W. Michael Brown (ORNL) ------------------------------------------------------------------------- */ #include "stdio.h" #include "stdlib.h" #include "string.h" #include "pair_eam_fs_gpu.h" #include "atom.h" #include "comm.h" #include "memory.h" #include "error.h" using namespace LAMMPS_NS; #define MAXLINE 1024 /* ---------------------------------------------------------------------- */ PairEAMFSGPU::PairEAMFSGPU(LAMMPS *lmp) : PairEAMGPU(lmp) { one_coeff = 1; } /* ---------------------------------------------------------------------- set coeffs for one or more type pairs read EAM Finnis-Sinclair file ------------------------------------------------------------------------- */ void PairEAMFSGPU::coeff(int narg, char **arg) { int i,j; if (!allocated) allocate(); if (narg != 3 + atom->ntypes) error->all(FLERR,"Incorrect args for pair coefficients"); // insure I,J args are * * if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) error->all(FLERR,"Incorrect args for pair coefficients"); // read EAM Finnis-Sinclair file if (fs) { for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i]; delete [] fs->elements; delete [] fs->mass; memory->destroy(fs->frho); memory->destroy(fs->rhor); memory->destroy(fs->z2r); delete fs; } fs = new Fs(); read_file(arg[2]); // read args that map atom types to elements in potential file // map[i] = which element the Ith atom type is, -1 if NULL for (i = 3; i < narg; i++) { if (strcmp(arg[i],"NULL") == 0) { map[i-2] = -1; continue; } for (j = 0; j < fs->nelements; j++) if (strcmp(arg[i],fs->elements[j]) == 0) break; if (j < fs->nelements) map[i-2] = j; else error->all(FLERR,"No matching element in EAM potential file"); } // clear setflag since coeff() called once with I,J = * * int n = atom->ntypes; for (i = 1; i <= n; i++) for (j = i; j <= n; j++) setflag[i][j] = 0; // set setflag i,j for type pairs where both are mapped to elements // set mass of atom type if i = j int count = 0; for (i = 1; i <= n; i++) { for (j = i; j <= n; j++) { if (map[i] >= 0 && map[j] >= 0) { setflag[i][j] = 1; if (i == j) atom->set_mass(i,fs->mass[map[i]]); count++; } } } if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); } /* ---------------------------------------------------------------------- read a multi-element DYNAMO setfl file ------------------------------------------------------------------------- */ void PairEAMFSGPU::read_file(char *filename) { Fs *file = fs; // open potential file int me = comm->me; FILE *fptr; char line[MAXLINE]; if (me == 0) { fptr = fopen(filename,"r"); if (fptr == NULL) { char str[128]; sprintf(str,"Cannot open EAM potential file %s",filename); error->one(FLERR,str); } } // read and broadcast header // extract element names from nelements line int n; if (me == 0) { fgets(line,MAXLINE,fptr); fgets(line,MAXLINE,fptr); fgets(line,MAXLINE,fptr); fgets(line,MAXLINE,fptr); n = strlen(line) + 1; } MPI_Bcast(&n,1,MPI_INT,0,world); MPI_Bcast(line,n,MPI_CHAR,0,world); sscanf(line,"%d",&file->nelements); int nwords = atom->count_words(line); if (nwords != file->nelements + 1) error->all(FLERR,"Incorrect element names in EAM potential file"); char **words = new char*[file->nelements+1]; nwords = 0; strtok(line," \t\n\r\f"); while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue; file->elements = new char*[file->nelements]; for (int i = 0; i < file->nelements; i++) { n = strlen(words[i]) + 1; file->elements[i] = new char[n]; strcpy(file->elements[i],words[i]); } delete [] words; if (me == 0) { fgets(line,MAXLINE,fptr); sscanf(line,"%d %lg %d %lg %lg", &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); } MPI_Bcast(&file->nrho,1,MPI_INT,0,world); MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); MPI_Bcast(&file->nr,1,MPI_INT,0,world); MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); file->mass = new double[file->nelements]; memory->create(file->frho,file->nelements,file->nrho+1, "pair:frho"); memory->create(file->rhor,file->nelements,file->nelements, file->nr+1,"pair:rhor"); memory->create(file->z2r,file->nelements,file->nelements, file->nr+1,"pair:z2r"); int i,j,tmp; for (i = 0; i < file->nelements; i++) { if (me == 0) { fgets(line,MAXLINE,fptr); sscanf(line,"%d %lg",&tmp,&file->mass[i]); } MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); for (j = 0; j < file->nelements; j++) { if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]); MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world); } } for (i = 0; i < file->nelements; i++) for (j = 0; j <= i; j++) { if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); } // close the potential file if (me == 0) fclose(fptr); } /* ---------------------------------------------------------------------- copy read-in setfl potential to standard array format ------------------------------------------------------------------------- */ void PairEAMFSGPU::file2array() { int i,j,m,n; int ntypes = atom->ntypes; // set function params directly from fs file nrho = fs->nrho; nr = fs->nr; drho = fs->drho; dr = fs->dr; // ------------------------------------------------------------------ // setup frho arrays // ------------------------------------------------------------------ // allocate frho arrays // nfrho = # of fs elements + 1 for zero array nfrho = fs->nelements + 1; memory->destroy(frho); memory->create(frho,nfrho,nrho+1,"pair:frho"); // copy each element's frho to global frho for (i = 0; i < fs->nelements; i++) for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m]; // add extra frho of zeroes for non-EAM types to point to (pair hybrid) // this is necessary b/c fp is still computed for non-EAM atoms for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to // if atom type doesn't point to element (non-EAM atom in pair hybrid) // then map it to last frho array of zeroes for (i = 1; i <= ntypes; i++) if (map[i] >= 0) type2frho[i] = map[i]; else type2frho[i] = nfrho-1; // ------------------------------------------------------------------ // setup rhor arrays // ------------------------------------------------------------------ // allocate rhor arrays // nrhor = square of # of fs elements nrhor = fs->nelements * fs->nelements; memory->destroy(rhor); memory->create(rhor,nrhor,nr+1,"pair:rhor"); // copy each element pair rhor to global rhor n = 0; for (i = 0; i < fs->nelements; i++) for (j = 0; j < fs->nelements; j++) { for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m]; n++; } // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to // for fs files, there is a full NxN set of rhor arrays // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used for (i = 1; i <= ntypes; i++) for (j = 1; j <= ntypes; j++) type2rhor[i][j] = map[i] * fs->nelements + map[j]; // ------------------------------------------------------------------ // setup z2r arrays // ------------------------------------------------------------------ // allocate z2r arrays // nz2r = N*(N+1)/2 where N = # of fs elements nz2r = fs->nelements * (fs->nelements+1) / 2; memory->destroy(z2r); memory->create(z2r,nz2r,nr+1,"pair:z2r"); // copy each element pair z2r to global z2r, only for I >= J n = 0; for (i = 0; i < fs->nelements; i++) for (j = 0; j <= i; j++) { for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m]; n++; } // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to // set of z2r arrays only fill lower triangular Nelement matrix // value = n = sum over rows of lower-triangular matrix until reach irow,icol // swap indices when irow < icol to stay lower triangular // if map = -1 (non-EAM atom in pair hybrid): // type2z2r is not used by non-opt // but set type2z2r to 0 since accessed by opt int irow,icol; for (i = 1; i <= ntypes; i++) { for (j = 1; j <= ntypes; j++) { irow = map[i]; icol = map[j]; if (irow == -1 || icol == -1) { type2z2r[i][j] = 0; continue; } if (irow < icol) { irow = map[j]; icol = map[i]; } n = 0; for (m = 0; m < irow; m++) n += m + 1; n += icol; type2z2r[i][j] = n; } } } diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index ff03a0290..68a9e9c3c 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -1,421 +1,243 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) + Contributing authors: Trung Dac Nguyen (ORNL), W. Michael Brown (ORNL) ------------------------------------------------------------------------- */ #include "math.h" #include "stdio.h" #include "stdlib.h" #include "string.h" #include "pair_eam_gpu.h" #include "atom.h" #include "force.h" #include "comm.h" #include "domain.h" #include "neighbor.h" #include "neigh_list.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "gpu_extra.h" using namespace LAMMPS_NS; #define MAXLINE 1024 // External functions from cuda library for atom decomposition int eam_gpu_init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, - int *host_type2frho, - double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, - double rdr, double rdrho, int nrhor, int nrho, - int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen, int &fp_size); + int **host_type2rhor, int **host_type2z2r, + int *host_type2frho, double ***host_rhor_spline, + double ***host_z2r_spline, double ***host_frho_spline, + double rdr, double rdrho, int nrhor, int nrho, int nz2r, + int nfrho, int nr, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + int &fp_size); void eam_gpu_clear(); -int** eam_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, int &inum, void **fp_ptr); +int** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + int &inum, void **fp_ptr); void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall,double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, void **fp_ptr); + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, void **fp_ptr); void eam_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, - const bool eatom, const bool vatom); - + const bool eatom, const bool vatom); double eam_gpu_bytes(); /* ---------------------------------------------------------------------- */ PairEAMGPU::PairEAMGPU(LAMMPS *lmp) : PairEAM(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- check if allocated, since class can be destructed when incomplete ------------------------------------------------------------------------- */ PairEAMGPU::~PairEAMGPU() { eam_gpu_clear(); } /* ---------------------------------------------------------------------- */ double PairEAMGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + eam_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairEAMGPU::compute(int eflag, int vflag) { int i,j,ii,jj,m,jnum,itype,jtype; double evdwl,*coeff; evdwl = 0.0; if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = eflag_global = eflag_atom = 0; int nlocal = atom->nlocal; int newton_pair = force->newton_pair; // compute density on each atom on GPU int nall = atom->nlocal + atom->nghost; int inum, host_start, inum_dev; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; - firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, - atom->x,atom->type, domain->sublo, domain->subhi, - atom->tag, atom->nspecial, atom->special, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, inum_dev, &fp_pinned); + firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, inum_dev, &fp_pinned); } else { // gpu_mode == GPU_FORCE inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; eam_gpu_compute(neighbor->ago, inum, nlocal, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, &fp_pinned); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); - if (host_start<inum) { - cpu_time = MPI_Wtime(); - cpu_compute_energy(host_start, inum, eflag, vflag, - ilist, numneigh, firstneigh); - cpu_time = MPI_Wtime() - cpu_time; - } - // communicate derivative of embedding function comm->forward_comm_pair(this); // compute forces on each atom on GPU if (gpu_mode != GPU_FORCE) eam_gpu_compute_force(NULL, eflag, vflag, eflag_atom, vflag_atom); else eam_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom); - - if (host_start<inum) { - double cpu_time2 = MPI_Wtime(); - cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); - cpu_time += MPI_Wtime() - cpu_time2; - } - -} - -void PairEAMGPU::cpu_compute_energy(int start, int inum, int eflag, int vflag, - int *ilist, int *numneigh, - int **firstneigh) -{ - int i,j,ii,jj,m,jnum,itype,jtype; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r,p,rhoip,rhojp,phi; - double *coeff; - int *jlist; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - int nlocal = atom->nlocal; - int newton_pair = force->newton_pair; - - // rho = density at each atom - // loop over neighbors of my atoms - - for (ii = start; ii < inum; ii++) { - i = ilist[ii]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - jlist = firstneigh[i]; - jnum = numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - j &= NEIGHMASK; - - delx = xtmp - x[j][0]; - dely = ytmp - x[j][1]; - delz = ztmp - x[j][2]; - rsq = delx*delx + dely*dely + delz*delz; - - if (rsq < cutforcesq) { - jtype = type[j]; - p = sqrt(rsq)*rdr + 1.0; - m = static_cast<int> (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,1.0); - coeff = rhor_spline[type2rhor[jtype][itype]][m]; - rho[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; - } - } - } - - // communicate and sum densities - - if (newton_pair) comm->reverse_comm_pair(this); - - - // fp = derivative of embedding energy at each atom - // phi = embedding energy at each atom - - for (ii = start; ii < inum; ii++) { - i = ilist[ii]; - p = rho[i]*rdrho + 1.0; - m = static_cast<int> (p); - m = MAX(1,MIN(m,nrho-1)); - p -= m; - p = MIN(p,1.0); - - coeff = frho_spline[type2frho[type[i]]][m]; - fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; - if (eflag) { - phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; - if (eflag_global) eng_vdwl += phi; - if (eflag_atom) eatom[i] += phi; - - } - } -} - -/* ---------------------------------------------------------------------- */ - -void PairEAMGPU::cpu_compute(int start, int inum, int eflag, int vflag, - int *ilist, int *numneigh, - int **firstneigh) -{ - int i,j,ii,jj,m,jnum,itype,jtype; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi; - double *coeff; - int *jlist; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - int nlocal = atom->nlocal; - int newton_pair = force->newton_pair; - - // compute forces on each atom - // loop over neighbors of my atoms - - for (ii = start; ii < inum; ii++) { - i = ilist[ii]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - - jlist = firstneigh[i]; - jnum = numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - j &= NEIGHMASK; - - delx = xtmp - x[j][0]; - dely = ytmp - x[j][1]; - delz = ztmp - x[j][2]; - rsq = delx*delx + dely*dely + delz*delz; - - if (rsq < cutforcesq) { - jtype = type[j]; - r = sqrt(rsq); - p = r*rdr + 1.0; - m = static_cast<int> (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,1.0); - - // rhoip = derivative of (density at atom j due to atom i) - // rhojp = derivative of (density at atom i due to atom j) - // phi = pair potential energy - // phip = phi' - // z2 = phi * r - // z2p = (phi * r)' = (phi' r) + phi - // psip needs both fp[i] and fp[j] terms since r_ij appears in two - // terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji) - // hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip - - coeff = rhor_spline[type2rhor[itype][jtype]][m]; - rhoip = (coeff[0]*p + coeff[1])*p + coeff[2]; - coeff = rhor_spline[type2rhor[jtype][itype]][m]; - rhojp = (coeff[0]*p + coeff[1])*p + coeff[2]; - coeff = z2r_spline[type2z2r[itype][jtype]][m]; - z2p = (coeff[0]*p + coeff[1])*p + coeff[2]; - z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; - - recip = 1.0/r; - phi = z2*recip; - phip = z2p*recip - phi*recip; - psip = fp[i]*rhojp + fp[j]*rhoip + phip; - fpair = -psip*recip; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (eflag) evdwl = phi; - if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); - } - } - } - } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairEAMGPU::init_style() { if (force->newton_pair) error->all(FLERR,"Cannot use newton pair with eam/gpu pair style"); if (!allocated) error->all(FLERR,"Not allocate memory eam/gpu pair style"); // convert read-in file(s) to arrays and spline them file2array(); array2spline(); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; - int success = eam_gpu_init(atom->ntypes+1, cutforcesq, - type2rhor, type2z2r, type2frho, - rhor_spline, z2r_spline, frho_spline, - rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, fp_size); + int success = eam_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, + type2frho, rhor_spline, z2r_spline, frho_spline, + rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, + atom->nlocal, atom->nlocal+atom->nghost, 300, + maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } if (fp_size == sizeof(double)) fp_single = false; else fp_single = true; } /* ---------------------------------------------------------------------- */ int PairEAMGPU::pack_comm(int n, int *list, double *buf, int pbc_flag, int *pbc) { int i,j,m; m = 0; if (fp_single) { float *fp_ptr = (float *)fp_pinned; for (i = 0; i < n; i++) { j = list[i]; buf[m++] = static_cast<double>(fp_ptr[j]); } } else { double *fp_ptr = (double *)fp_pinned; for (i = 0; i < n; i++) { j = list[i]; buf[m++] = fp_ptr[j]; } } return 1; } /* ---------------------------------------------------------------------- */ void PairEAMGPU::unpack_comm(int n, int first, double *buf) { int i,m,last; m = 0; last = first + n; if (fp_single) { float *fp_ptr = (float *)fp_pinned; for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; } else { double *fp_ptr = (double *)fp_pinned; for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; } } diff --git a/src/GPU/pair_eam_gpu.h b/src/GPU/pair_eam_gpu.h index 7cdecb010..f7e41be91 100644 --- a/src/GPU/pair_eam_gpu.h +++ b/src/GPU/pair_eam_gpu.h @@ -1,55 +1,53 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #ifdef PAIR_CLASS PairStyle(eam/gpu,PairEAMGPU) #else #ifndef LMP_PAIR_EAM_GPU_H #define LMP_PAIR_EAM_GPU_H #include "stdio.h" #include "pair_eam.h" namespace LAMMPS_NS { class PairEAMGPU : public PairEAM { public: PairEAMGPU(class LAMMPS *); virtual ~PairEAMGPU(); - void cpu_compute(int, int, int, int, int *, int *, int **); - void cpu_compute_energy(int, int, int, int, int *, int *, int **); void compute(int, int); void init_style(); double memory_usage(); int pack_comm(int, int *, double *, int, int *); void unpack_comm(int, int, double *); enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - private: + protected: int gpu_mode; double cpu_time; int *gpulist; void *fp_pinned; bool fp_single; }; } #endif #endif diff --git a/src/GPU/pair_eam_lj_gpu.cpp b/src/GPU/pair_eam_lj_gpu.cpp index 992a0d494..02bfde93e 100755 --- a/src/GPU/pair_eam_lj_gpu.cpp +++ b/src/GPU/pair_eam_lj_gpu.cpp @@ -1,655 +1,656 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) ------------------------------------------------------------------------- */ #include "math.h" #include "stdio.h" #include "stdlib.h" #include "string.h" #include "pair_eam_lj_gpu.h" #include "atom.h" #include "force.h" #include "comm.h" #include "domain.h" #include "neighbor.h" #include "neigh_list.h" #include "memory.h" #include "math_const.h" #include "error.h" #include "neigh_request.h" #include "gpu_extra.h" using namespace LAMMPS_NS; using namespace MathConst; // External functions from cuda library for atom decomposition int eam_lj_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double **host_cutljsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size); void eam_lj_gpu_clear(); int** eam_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr); void eam_lj_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); void eam_lj_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); double eam_lj_gpu_bytes(); /* ---------------------------------------------------------------------- */ PairEAMLJGPU::PairEAMLJGPU(LAMMPS *lmp) : PairEAM(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- check if allocated, since class can be destructed when incomplete ------------------------------------------------------------------------- */ PairEAMLJGPU::~PairEAMLJGPU() { eam_lj_gpu_clear(); memory->destroy(cut); memory->destroy(epsilon); memory->destroy(sigma); memory->destroy(lj1); memory->destroy(lj2); memory->destroy(lj3); memory->destroy(lj4); memory->destroy(offset); } /* ---------------------------------------------------------------------- */ double PairEAMLJGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + eam_lj_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairEAMLJGPU::compute(int eflag, int vflag) { int i,j,ii,jj,m,jnum,itype,jtype; double evdwl,*coeff; evdwl = 0.0; if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = eflag_global = eflag_atom = 0; int nlocal = atom->nlocal; int newton_pair = force->newton_pair; // compute density on each atom on GPU int nall = atom->nlocal + atom->nghost; int inum, host_start, inum_dev; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; firstneigh = eam_lj_gpu_compute_n(neighbor->ago, inum, nall, atom->x,atom->type, domain->sublo, domain->subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, inum_dev, &fp_pinned); } else { // gpu_mode == GPU_FORCE inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; eam_lj_gpu_compute(neighbor->ago, inum, nlocal, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, &fp_pinned); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); if (host_start<inum) { cpu_time = MPI_Wtime(); cpu_compute_energy(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } // communicate derivative of embedding function comm->forward_comm_pair(this); // compute forces on each atom on GPU if (gpu_mode != GPU_FORCE) eam_lj_gpu_compute_force(NULL, eflag, vflag, eflag_atom, vflag_atom); else eam_lj_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom); if (host_start<inum) { double cpu_time2 = MPI_Wtime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time += MPI_Wtime() - cpu_time2; } } void PairEAMLJGPU::cpu_compute_energy(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,m,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r,p,rhoip,rhojp,phi; double *coeff; int *jlist; double **x = atom->x; double **f = atom->f; int *type = atom->type; int nlocal = atom->nlocal; int newton_pair = force->newton_pair; // rho = density at each atom // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; if (rsq < cutforcesq) { jtype = type[j]; p = sqrt(rsq)*rdr + 1.0; m = static_cast<int> (p); m = MIN(m,nr-1); p -= m; p = MIN(p,1.0); coeff = rhor_spline[type2rhor[jtype][itype]][m]; rho[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; } } } // communicate and sum densities if (newton_pair) comm->reverse_comm_pair(this); // fp = derivative of embedding energy at each atom // phi = embedding energy at each atom for (ii = start; ii < inum; ii++) { i = ilist[ii]; p = rho[i]*rdrho + 1.0; m = static_cast<int> (p); m = MAX(1,MIN(m,nrho-1)); p -= m; p = MIN(p,1.0); coeff = frho_spline[type2frho[type[i]]][m]; fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; if (eflag) { phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; if (eflag_global) eng_vdwl += phi; if (eflag_atom) eatom[i] += phi; } } } /* ---------------------------------------------------------------------- */ void PairEAMLJGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,m,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi; double r2inv,r6inv,factor_lj,force_eam,force_lj; double *coeff; int *jlist; double **x = atom->x; double **f = atom->f; int *type = atom->type; int nlocal = atom->nlocal; double *special_lj = force->special_lj; // compute forces on each atom // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; if (rsq < cutsq[itype][jtype]) { jtype = type[j]; if (rsq < cutforcesq && (itype ==2 && jtype ==2)) { r = sqrt(rsq); p = r*rdr + 1.0; m = static_cast<int> (p); m = MIN(m,nr-1); p -= m; p = MIN(p,1.0); // rhoip = derivative of (density at atom j due to atom i) // rhojp = derivative of (density at atom i due to atom j) // phi = pair potential energy // phip = phi' // z2 = phi * r // z2p = (phi * r)' = (phi' r) + phi // psip needs both fp[i] and fp[j] terms since r_ij appears in two // terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji) // hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip coeff = rhor_spline[type2rhor[itype][jtype]][m]; rhoip = (coeff[0]*p + coeff[1])*p + coeff[2]; coeff = rhor_spline[type2rhor[jtype][itype]][m]; rhojp = (coeff[0]*p + coeff[1])*p + coeff[2]; coeff = z2r_spline[type2z2r[itype][jtype]][m]; z2p = (coeff[0]*p + coeff[1])*p + coeff[2]; z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; recip = 1.0/r; phi = z2*recip; phip = z2p*recip - phi*recip; psip = fp[i]*rhojp + fp[j]*rhoip + phip; force_eam = -psip*recip; } else { force_eam=0.0; phi = 0.0; } if (rsq < cutsq[itype][jtype] && (itype !=2 || jtype !=2)) { r2inv = 1.0/rsq; r6inv = r2inv*r2inv*r2inv; force_lj = factor_lj*r2inv*r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); } else force_lj=0.0; fpair = force_eam + force_lj; f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) { evdwl = phi; if (rsq < cutsq[itype][jtype] && (itype !=2 || jtype !=2)) { double e = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - offset[itype][jtype]; evdwl += factor_lj*e; } } if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); } } } } /* ---------------------------------------------------------------------- allocate all arrays ------------------------------------------------------------------------- */ void PairEAMLJGPU::allocate() { allocated = 1; int n = atom->ntypes; memory->create(setflag,n+1,n+1,"pair:setflag"); for (int i = 1; i <= n; i++) for (int j = i; j <= n; j++) setflag[i][j] = 0; memory->create(cutsq,n+1,n+1,"pair:cutsq"); map = new int[n+1]; for (int i = 1; i <= n; i++) map[i] = -1; type2frho = new int[n+1]; memory->create(type2rhor,n+1,n+1,"pair:type2rhor"); memory->create(type2z2r,n+1,n+1,"pair:type2z2r"); // LJ memory->create(cut,n+1,n+1,"pair:cut"); memory->create(epsilon,n+1,n+1,"pair:epsilon"); memory->create(sigma,n+1,n+1,"pair:sigma"); memory->create(lj1,n+1,n+1,"pair:lj1"); memory->create(lj2,n+1,n+1,"pair:lj2"); memory->create(lj3,n+1,n+1,"pair:lj3"); memory->create(lj4,n+1,n+1,"pair:lj4"); memory->create(offset,n+1,n+1,"pair:offset"); } /* ---------------------------------------------------------------------- global settings ------------------------------------------------------------------------- */ void PairEAMLJGPU::settings(int narg, char **arg) { if (narg != 1) error->all(FLERR,"Illegal pair_style eam/lj/gpu command"); cut_global = force->numeric(arg[0]); // reset cutoffs that have been explicitly set if (allocated) { int i,j; for (i = 1; i <= atom->ntypes; i++) for (j = i+1; j <= atom->ntypes; j++) if (setflag[i][j]) cut[i][j] = cut_global; } } /* ---------------------------------------------------------------------- set coeffs for one or more type pairs read DYNAMO funcfl file ------------------------------------------------------------------------- */ void PairEAMLJGPU::coeff(int narg, char **arg) { if (!allocated) allocate(); if (narg > 5) error->all(FLERR,"Incorrect args for pair coefficients"); // parse pair of atom types int ilo,ihi,jlo,jhi; force->bounds(arg[0],atom->ntypes,ilo,ihi); force->bounds(arg[1],atom->ntypes,jlo,jhi); int count = 0; if (narg == 3) { // eam // read funcfl file if hasn't already been read // store filename in Funcfl data struct int ifuncfl; for (ifuncfl = 0; ifuncfl < nfuncfl; ifuncfl++) if (strcmp(arg[2],funcfl[ifuncfl].file) == 0) break; if (ifuncfl == nfuncfl) { nfuncfl++; funcfl = (Funcfl *) memory->srealloc(funcfl,nfuncfl*sizeof(Funcfl),"pair:funcfl"); read_file(arg[2]); int n = strlen(arg[2]) + 1; funcfl[ifuncfl].file = new char[n]; strcpy(funcfl[ifuncfl].file,arg[2]); } // set setflag and map only for i,i type pairs // set mass of atom type if i = j for (int i = ilo; i <= ihi; i++) { for (int j = MAX(jlo,i); j <= jhi; j++) { if (i == j) { setflag[i][i] = 1; map[i] = ifuncfl; atom->set_mass(i,funcfl[ifuncfl].mass); count++; } } } } else if (narg >= 4 || narg <= 5) { // LJ double epsilon_one = force->numeric(arg[2]); double sigma_one = force->numeric(arg[3]); double cut_one = cut_global; if (narg == 5) cut_one = force->numeric(arg[4]); for (int i = ilo; i <= ihi; i++) { for (int j = MAX(jlo,i); j <= jhi; j++) { epsilon[i][j] = epsilon_one; sigma[i][j] = sigma_one; cut[i][j] = cut_one; setflag[i][j] = 1; count++; } } } if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairEAMLJGPU::init_style() { if (force->newton_pair) error->all(FLERR,"Cannot use newton pair with eam/lj/gpu pair style"); if (!allocated) error->all(FLERR,"Not allocate memory eam/lj/gpu pair style"); // convert read-in file(s) to arrays and spline them file2array(); array2spline(); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; int success = eam_lj_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } if (fp_size == sizeof(double)) fp_single = false; else fp_single = true; } /* ---------------------------------------------------------------------- init for one type pair i,j and corresponding j,i ------------------------------------------------------------------------- */ double PairEAMLJGPU::init_one(int i, int j) { if (setflag[i][j] == 0) { epsilon[i][j] = mix_energy(epsilon[i][i],epsilon[j][j], sigma[i][i],sigma[j][j]); sigma[i][j] = mix_distance(sigma[i][i],sigma[j][j]); cut[i][j] = mix_distance(cut[i][i],cut[j][j]); } if (i==2 && j==2) { // single global cutoff = max of cut from all files read in // for funcfl could be multiple files // for setfl or fs, just one file if (funcfl) { cutmax = 0.0; for (int m = 0; m < nfuncfl; m++) cutmax = MAX(cutmax,funcfl[m].cut); } else if (setfl) cutmax = setfl->cut; else if (fs) cutmax = fs->cut; cutforcesq = cutmax*cutmax; cut[i][j] = cutforcesq; return cutmax; } lj1[i][j] = 48.0 * epsilon[i][j] * pow(sigma[i][j],12.0); lj2[i][j] = 24.0 * epsilon[i][j] * pow(sigma[i][j],6.0); lj3[i][j] = 4.0 * epsilon[i][j] * pow(sigma[i][j],12.0); lj4[i][j] = 4.0 * epsilon[i][j] * pow(sigma[i][j],6.0); if (offset_flag) { double ratio = sigma[i][j] / cut[i][j]; offset[i][j] = 4.0 * epsilon[i][j] * (pow(ratio,12.0) - pow(ratio,6.0)); } else offset[i][j] = 0.0; lj1[j][i] = lj1[i][j]; lj2[j][i] = lj2[i][j]; lj3[j][i] = lj3[i][j]; lj4[j][i] = lj4[i][j]; offset[j][i] = offset[i][j]; // compute I,J contribution to long-range tail correction // count total # of atoms of type I and J via Allreduce if (tail_flag) { int *type = atom->type; int nlocal = atom->nlocal; double count[2],all[2]; count[0] = count[1] = 0.0; for (int k = 0; k < nlocal; k++) { if (type[k] == i) count[0] += 1.0; if (type[k] == j) count[1] += 1.0; } MPI_Allreduce(count,all,2,MPI_DOUBLE,MPI_SUM,world); double sig2 = sigma[i][j]*sigma[i][j]; double sig6 = sig2*sig2*sig2; double rc3 = cut[i][j]*cut[i][j]*cut[i][j]; double rc6 = rc3*rc3; double rc9 = rc3*rc6; etail_ij = 8.0*MY_PI*all[0]*all[1]*epsilon[i][j] * sig6 * (sig6 - 3.0*rc6) / (9.0*rc9); ptail_ij = 16.0*MY_PI*all[0]*all[1]*epsilon[i][j] * sig6 * (2.0*sig6 - 3.0*rc6) / (9.0*rc9); } return cut[i][j]; } /* ---------------------------------------------------------------------- */ int PairEAMLJGPU::pack_comm(int n, int *list, double *buf, int pbc_flag, int *pbc) { int i,j,m; m = 0; if (fp_single) { float *fp_ptr = (float *)fp_pinned; for (i = 0; i < n; i++) { j = list[i]; buf[m++] = static_cast<double>(fp_ptr[j]); } } else { double *fp_ptr = (double *)fp_pinned; for (i = 0; i < n; i++) { j = list[i]; buf[m++] = fp_ptr[j]; } } return 1; } /* ---------------------------------------------------------------------- */ void PairEAMLJGPU::unpack_comm(int n, int first, double *buf) { int i,m,last; m = 0; last = first + n; if (fp_single) { float *fp_ptr = (float *)fp_pinned; for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; } else { double *fp_ptr = (double *)fp_pinned; for (i = first; i < last; i++) fp_ptr[i] = buf[m++]; } } diff --git a/src/GPU/pair_eam_lj_gpu.h b/src/GPU/pair_eam_lj_gpu.h index 1dc682277..e3aa4e45a 100755 --- a/src/GPU/pair_eam_lj_gpu.h +++ b/src/GPU/pair_eam_lj_gpu.h @@ -1,68 +1,59 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #ifdef PAIR_CLASS PairStyle(eam/lj/gpu,PairEAMLJGPU) #else #ifndef LMP_PAIR_EAM_LJ_GPU_H #define LMP_PAIR_EAM_LJ_GPU_H #include "stdio.h" #include "pair_eam.h" namespace LAMMPS_NS { class PairEAMLJGPU : public PairEAM { public: PairEAMLJGPU(class LAMMPS *); virtual ~PairEAMLJGPU(); void compute(int, int); void settings(int, char **); void coeff(int, char **); void init_style(); double init_one(int, int); double memory_usage(); void cpu_compute(int, int, int, int, int *, int *, int **); void cpu_compute_energy(int, int, int, int, int *, int *, int **); int pack_comm(int, int *, double *, int, int *); void unpack_comm(int, int, double *); enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; protected: double cut_global; double **cut; double **epsilon,**sigma; double **lj1,**lj2,**lj3,**lj4,**offset; void allocate(); - - private: - int gpu_mode; - double cpu_time; - int *gpulist; - void *fp_pinned; - bool fp_single; - - }; } #endif #endif diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp index 40c7e95de..dba7a9a82 100644 --- a/src/GPU/pair_table_gpu.cpp +++ b/src/GPU/pair_table_gpu.cpp @@ -1,345 +1,346 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) ------------------------------------------------------------------------- */ #include "lmptype.h" #include "math.h" #include "stdio.h" #include "stdlib.h" #include "pair_table_gpu.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "force.h" #include "neighbor.h" #include "neigh_list.h" #include "integrate.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "universe.h" #include "update.h" #include "domain.h" #include "string.h" #include "gpu_extra.h" #define LOOKUP 0 #define LINEAR 1 #define SPLINE 2 #define BITMAP 3 // External functions from cuda library for atom decomposition int table_gpu_init(const int ntypes, double **cutsq, double ***host_table_coeffs, double **host_table_data, double *special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength); void table_gpu_clear(); int ** table_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); void table_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double table_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ PairTableGPU::PairTableGPU(LAMMPS *lmp) : PairTable(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairTableGPU::~PairTableGPU() { table_gpu_clear(); } /* ---------------------------------------------------------------------- */ void PairTableGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; int nall = atom->nlocal + atom->nghost; int inum, host_start; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; firstneigh = table_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, domain->sublo, domain->subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success); } else { inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; table_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); if (host_start<inum) { cpu_time = MPI_Wtime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairTableGPU::init_style() { if (force->newton_pair) error->all(FLERR,"Cannot use newton pair with table/gpu pair style"); int ntypes = atom->ntypes; // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; // pack tables and send them to device double ***table_coeffs = NULL; double **table_data = NULL; memory->create(table_coeffs, ntypes+1, ntypes+1, 8, "table:coeffs"); Table *tb; for (int i = 1; i <= atom->ntypes; i++) for (int j = 1; j <= atom->ntypes; j++) { int n = tabindex[i][j]; tb = &tables[n]; table_coeffs[i][j][0] = n; table_coeffs[i][j][1] = tb->ntablebits; table_coeffs[i][j][2] = tb->nshiftbits; table_coeffs[i][j][3] = tb->nmask; table_coeffs[i][j][4] = tb->innersq; table_coeffs[i][j][5] = tb->invdelta; table_coeffs[i][j][6] = tb->deltasq6; table_coeffs[i][j][7] = 0.0; } if (tabstyle != BITMAP) { memory->create(table_data, ntables, 8*tablength, "table:data"); for (int n = 0; n < ntables; n++) { tb = &tables[n]; if (tabstyle == LOOKUP) { for (int k = 0; k<tablength-1; k++) { table_data[n][8*k+1] = tb->e[k]; table_data[n][8*k+2] = tb->f[k]; } } else if (tabstyle == LINEAR) { for (int k = 0; k<tablength; k++) { table_data[n][8*k+0] = tb->rsq[k]; table_data[n][8*k+1] = tb->e[k]; table_data[n][8*k+2] = tb->f[k]; if (k<tablength-1) { table_data[n][8*k+3] = tb->de[k]; table_data[n][8*k+4] = tb->df[k]; } } } else if (tabstyle == SPLINE) { for (int k = 0; k<tablength; k++) { table_data[n][8*k+0] = tb->rsq[k]; table_data[n][8*k+1] = tb->e[k]; table_data[n][8*k+2] = tb->f[k]; table_data[n][8*k+3] = tb->e2[k]; table_data[n][8*k+4] = tb->f2[k]; } } } } else { int ntable = 1 << tablength; memory->create(table_data, ntables, 8*ntable, "table:data"); for (int n = 0; n < ntables; n++) { tb = &tables[n]; for (int k = 0; k<ntable; k++) { table_data[n][8*k+0] = tb->rsq[k]; table_data[n][8*k+1] = tb->e[k]; table_data[n][8*k+2] = tb->f[k]; table_data[n][8*k+3] = tb->de[k]; table_data[n][8*k+4] = tb->df[k]; table_data[n][8*k+5] = tb->drsq[k]; } } } int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int success = table_gpu_init(atom->ntypes+1, cutsq, table_coeffs, table_data, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, cell_size, gpu_mode, screen, tabstyle, ntables, tablength); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } memory->destroy(table_coeffs); memory->destroy(table_data); } /* ---------------------------------------------------------------------- */ double PairTableGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + table_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairTableGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,jnum,itype,jtype,itable; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,factor_lj,fraction,value,a,b; int *jlist; Table *tb; union_int_float_t rsq_lookup; int tlm1 = tablength - 1; double **x = atom->x; double **f = atom->f; int *type = atom->type; double *special_lj = force->special_lj; // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsq[itype][jtype]) { tb = &tables[tabindex[itype][jtype]]; if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff"); if (tabstyle == LOOKUP) { itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta); if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); fpair = factor_lj * tb->f[itable]; } else if (tabstyle == LINEAR) { itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta); if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); fraction = (rsq - tb->rsq[itable]) * tb->invdelta; value = tb->f[itable] + fraction*tb->df[itable]; fpair = factor_lj * value; } else if (tabstyle == SPLINE) { itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta); if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); b = (rsq - tb->rsq[itable]) * tb->invdelta; a = 1.0 - b; value = a * tb->f[itable] + b * tb->f[itable+1] + ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) * tb->deltasq6; fpair = factor_lj * value; } else { rsq_lookup.f = rsq; itable = rsq_lookup.i & tb->nmask; itable >>= tb->nshiftbits; fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable]; value = tb->f[itable] + fraction*tb->df[itable]; fpair = factor_lj * value; } f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) { if (tabstyle == LOOKUP) evdwl = tb->e[itable]; else if (tabstyle == LINEAR || tabstyle == BITMAP) evdwl = tb->e[itable] + fraction*tb->de[itable]; else evdwl = a * tb->e[itable] + b * tb->e[itable+1] + ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6; evdwl *= factor_lj; } if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); } } } } diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp index 63186f1ad..f03687d02 100644 --- a/src/GPU/pair_yukawa_gpu.cpp +++ b/src/GPU/pair_yukawa_gpu.cpp @@ -1,229 +1,229 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Trung Dac Nguyen, W. Michael Brown (ORNL) + Contributing author: Trung Dac Nguyen (ORNL) ------------------------------------------------------------------------- */ #include "lmptype.h" #include "math.h" #include "stdio.h" #include "stdlib.h" #include "pair_yukawa_gpu.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "force.h" #include "neighbor.h" #include "neigh_list.h" #include "integrate.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "universe.h" #include "update.h" #include "domain.h" #include "string.h" #include "gpu_extra.h" // External functions from cuda library for atom decomposition int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, - double **host_a, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen); + double **host_a, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen); void yukawa_gpu_clear(); -int ** yukawa_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success); +int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success); void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double yukawa_gpu_bytes(); using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ PairYukawaGPU::PairYukawaGPU(LAMMPS *lmp) : PairYukawa(lmp), - gpu_mode(GPU_FORCE) + gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairYukawaGPU::~PairYukawaGPU() { yukawa_gpu_clear(); } /* ---------------------------------------------------------------------- */ void PairYukawaGPU::compute(int eflag, int vflag) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; int nall = atom->nlocal + atom->nghost; int inum, host_start; bool success = true; int *ilist, *numneigh, **firstneigh; if (gpu_mode != GPU_FORCE) { inum = atom->nlocal; firstneigh = yukawa_gpu_compute_n(neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, - &ilist, &numneigh, cpu_time, success); + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success); } else { inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; yukawa_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one(FLERR,"Out of memory on GPGPU"); if (host_start<inum) { cpu_time = MPI_Wtime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairYukawaGPU::init_style() { - if (force->newton_pair) + if (force->newton_pair) error->all(FLERR,"Cannot use newton pair with yukawa/gpu pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int success = yukawa_gpu_init(atom->ntypes+1, cutsq, kappa, a, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } } /* ---------------------------------------------------------------------- */ double PairYukawaGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + yukawa_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairYukawaGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r,rinv,screening,forceyukawa,factor; int *jlist; double **x = atom->x; double **f = atom->f; int *type = atom->type; double *special_lj = force->special_lj; // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor = special_lj[sbmask(j)]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (rsq < cutsq[itype][jtype]) { r2inv = 1.0/rsq; r = sqrt(rsq); rinv = 1.0/r; screening = exp(-kappa*r); forceyukawa = a[itype][jtype] * screening * (kappa + rinv); fpair = factor*forceyukawa * r2inv; f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) { - evdwl = a[itype][jtype] * screening * rinv - - offset[itype][jtype]; + evdwl = a[itype][jtype] * screening * rinv - offset[itype][jtype]; evdwl *= factor; } if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); } } } }