diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 0c3558e67..766131f4f 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -1,190 +1,609 @@ /*************************************************************************** lal_eam.cpp ------------------- W. Michael Brown, Trung Dac Nguyen (ORNL) Class for acceleration of the eam pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #ifdef USE_OPENCL #include "eam_cl.h" #else #include "eam_ptx.h" #endif #include "lal_eam.h" #include <cassert> using namespace LAMMPS_AL; #define EAMT EAM<numtyp, acctyp> -extern Device<PRECISION,ACC_PRECISION> device; +extern Device<PRECISION,ACC_PRECISION> global_device; template <class numtyp, class acctyp> -EAMT::EAM() : BaseCharge<numtyp,acctyp>(), - _allocated(false) { +EAMT::EAM() : _compiled(false), _max_bytes(0), _allocated(false) { + device=&global_device; + ans=new Answer<numtyp,acctyp>(); + nbor=new Neighbor(); } template <class numtyp, class acctyp> EAMT::~EAM() { + delete ans; + delete nbor; clear(); } template <class numtyp, class acctyp> int EAMT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template <class numtyp, class acctyp> +int EAMT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + bool charge = true; + bool rot = false; + bool pre_cut = false; + int success=device->init(*ans,charge,rot,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,pre_cut, + _threads_per_atom); + if (success!=0) + return success; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program); + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + + _max_an_bytes=ans->gpu_bytes() + +nbor->gpu_bytes(); + + return success; } template <class numtyp, class acctyp> int EAMT::init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, + int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double rdr, int nrhor, int nz2r, int nr, + double ***host_frho_spline, + double rdr, double rdrho, int nrhor, int nrho, + int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { - int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + success=init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,eam); if (success!=0) return success; - + // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; } _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; + _rdrho = rdrho; _nrhor=nrhor; + _nrho=nrho; _nz2r=nz2r; + _nfrho=nfrho; _nr=nr; UCL_H_Vec<numtyp> dview_type(lj_types*lj_types*2,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; i<lj_types*lj_types*2; i++) dview_type[i]=(numtyp)0.0; // pack type2rhor and type2z2r type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,type2rhor_z2r,dview_type, host_type2rhor, host_type2z2r); + + // pack type2frho + UCL_H_Vec<numtyp> dview_type2frho(ntypes,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + type2frho.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<ntypes; i++) + dview_type2frho[i]=(numtyp)host_type2frho[i]; + ucl_copy(type2frho,dview_type2frho,false); + + // pack frho_spline + UCL_H_Vec<numtyp> dview_frho_spline(nfrho*(nr+1)*7,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int ix=0; ix<nfrho; ix++) + for (int iy=0; iy<nr+1; iy++) + for (int iz=0; iz<7; iz++) + dview_frho_spline[ix*(nr+1)*7+iy*7+iz]=host_frho_spline[ix][iy][iz]; + + frho_spline.alloc(nfrho*(nr+1)*7,*(this->ucl_device),UCL_READ_ONLY); + ucl_copy(frho_spline,dview_frho_spline,false); + // pack rhor_spline UCL_H_Vec<numtyp> dview_rhor_spline(nrhor*(nr+1)*7,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nrhor; ix++) for (int iy=0; iy<nr+1; iy++) for (int iz=0; iz<7; iz++) dview_rhor_spline[ix*(nr+1)*7+iy*7+iz]=host_rhor_spline[ix][iy][iz]; rhor_spline.alloc(nrhor*(nr+1)*7,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(rhor_spline,dview_rhor_spline,false); // pack z2r_spline UCL_H_Vec<numtyp> dview_z2r_spline(nz2r*(nr+1)*7,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int ix=0; ix<nz2r; ix++) for (int iy=0; iy<nr+1; iy++) for (int iz=0; iz<7; iz++) dview_z2r_spline[ix*(nr+1)*7+iy*7+iz]=host_z2r_spline[ix][iy][iz]; z2r_spline.alloc(nz2r*(nr+1)*7,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline,dview_z2r_spline,false); _allocated=true; - this->_max_bytes=type2rhor_z2r.row_bytes()+ - rhor_spline.row_bytes()+z2r_spline.row_bytes(); + this->_max_bytes=type2rhor_z2r.row_bytes() + + type2frho.row_bytes() + + rhor_spline.row_bytes()+z2r_spline.row_bytes() + + frho_spline.row_bytes(); return 0; } +template <class numtyp, class acctyp> +void EAMT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +} + template <class numtyp, class acctyp> void EAMT::clear() { if (!_allocated) return; _allocated=false; type2rhor_z2r.clear(); + type2frho.clear(); rhor_spline.clear(); z2r_spline.clear(); - this->clear_atomic(); + frho_spline.clear(); + + host_fp.clear(); + dev_fp.clear(); + + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + if (_compiled) { + k_pair_fast.clear(); + k_pair.clear(); + k_energy.clear(); + delete pair_program; + _compiled=false; + } + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + device->clear(); } template <class numtyp, class acctyp> double EAMT::host_memory_usage() const { - return this->host_memory_usage_atomic()+sizeof(EAM<numtyp,acctyp>); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(EAM<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int * EAMT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return false; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +inline void EAMT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, int *tag, + int **nspecial, int **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, + nspecial, special, success, mn); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void EAMT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + acc_timers(); + + // compute density already took care of the neighbor list + + atom->cast_q_data(host_q); + atom->add_q_data(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int** EAMT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd, int inum) { + acc_timers(); + + // use the atom count returned from load balance invoked in compute energy + ans->inum(inum); + host_start=inum; + + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + atom->add_q_data(); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + loop(eflag,vflag); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then compute atom energies/forces +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void EAMT::compute_energy(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *fp, + const int nlocal, double *boxlo, double *prd, + double *evdwl) { + acc_timers(); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + + energy(eflag,vflag); + + // copy fp from device to host for comm + + ucl_copy(host_fp,dev_fp,false); + + acctyp *ap=host_fp.begin(); + for (int i=0; i<inum; i++) { + fp[i]=*ap; + ap++; + } + + if (eflag) { + double e=0.0; + ucl_copy(ans->host_engv,ans->dev_engv,false); + for (int i=0; i<inum; i++) + e+=ans->host_engv[i]; + *evdwl+=e; + } + + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU and then compute per-atom densities +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +int** EAMT::compute_energy(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success, + double *fp, double *boxlo, double *prd, + double *evdwl, int &inum) { + acc_timers(); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return NULL; + } + + // load balance, returning the atom count on the device (inum) + hd_balancer.balance(cpu_time); + inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + energy(eflag,vflag); + + // copy fp from device to host for comm + + ucl_copy(host_fp,dev_fp,false); + + acctyp *ap=host_fp.begin(); + for (int i=0; i<inum; i++) { + fp[i]=*ap; + ap++; + } + + if (eflag) { + double e=0.0; + ucl_copy(ans->host_engv,ans->dev_engv,false); + for (int i=0; i<inum; i++) + e+=ans->host_engv[i]; + *evdwl+=e; + } + + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void EAMT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int eflag, vflag; if (_eflag) eflag=1; else eflag=0; if (_vflag) vflag=1; else vflag=0; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &this->atom->dev_q.begin(), &type2rhor_z2r.begin(), &rhor_spline.begin(), &z2r_spline.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &_cutforcesq, &_rdr, - &_nrhor, &_nz2r, &_nr, + &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &this->atom->dev_q.begin(), &type2rhor_z2r.begin(), &rhor_spline.begin(), &z2r_spline.begin(), &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, - &_nrhor, &_nz2r, &_nr, + &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } this->time_pair.stop(); } +// --------------------------------------------------------------------------- +// Calculate per-atom energies and forces +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void EAMT::energy(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + + this->k_energy.set_size(GX,BX); + this->k_energy.run(&this->atom->dev_x.begin(), + &type2rhor_z2r.begin(), &type2frho.begin(), + &rhor_spline.begin(), &frho_spline.begin(), + &this->nbor->dev_nbor.begin(), &this->_nbor_data->begin(), + &dev_fp.begin(), + &ans->dev_engv.begin(), + &eflag, &vflag, &ainum, + &nbor_pitch, + &_ntypes, &_cutforcesq, + &_rdr, &_rdrho, + &_nrho, &_nr, + &this->_threads_per_atom); + + this->time_pair.stop(); +} + +template <class numtyp, class acctyp> +void EAMT::compile_kernels(UCL_Device &dev, const char *pair_str) { + if (_compiled) + return; + + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" -D"+ + std::string(OCL_VENDOR); + + pair_program=new UCL_Program(dev); + pair_program->load_string(pair_str,flags.c_str()); + k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); + k_pair.set_function(*pair_program,"kernel_pair"); + k_energy.set_function(*pair_program,"kernel_energy"); + pos_tex.get_texture(*pair_program,"pos_tex"); + q_tex.get_texture(*pair_program,"q_tex"); + + _compiled=true; +} + template class EAM<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 74539cc32..7e3f8fc42 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -1,258 +1,357 @@ // ************************************************************************** // lal_eam.cu // ------------------- -// W. Michael Brown, Trung Dac Nguyen (ORNL) +// Trung Dac Nguyen, W. Michael Brown (ORNL) // -// Device code for acceleration of the lj/cut/coul/fsww/cut pair style +// Device code for acceleration of the eam pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_aux_fun1.h" texture<float4> pos_tex; texture<float> q_tex; #ifndef _DOUBLE_DOUBLE ucl_inline float4 fetch_pos(const int& i, const float4 *pos) { return tex1Dfetch(pos_tex, i); } ucl_inline float fetch_q(const int& i, const float *q) { return tex1Dfetch(q_tex, i); } #endif #endif #define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +__kernel void kernel_energy(__global numtyp4 *x_, + __global numtyp2 *type2rhor_z2r, __global numtyp *type2frho, + __global numtyp *rhor_spline, __global numtyp *frho_spline, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp *fp_, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, + const int ntypes, const numtyp cutforcesq, + const numtyp rdr, const numtyp rdrho, + const int nrho, const int nr, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + acctyp rho = (acctyp)0; + acctyp energy = (acctyp)0; + + if (ii<inum) { + __global int *nbor, *list_end; + int i, numj, n_stride; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,list_end,nbor); + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + int itype=ix.w; + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + int jtype=jx.w; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<cutforcesq) { + numtyp p = ucl_sqrt(rsq)*rdr + (numtyp)1.0; + int m=p; + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(numtyp)1.0); + + int mtype = jtype*ntypes+itype; + int index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7; + numtyp coeff3 = rhor_spline[index+3]; + numtyp coeff4 = rhor_spline[index+4]; + numtyp coeff5 = rhor_spline[index+5]; + numtyp coeff6 = rhor_spline[index+6]; + rho += ((coeff3*p + coeff4)*p + coeff5)*p + coeff6; + } + } // for nbor + + // reduce to get the density at atom ii + + if (t_per_atom>1) { + __local acctyp red_acc[BLOCK_PAIR]; + red_acc[tid]=rho; + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) + red_acc[tid] += red_acc[tid+s]; + } + rho=red_acc[tid]; + } + + // calculate the embedded force for ii + if (offset==0) { + numtyp p = rho*rdrho + (numtyp)1.0; + int m=p; + m = MAX(1,MIN(m,nrho-1)); + p -= m; + p = MIN(p,(numtyp)1.0); + + int index = type2frho[itype]*(nr+1)*7+m*7; + numtyp coeff0 = frho_spline[index+0]; + numtyp coeff1 = frho_spline[index+1]; + numtyp coeff2 = frho_spline[index+2]; + numtyp fp = (coeff0*p + coeff1)*p + coeff2; + fp_[ii]=fp; + + engv+=ii; + if (eflag>0) { + numtyp coeff3 = frho_spline[index+3]; + numtyp coeff4 = frho_spline[index+4]; + numtyp coeff5 = frho_spline[index+5]; + numtyp coeff6 = frho_spline[index+6]; + energy = ((coeff3*p + coeff4)*p + coeff5)*p + coeff6; + *engv=energy; + } + } + } // if ii +} + __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_, __global numtyp2 *type2rhor_z2r, __global numtyp *rhor_spline, __global numtyp *z2r_spline, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int ntypes, const numtyp cutforcesq, - const numtyp rdr, const int nrhor, const int nz2r, const int nr, + const numtyp rdr, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp ifp=fetch_q(i,fp_); //fp_[i]; int itype=ix.w; for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; numtyp jfp=fetch_q(j,fp_); //fp_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutforcesq) { numtyp r = ucl_sqrt(rsq); numtyp p = r*rdr + (numtyp)1.0; - int m=__float2int_rd(p); + int m=p; m = MIN(m,nr-1); p -= m; p = MIN(p,(numtyp)1.0); int mtype,index; numtyp coeff0,coeff1,coeff2,coeff3,coeff4,coeff5,coeff6; mtype = itype*ntypes+jtype; index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7; coeff0 = rhor_spline[index+0]; coeff1 = rhor_spline[index+1]; coeff2 = rhor_spline[index+2]; numtyp rhoip = (coeff0*p + coeff1)*p + coeff2; mtype = jtype*ntypes+itype; index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7; coeff0 = rhor_spline[index+0]; coeff1 = rhor_spline[index+1]; coeff2 = rhor_spline[index+2]; numtyp rhojp = (coeff0*p + coeff1)*p + coeff2; mtype = itype*ntypes+jtype; index = type2rhor_z2r[mtype].y*(nr+1)*7+m*7; coeff0 = z2r_spline[index+0]; coeff1 = z2r_spline[index+1]; coeff2 = z2r_spline[index+2]; coeff3 = z2r_spline[index+3]; coeff4 = z2r_spline[index+4]; coeff5 = z2r_spline[index+5]; coeff6 = z2r_spline[index+6]; numtyp z2p = (coeff0*p + coeff1)*p + coeff2; numtyp z2 = ((coeff3*p + coeff4)*p + coeff5)*p + coeff6; numtyp recip = (numtyp)1.0/r; numtyp phi = z2*recip; numtyp phip = z2p*recip - phi*recip; numtyp psip = ifp*rhojp + jfp*rhoip + phip; numtyp force = -psip*recip; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { energy += phi; } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_, __global numtyp2 *type2rhor_z2r, __global numtyp *rhor_spline, __global numtyp *z2r_spline, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const numtyp cutforcesq, - const numtyp rdr, const int nrhor, const int nz2r, const int nr, + const numtyp rdr, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (ii<inum) { __global int *nbor, *list_end; int i, numj, n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); numtyp4 ix=fetch_pos(i,x_); //x_[i]; numtyp ifp=fetch_q(i,fp_); //fp_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); for ( ; nbor<list_end; nbor+=n_stride) { int j=*nbor; j &= NEIGHMASK; numtyp4 jx=fetch_pos(j,x_); //x_[j]; numtyp jfp=fetch_q(j,fp_); //fp_[j]; int jtype=jx.w; // Compute r12 numtyp delx = ix.x-jx.x; numtyp dely = ix.y-jx.y; numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutforcesq) { numtyp r = ucl_sqrt(rsq); numtyp p = r*rdr + (numtyp)1.0; - int m=__float2int_rd(p); + int m=p; m = MIN(m,nr-1); p -= m; p = MIN(p,(numtyp)1.0); numtyp coeff0,coeff1,coeff2,coeff3,coeff4,coeff5,coeff6; int mtype,index; mtype = itype+jx.w; index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7; coeff0 = rhor_spline[index+0]; coeff1 = rhor_spline[index+1]; coeff2 = rhor_spline[index+2]; numtyp rhoip = (coeff0*p + coeff1)*p + coeff2; mtype = jtype+ix.w; index = type2rhor_z2r[mtype].x*(nr+1)*7+m*7; coeff0 = rhor_spline[index+0]; coeff1 = rhor_spline[index+1]; coeff2 = rhor_spline[index+2]; numtyp rhojp = (coeff0*p + coeff1)*p + coeff2; mtype = itype+jx.w; index = type2rhor_z2r[mtype].y*(nr+1)*7+m*7; coeff0 = z2r_spline[index+0]; coeff1 = z2r_spline[index+1]; coeff2 = z2r_spline[index+2]; coeff3 = z2r_spline[index+3]; coeff4 = z2r_spline[index+4]; coeff5 = z2r_spline[index+5]; coeff6 = z2r_spline[index+6]; numtyp z2p = (coeff0*p + coeff1)*p + coeff2; numtyp z2 = ((coeff3*p + coeff4)*p + coeff5)*p + coeff6; numtyp recip = (numtyp)1.0/r; numtyp phi = z2*recip; numtyp phip = z2p*recip - phi*recip; numtyp psip = ifp*rhojp + jfp*rhoip + phip; numtyp force = -psip*recip; f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { energy += phi; } if (vflag>0) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; virial[3] += delx*dely*force; virial[4] += delx*delz*force; virial[5] += dely*delz*force; } } } // for nbor store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); } // if ii } diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index 01a159088..4da240079 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -1,87 +1,275 @@ /*************************************************************************** lal_eam.h ------------------- W. Michael Brown, Trung Dac Nguyen (ORNL) Class for acceleration of the eam pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #ifndef LAL_EAM_H #define LAL_EAM_H -#include "lal_base_charge.h" +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif namespace LAMMPS_AL { template <class numtyp, class acctyp> -class EAM : public BaseCharge<numtyp, acctyp> { +class EAM { public: EAM(); ~EAM(); + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, + int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double rdr, int nrhor, int nz2r, int nr, + double ***host_frho_spline, + double rdr, double rdrho, int nrhor, int nrho, + int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->dev_x,4); + q_tex.bind_float(atom->dev_q,1); + } + + // resize rho + dev_fp.clear(); + host_fp.clear(); + + bool cpuview=false; + if (ucl_device->device_type()==UCL_CPU) + cpuview=true; + + int _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10); + host_fp.alloc(_max_atoms,*ucl_device); + if (cpuview) + dev_fp.view(host_fp); + else + dev_fp.alloc(_max_atoms,*ucl_device,UCL_WRITE_ONLY); + + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; /// Total host memory used by library for pair style double host_memory_usage() const; + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, int inum); + + /// Pair loop with host neighboring + void compute_energy(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd, double* evdwl); + + /// Pair loop with device neighboring + int** compute_energy(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, double* evdwl, int &inum); + +// -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device<numtyp,acctyp> *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance<numtyp,acctyp> hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom<numtyp,acctyp> *atom; + + + // ------------------------ FORCE/ENERGY/DENSITY DATA -------------- + + Answer<numtyp,acctyp> *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_pair_fast, k_pair, k_energy; + inline int block_size() { return _block_size; } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + // --------------------------- TYPE DATA -------------------------- UCL_D_Vec<numtyp2> type2rhor_z2r; - UCL_D_Vec<numtyp> rhor_spline; + UCL_D_Vec<numtyp> type2frho; - UCL_D_Vec<numtyp> z2r_spline; + UCL_D_Vec<numtyp> frho_spline, rhor_spline, z2r_spline; - numtyp _cutforcesq,_rdr; + numtyp _cutforcesq,_rdr,_rdrho; - int _nrhor,_nz2r,_nr; + int _nfrho,_nrhor,_nrho,_nz2r,_nr; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; /// Number of atom types int _ntypes; + + /// Per-atom arrays + UCL_H_Vec<numtyp> host_fp; + UCL_D_Vec<numtyp> dev_fp; - // --------------------------- TEXTURES ----------------------------- - UCL_Texture fp_tex; - - private: +protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec<int> *_nbor_data; + + void compile_kernels(UCL_Device &dev, const char *pair_string); + bool _allocated; void loop(const bool _eflag, const bool _vflag); + void energy(const bool _eflag, const bool _vflag); }; } #endif diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index d2b178774..521627dfc 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -1,132 +1,161 @@ // ************************************************************************** -// lal_lj_coul_fsww_ext.cpp +// lal_eam_ext.cpp // ------------------- // W. Michael Brown, Trung Dac Nguyen (ORNL) // -// Class for acceleration of the lj/cut/coul/fsww/cut pair style +// Class for acceleration of the eam pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_eam.h" using namespace std; using namespace LAMMPS_AL; static EAM<PRECISION,ACC_PRECISION> EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int eam_gpu_init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, + int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double rdr, int nrhor, int nz2r, int nr, + double ***host_frho_spline, + double rdr, double rdrho, int nrhor, + int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); double gpu_split=EAMMF.device->particle_split(); int first_gpu=EAMMF.device->first_device(); int last_gpu=EAMMF.device->last_device(); int world_me=EAMMF.device->world_me(); int gpu_rank=EAMMF.device->gpu_rank(); int procs_per_gpu=EAMMF.device->procs_per_gpu(); EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; if (EAMMF.device->replica_me()==0 && screen) message=true; if (message) { fprintf(screen,"Initializing GPU and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, - host_type2rhor, host_type2z2r, - host_rhor_spline, host_z2r_spline, - rdr, nrhor, nz2r, nr, - nlocal, nall, 300, maxspecial, - cell_size, gpu_split, screen); + host_type2rhor, host_type2z2r, host_type2frho, + host_rhor_spline, host_z2r_spline, + host_frho_spline, + rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, + nlocal, nall, 300, maxspecial, + cell_size, gpu_split, screen); EAMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); else fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, - host_type2rhor, host_type2z2r, - host_rhor_spline, host_z2r_spline, - rdr, nrhor, nz2r, nr, - nlocal, nall, 300, maxspecial, - cell_size, gpu_split, screen); + host_type2rhor, host_type2z2r, host_type2frho, + host_rhor_spline, host_z2r_spline, + host_frho_spline, + rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, + nlocal, nall, 300, maxspecial, + cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) EAMMF.estimate_gpu_overhead(); return init_ok; } void eam_gpu_clear() { EAMMF.clear(); } int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_fp, double *boxlo, - double *prd) { + double *prd, int inum) { return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, - host_fp, boxlo, prd); + host_fp, boxlo, prd, inum); } void eam_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_fp, const int nlocal, double *boxlo, double *prd) { EAMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_fp,nlocal,boxlo,prd); } +int ** eam_gpu_compute_energy_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_fp, double *boxlo, + double *prd, double *eng_vdwl, int &inum) { + return EAMMF.compute_energy(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_fp, boxlo, prd, eng_vdwl, inum); +} + +void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_fp, + const int nlocal, double *boxlo, double *prd, double* evdwl) { + EAMMF.compute_energy(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_fp,nlocal,boxlo,prd,evdwl); +} + double eam_gpu_bytes() { return EAMMF.host_memory_usage(); } diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index 62eb38e4d..cc3ecff10 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -1,387 +1,429 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Trung Dac Nguyen, W. Michael Brown (ORNL) ------------------------------------------------------------------------- */ #include "math.h" #include "stdio.h" #include "stdlib.h" #include "string.h" #include "pair_eam_gpu.h" #include "atom.h" #include "force.h" #include "comm.h" #include "domain.h" #include "neighbor.h" #include "neigh_list.h" #include "memory.h" #include "error.h" #include "neigh_request.h" #include "gpu_extra.h" using namespace LAMMPS_NS; #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MAXLINE 1024 // External functions from cuda library for atom decomposition int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, + int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double rdr, int nrhor, int nz2r, int nr, + double ***host_frho_spline, + double rdr, double rdrho, int nrhor, int nrho, + int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void eam_gpu_clear(); int** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_fp, double *boxlo, - double *prd); + double *prd, int inum); void eam_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_fp, const int nlocal, double *boxlo, double *prd); +int** eam_gpu_compute_energy_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_fp, double *boxlo, + double *prd, double *evdwl, int &inum); +void eam_gpu_compute_energy(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_fp, + const int nlocal, double *boxlo, double *prd, double *evdwl); double eam_gpu_bytes(); /* ---------------------------------------------------------------------- */ PairEAMGPU::PairEAMGPU(LAMMPS *lmp) : PairEAM(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; } /* ---------------------------------------------------------------------- check if allocated, since class can be destructed when incomplete ------------------------------------------------------------------------- */ PairEAMGPU::~PairEAMGPU() { eam_gpu_clear(); } /* ---------------------------------------------------------------------- */ double PairEAMGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + eam_gpu_bytes(); } /* ---------------------------------------------------------------------- */ void PairEAMGPU::compute(int eflag, int vflag) { - int i,j,ii,jj,m,inum,jnum,itype,jtype; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi; - double *coeff; - int *ilist,*jlist,*numneigh,**firstneigh; + int i,j,ii,jj,m,jnum,itype,jtype; + double evdwl,*coeff; evdwl = 0.0; if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = eflag_global = eflag_atom = 0; // grow energy and fp arrays if necessary // need to be atom->nmax in length if (atom->nmax > nmax) { memory->destroy(rho); memory->destroy(fp); nmax = atom->nmax; memory->create(rho,nmax,"pair:rho"); memory->create(fp,nmax,"pair:fp"); } - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; + int nlocal = atom->nlocal; int newton_pair = force->newton_pair; - - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - + // zero out density if (newton_pair) { m = nlocal + atom->nghost; - for (i = 0; i < m; i++) rho[i] = 0.0; - } else for (i = 0; i < nlocal; i++) rho[i] = 0.0; - + for (i = 0; i < m; i++) rho[i] = 0.0; + } else for (i = 0; i < nlocal; i++) rho[i] = 0.0; + + + // compute density on each atom on GPU + + int nall = atom->nlocal + atom->nghost; + int inum, host_start, inum_dev; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + inum = atom->nlocal; + + firstneigh = eam_gpu_compute_energy_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, fp, domain->boxlo, + domain->prd, &eng_vdwl, inum_dev); + } else { // gpu_mode == GPU_FORCE + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + eam_gpu_compute_energy(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, fp, + atom->nlocal, domain->boxlo, domain->prd, &eng_vdwl); + } + + if (!success) + error->one(FLERR,"Out of memory on GPGPU"); + + if (host_start<inum) { + cpu_time = MPI_Wtime(); + cpu_compute_energy(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = MPI_Wtime() - cpu_time; + } + + // communicate derivative of embedding function + + comm->forward_comm_pair(this); + + // compute forces on each atom on GPU + + if (gpu_mode != GPU_FORCE) { + inum = atom->nlocal; + firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, fp, domain->boxlo, + domain->prd, inum_dev); + } else { // gpu_mode == GPU_FORCE + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + eam_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, fp, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) + error->one(FLERR,"Out of memory on GPGPU"); + + if (host_start<inum) { + double cpu_time2 = MPI_Wtime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time += MPI_Wtime() - cpu_time2; + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + +void PairEAMGPU::cpu_compute_energy(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, + int **firstneigh) +{ + int i,j,ii,jj,m,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r,p,rhoip,rhojp,phi; + double *coeff; + int *jlist; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int newton_pair = force->newton_pair; // rho = density at each atom // loop over neighbors of my atoms - for (ii = 0; ii < inum; ii++) { + for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; if (rsq < cutforcesq) { jtype = type[j]; p = sqrt(rsq)*rdr + 1.0; m = static_cast<int> (p); m = MIN(m,nr-1); p -= m; p = MIN(p,1.0); coeff = rhor_spline[type2rhor[jtype][itype]][m]; rho[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; - if (newton_pair || j < nlocal && gpu_mode != GPU_FORCE) { - coeff = rhor_spline[type2rhor[itype][jtype]][m]; - rho[j] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; - } - } - + } } } // communicate and sum densities if (newton_pair) comm->reverse_comm_pair(this); // fp = derivative of embedding energy at each atom // phi = embedding energy at each atom - for (ii = 0; ii < inum; ii++) { + for (ii = start; ii < inum; ii++) { i = ilist[ii]; p = rho[i]*rdrho + 1.0; m = static_cast<int> (p); m = MAX(1,MIN(m,nrho-1)); p -= m; p = MIN(p,1.0); coeff = frho_spline[type2frho[type[i]]][m]; fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2]; if (eflag) { phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; if (eflag_global) eng_vdwl += phi; if (eflag_atom) eatom[i] += phi; } } - - // communicate derivative of embedding function - - comm->forward_comm_pair(this); - -// printf("\nBefore computing force: eng_vdwl = %f\n", eng_vdwl); - - // compute forces on each atom on GPU - { - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; - if (gpu_mode != GPU_FORCE) { - inum = atom->nlocal; - firstneigh = eam_gpu_compute_n(neighbor->ago, inum, nall, atom->x, - atom->type, domain->sublo, domain->subhi, - atom->tag, atom->nspecial, atom->special, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, fp, domain->boxlo, - domain->prd); - } else { // gpu_mode == GPU_FORCE - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - eam_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, fp, - atom->nlocal, domain->boxlo, domain->prd); - } - if (!success) - error->one(FLERR,"Out of memory on GPGPU"); - - if (host_start<inum) { - cpu_time = MPI_Wtime(); - cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); - cpu_time = MPI_Wtime() - cpu_time; - } - - } - -// printf("\nAfter computing force: eng_vdwl = %f\n", eng_vdwl); - - if (vflag_fdotr) virial_fdotr_compute(); - } /* ---------------------------------------------------------------------- */ void PairEAMGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, int *numneigh, int **firstneigh) { int i,j,ii,jj,m,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi; double *coeff; int *jlist; double **x = atom->x; double **f = atom->f; int *type = atom->type; int nlocal = atom->nlocal; int newton_pair = force->newton_pair; // compute forces on each atom // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { i = ilist[ii]; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; if (rsq < cutforcesq) { jtype = type[j]; r = sqrt(rsq); p = r*rdr + 1.0; m = static_cast<int> (p); m = MIN(m,nr-1); p -= m; p = MIN(p,1.0); // rhoip = derivative of (density at atom j due to atom i) // rhojp = derivative of (density at atom i due to atom j) // phi = pair potential energy // phip = phi' // z2 = phi * r // z2p = (phi * r)' = (phi' r) + phi // psip needs both fp[i] and fp[j] terms since r_ij appears in two // terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji) // hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip coeff = rhor_spline[type2rhor[itype][jtype]][m]; rhoip = (coeff[0]*p + coeff[1])*p + coeff[2]; coeff = rhor_spline[type2rhor[jtype][itype]][m]; rhojp = (coeff[0]*p + coeff[1])*p + coeff[2]; coeff = z2r_spline[type2z2r[itype][jtype]][m]; z2p = (coeff[0]*p + coeff[1])*p + coeff[2]; z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]; recip = 1.0/r; phi = z2*recip; phip = z2p*recip - phi*recip; psip = fp[i]*rhojp + fp[j]*rhoip + phip; fpair = -psip*recip; f[i][0] += delx*fpair; f[i][1] += dely*fpair; f[i][2] += delz*fpair; if (eflag) evdwl = phi; if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); } } } } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairEAMGPU::init_style() { if (force->newton_pair) error->all(FLERR,"Cannot use newton pair with eam/gpu pair style"); if (!allocated) error->all(FLERR,"Not allocate memory eam/gpu pair style"); // convert read-in file(s) to arrays and spline them file2array(); array2spline(); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; int success = eam_gpu_init(atom->ntypes+1, cutforcesq, - type2rhor, type2z2r, rhor_spline, z2r_spline, - rdr, nrhor, nz2r, nr, atom->nlocal, + type2rhor, type2z2r, type2frho, + rhor_spline, z2r_spline, frho_spline, + rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); - if (gpu_mode == GPU_FORCE) { - int irequest = neighbor->request(this); - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; - } else neighbor->request(this); + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; } diff --git a/src/GPU/pair_eam_gpu.h b/src/GPU/pair_eam_gpu.h index 0c4f7b580..b7eaffcfa 100644 --- a/src/GPU/pair_eam_gpu.h +++ b/src/GPU/pair_eam_gpu.h @@ -1,50 +1,51 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #ifdef PAIR_CLASS PairStyle(eam/gpu,PairEAMGPU) #else #ifndef LMP_PAIR_EAM_GPU_H #define LMP_PAIR_EAM_GPU_H #include "stdio.h" #include "pair_eam.h" namespace LAMMPS_NS { class PairEAMGPU : public PairEAM { public: PairEAMGPU(class LAMMPS *); virtual ~PairEAMGPU(); void cpu_compute(int, int, int, int, int *, int *, int **); + void cpu_compute_energy(int, int, int, int, int *, int *, int **); void compute(int, int); void init_style(); double memory_usage(); enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; private: int gpu_mode; double cpu_time; int *gpulist; }; } #endif #endif