diff --git a/lib/gpu/coul_long_gpu.cpp b/lib/gpu/coul_long_gpu.cpp new file mode 100644 index 000000000..60c0d35d7 --- /dev/null +++ b/lib/gpu/coul_long_gpu.cpp @@ -0,0 +1,124 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#include <iostream> +#include <cassert> +#include <math.h> + +#include "coul_long_gpu_memory.h" + +using namespace std; + +static CL_GPU_Memory<PRECISION,ACC_PRECISION> CLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cl_gpu_init(const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald) { + CLMF.clear(); + gpu_mode=CLMF.device->gpu_mode(); + double gpu_split=CLMF.device->particle_split(); + int first_gpu=CLMF.device->first_device(); + int last_gpu=CLMF.device->last_device(); + int world_me=CLMF.device->world_me(); + int gpu_rank=CLMF.device->gpu_rank(); + int procs_per_gpu=CLMF.device->procs_per_gpu(); + + CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu); + + bool message=false; + if (CLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, + screen, host_cut_coulsq, host_special_coul, qqrd2e, + g_ewald); + + CLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; i<procs_per_gpu; i++) { + if (message) { + if (last_gpu-first_gpu==0) + fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + else + fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + last_gpu,i); + fflush(screen); + } + if (gpu_rank==i && world_me!=0) + init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, + screen, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald); + + CLMF.device->gpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CLMF.estimate_gpu_overhead(); + return init_ok; +} + +void cl_gpu_clear() { + CLMF.clear(); +} + +int** cl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double 
*boxlo, + double *prd) { + return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void cl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double cl_gpu_bytes() { + return CLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/coul_long_gpu_kernel.cu b/lib/gpu/coul_long_gpu_kernel.cu new file mode 100644 index 000000000..bc3747a7e --- /dev/null +++ b/lib/gpu/coul_long_gpu_kernel.cu @@ -0,0 +1,411 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL), brownw@ornl.gov +------------------------------------------------------------------------- */ + +#ifndef CL_GPU_KERNEL +#define CL_GPU_KERNEL + +#ifdef NV_KERNEL + +#include "nv_kernel_def.h" +texture<float4> pos_tex; +texture<float> q_tex; + +#ifdef _DOUBLE_DOUBLE +__inline double4 fetch_pos(const int& i, const double4 *pos) +{ + return pos[i]; +} +__inline double fetch_q(const int& i, const double *q) +{ + return q[i]; +} +#else +__inline float4 fetch_pos(const int& i, const float4 *pos) +{ + return tex1Dfetch(pos_tex, i); +} +__inline float fetch_q(const int& i, const float *q) +{ + return tex1Dfetch(q_tex, i); +} +#endif + +#else + +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#define THREAD_ID_X get_local_id(0) +#define BLOCK_ID_X get_group_id(0) +#define BLOCK_SIZE_X get_local_size(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define __inline inline + +#define fetch_pos(i,y) x_[i] +#define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 + +#endif + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + +#define SBBITS 30 +#define NEIGHMASK 0x3FFFFFFF +__inline int sbmask(int j) { 
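+  // the two bits above SBBITS in a packed neighbor index select the
+  // special-bond class (0 = plain, 1/2/3 = 1-2/1-3/1-4); the kernels use
+  // it to index sp_cl, then apply NEIGHMASK to recover the atom index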
return j >> SBBITS & 3; } + +__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_cl_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + + __local numtyp sp_cl[4]; + sp_cl[0]=sp_cl_in[0]; + sp_cl[1]=sp_cl_in[1]; + sp_cl[2]=sp_cl_in[2]; + sp_cl[3]=sp_cl_in[3]; + + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii<inum) { + __global int *nbor=dev_nbor+ii; + int i=*nbor; + nbor+=nbor_pitch; + int numj=*nbor; + nbor+=nbor_pitch; + + int n_stride; + __global int *list_end; + if (dev_nbor==dev_packed) { + list_end=nbor+mul24(numj,nbor_pitch); + nbor+=mul24(offset,nbor_pitch); + n_stride=mul24(t_per_atom,nbor_pitch); + } else { + nbor=dev_packed+*nbor; + list_end=nbor+numj; + n_stride=t_per_atom; + nbor+=offset; + } + + numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp qtmp=fetch_q(i,q_); + + for ( ; nbor<list_end; nbor+=n_stride) { + int j=*nbor; + + numtyp factor_coul; + factor_coul = (numtyp)1.0-sp_cl[sbmask(j)]; + j &= NEIGHMASK; + + numtyp4 jx=fetch_pos(j,x_); //x_[j]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq < cut_coulsq) { + numtyp r2inv=(numtyp)1.0/rsq; + numtyp force, prefactor, _erfc; + + numtyp r = sqrt(rsq); + numtyp grij = g_ewald * r; + numtyp expm2 = exp(-grij*grij); + numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij); + _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; + force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) { + e_coul += prefactor*(_erfc-factor_coul); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + e_coul=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii<inum && offset==0) { + __global acctyp *ap1=engv+ii; + if (eflag>0) { + *ap1=(acctyp)0; + ap1+=inum; + *ap1=e_coul; + ap1+=inum; + } + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=inum; + } + } + ans[ii]=f; + } // if ii +} + +__kernel void kernel_pair_fast(__global numtyp4 *x_, __global 
numtyp4 *lj1_in,
+                               __global numtyp4* lj3_in,
+                               __global numtyp* sp_cl_in,
+                               __global int *dev_nbor, __global int *dev_packed,
+                               __global acctyp4 *ans, __global acctyp *engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_cl[4];
+  if (tid<4)
+    sp_cl[tid]=sp_cl_in[tid];
+
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  __syncthreads();
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp qtmp=fetch_q(i,q_);
+
+    for ( ; nbor<list_end; nbor+=n_stride) {
+      int j=*nbor;
+
+      numtyp factor_coul;
+      factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq < cut_coulsq) {
+        numtyp r2inv=(numtyp)1.0/rsq;
+        numtyp force, prefactor, _erfc;
+
+        numtyp r = sqrt(rsq);
+        numtyp grij = g_ewald * r;
+        numtyp expm2 = exp(-grij*grij);
+        numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
+        _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+        prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+        force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          e_coul += prefactor*(_erfc-factor_coul);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    e_coul=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=(acctyp)0;
+      ap1+=inum;
+      *ap1=e_coul;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+#endif
+
diff --git a/lib/gpu/coul_long_gpu_memory.cpp b/lib/gpu/coul_long_gpu_memory.cpp
new file mode 100644
index 000000000..7ea407795
--- /dev/null
+++ b/lib/gpu/coul_long_gpu_memory.cpp
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifdef USE_OPENCL
+#include "coul_long_gpu_cl.h"
+#else
+#include "coul_long_gpu_ptx.h"
+#endif
+
+#include "coul_long_gpu_memory.h"
+#include <cassert>
+#define CL_GPU_MemoryT CL_GPU_Memory<numtyp, acctyp>
+
+extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
+
+template <class numtyp, class acctyp>
+CL_GPU_MemoryT::CL_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
+                                  _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+CL_GPU_MemoryT::~CL_GPU_Memory() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int CL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int CL_GPU_MemoryT::init(const int nlocal, const int nall, const int max_nbors,
+                         const int maxspecial, const double cell_size,
+                         const double gpu_split, FILE *_screen,
+                         const double host_cut_coulsq,
+                         double *host_special_coul, const double qqrd2e,
+                         const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,coul_long_gpu_kernel);
+  if (success!=0)
+    return success;
+
+  // we don't have atom types for coulomb only,
+  // but go with the minimum so that we can use
+  // the same infrastructure as lj/cut/coul/long/gpu.
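+  // the padded dummy type count below also decides, via shared_types,
+  // whether loop() launches the shared-memory kernel_pair_fast variant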
+ int lj_types=1; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; i<lj_types*lj_types; i++) + host_write[i]=0.0; + + lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_coul[i]; + } + ucl_copy(sp_cl,host_write,4,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes(); + return 0; +} + +template <class numtyp, class acctyp> +void CL_GPU_MemoryT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_cl.clear(); + this->clear_atomic(); +} + +template <class numtyp, class acctyp> +double CL_GPU_MemoryT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CL_GPU_Memory<numtyp,acctyp>); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template <class numtyp, class acctyp> +void CL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), + &lj3.begin(), &sp_cl.begin(), + &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), + &_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(), + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CL_GPU_Memory<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/coul_long_gpu_memory.h b/lib/gpu/coul_long_gpu_memory.h new file mode 100644 index 000000000..04914a251 --- /dev/null +++ b/lib/gpu/coul_long_gpu_memory.h @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef CL_GPU_MEMORY_H
+#define CL_GPU_MEMORY_H
+
+#include "charge_gpu_memory.h"
+
+template <class numtyp, class acctyp>
+class CL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
+ public:
+  CL_GPU_Memory();
+  ~CL_GPU_Memory();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on the card **/
+  int init(const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1 dummy (unused; kept so the kernel interface matches lj/cut/coul/long)
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3 dummy (unused; kept so the kernel interface matches lj/cut/coul/long)
+  UCL_D_Vec<numtyp4> lj3;
+  /// Special Coul values [0-3]
+  UCL_D_Vec<numtyp> sp_cl;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _cut_coulsq, _qqrd2e, _g_ewald;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+#endif
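
---
Note: a minimal host-side sketch of how the entry points in coul_long_gpu.cpp
are meant to be driven by a pair style -- cl_gpu_init() once at setup, the
compute calls each timestep, and cl_gpu_clear() at teardown. Every concrete
value below (cutoff, skin, atom counts, g_ewald) is an illustrative assumption,
not a value taken from this patch; qqrd2e = 332.06371 is the Coulomb conversion
constant LAMMPS uses for "real" units.

    // sketch.cpp -- hypothetical driver; links against lib/gpu
    #include <cstdio>

    // Declarations matching lib/gpu/coul_long_gpu.cpp above.
    int cl_gpu_init(const int inum, const int nall, const int max_nbors,
                    const int maxspecial, const double cell_size, int &gpu_mode,
                    FILE *screen, double host_cut_coulsq,
                    double *host_special_coul, const double qqrd2e,
                    const double g_ewald);
    void cl_gpu_clear();

    int main() {
      double special_coul[4] = {1.0, 0.0, 0.0, 0.0};  // assumed special bonds
      const double cut_coul = 10.0, skin = 2.0;       // assumed cutoff + skin
      int gpu_mode;
      // g_ewald would normally come from the kspace solver; 0.3 is assumed.
      int err = cl_gpu_init(1000, 1500, 300, 8, cut_coul + skin, gpu_mode,
                            stdout, cut_coul * cut_coul, special_coul,
                            332.06371, 0.3);
      if (err != 0)
        fprintf(stdout, "init failed: %d\n", err);  // codes documented in the .h
      else
        cl_gpu_clear();
      return 0;
    }

Per-timestep use would then call cl_gpu_compute() with the CPU-built neighbor
list, or cl_gpu_compute_n() to have the neighbor list built on the device, as
in the other */gpu pair styles.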