Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F104191853
pppm.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Mar 7, 03:12
Size
12 KB
Mime Type
text/x-c++
Expires
Sun, Mar 9, 03:12 (2 d)
Engine
blob
Format
Raw Data
Handle
24762335
Attached To
rLAMMPS lammps
pppm.cpp
View Options
/***************************************************************************
pppm.cpp
-------------------
W. Michael Brown (ORNL)
Class for PPPM acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "pppm_cl.h"
#else
#include "pppm_f_ptx.h"
#include "pppm_d_ptx.h"
#endif
#include "pppm.h"
#include <cassert>
using namespace LAMMPS_AL;
#define PPPMT PPPM<numtyp, acctyp, grdtyp, grdtyp4>
extern Device<PRECISION,ACC_PRECISION> global_device;
/// Construct an empty PPPM accelerator object.
/// Only the global device handle and the answer storage are set up here;
/// kernel compilation and all device allocations are deferred to init().
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPMT::PPPM() :
    _allocated(false),
    _compiled(false),
    _max_bytes(0) {
  device = &global_device;
  ans = new Answer<numtyp,acctyp>();
}
// Destructor: release all device resources before deleting the answer
// storage (clear() reports timings through ans, so the order matters).
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPMT::~PPPM() {
clear(0.0);
delete ans;
}
/// Estimated per-atom device memory footprint in bytes: atom storage plus
/// answer storage plus one additional byte of overhead.
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
int PPPMT::bytes_per_atom() const {
  const int atom_bytes   = device->atom.bytes_per_atom();
  const int answer_bytes = ans->bytes_per_atom();
  return atom_bytes + answer_bytes + 1;
}
// One-time device initialization for PPPM charge assignment/interpolation.
// Checks device capabilities, compiles the kernels, copies the spline
// (rho) coefficients to the device, and allocates the density brick, the
// electric-field brick, and the per-grid-point atom assignment storage.
// On success returns a pointer to the host-side density brick and stores
// the host-side field brick pointer in *vd_brick.  On failure returns 0
// with an error code in flag:
//   -5  grid type is double but the device lacks double precision
//   -4  device compute capability is below 1.1
//   -3  a device memory allocation failed
//   other nonzero values are propagated from device->init()
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, double **rho_coeff,
grdtyp **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, int &flag) {
_max_bytes=10;
screen=_screen;
bool success=true;
// Register this object's atom/answer storage with the device manager.
flag=device->init(*ans,nlocal,nall);
if (flag!=0)
return 0;
// Double-precision grids require double support on the device.
if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
flag=-5;
return 0;
}
// Reject pre-1.1 compute architectures.
if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
flag=-4;
return 0;
}
ucl_device=device->gpu;
atom=&device->atom;
// Kernel launch geometry: a block is _block_pencils pencils of
// _pencil_size threads each.
_block_size=device->pppm_block();
_pencil_size=device->num_mem_threads();
_block_pencils=_block_size/_pencil_size;
compile_kernels(*ucl_device);
// Initialize timers for the selected GPU
time_in.init(*ucl_device);
time_in.zero();
time_out.init(*ucl_device);
time_out.zero();
time_map.init(*ucl_device);
time_map.zero();
time_rho.init(*ucl_device);
time_rho.zero();
time_interp.init(*ucl_device);
time_interp.zero();
// Bind position (x,y,z,type) and charge textures for the kernels.
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_allocated=true;
_max_bytes=0;
_max_an_bytes=ans->gpu_bytes();
// Cache the spline order and the derived stencil bounds
// [_nlower,_nupper] used by the assignment kernels.
_order=order;
_order_m_1=order-1;
_order2=_order_m_1*_order;
_nlower=-(_order-1)/2;
_nupper=order/2;
_nxlo_out=nxlo_out;
_nylo_out=nylo_out;
_nzlo_out=nzlo_out;
_nxhi_out=nxhi_out;
_nyhi_out=nyhi_out;
_nzhi_out=nzhi_out;
_slab_volfactor=slab_volfactor;
_nx_pppm=nx_pppm;
_ny_pppm=ny_pppm;
_nz_pppm=nz_pppm;
// Initial per-grid-point atom capacity; spread() doubles this and
// retries if the error flag reports overflow.
_max_brick_atoms=10;
// Get rho_coeff on device
int n2lo=(1-order)/2;
int numel=order*( order/2 - n2lo + 1 );
success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
UCL_SUCCESS);
// View into the host coefficient table so the copy needs no staging.
UCL_H_Vec<double> view;
view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
ucl_copy(d_rho_coeff,view,true);
_max_bytes+=d_rho_coeff.row_bytes();
// Allocate storage for grid
_npts_x=nxhi_out-nxlo_out+1;
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
// d_brick/h_vd_brick hold 4 values per grid point; h_brick holds 1.
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
*vd_brick=h_vd_brick.begin();
_max_bytes+=d_brick.row_bytes();
// Allocate vector with count of atoms assigned to each grid point
_nlocal_x=_npts_x+_nlower-_nupper;
_nlocal_y=_npts_y+_nlower-_nupper;
_nlocal_z=_npts_z+_nlower-_nupper;
_nlocal_yx=_nlocal_x*_nlocal_y;
_atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
UCL_SUCCESS);
_max_bytes+=d_brick_counts.row_bytes();
// Allocate storage for atoms assigned to each grid point
success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
*ucl_device)==UCL_SUCCESS);
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
UCL_SUCCESS);
if (!success) {
flag=-3;
return 0;
}
d_error_flag.zero();
_max_bytes+=1;
_cpu_idle_time=0.0;
return h_brick.begin();
}
// Release all device/host buffers, kernels, and timers allocated by
// init().  cpu_time is the accumulated host time that is reported along
// with the k-space timing breakdown.  Safe to call when init() never ran
// or clear() already ran (guarded by _allocated).
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::clear(const double cpu_time) {
if (!_allocated)
return;
_allocated=false;
_precompute_done=false;
d_brick.clear();
h_brick.clear();
h_vd_brick.clear();
d_brick_counts.clear();
h_error_flag.clear();
d_error_flag.clear();
d_brick_atoms.clear();
// Flush pending timer events, then report the k-space timing summary.
acc_timers();
device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
*ans,_max_bytes+_max_an_bytes,cpu_time,
_cpu_idle_time,screen);
if (_compiled) {
k_particle_map.clear();
k_make_rho.clear();
k_interp.clear();
delete pppm_program;
_compiled=false;
}
time_in.clear();
time_out.clear();
time_map.clear();
time_rho.clear();
time_interp.clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Charge assignment that can be performed asynchronously
// ---------------------------------------------------------------------------
// Asynchronous charge assignment: copy atom data to the device, map each
// atom to its grid point (k_particle_map), spread charges onto the density
// brick (k_make_rho), and start the device->host copy of the brick and the
// error flag.  Sets _precompute_done so spread() knows the work is queued.
// On allocation failure, success is set false by the resize helpers and
// the routine returns early.
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo,
const double delxinv, const double delyinv,
const double delzinv) {
acc_timers();
// Nothing to do without local atoms; keep timers consistent.
if (nlocal==0) {
zero_timers();
return;
}
ans->inum(nlocal);
// ago==0 signals a fresh neighbor build; storage may need resizing.
if (ago==0) {
resize_atom(nlocal,nall,success);
resize_local(nlocal,success);
if (!success)
return;
double bytes=ans->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// Stage and upload positions/types and charges.
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->add_x_data(host_x,host_type);
atom->add_q_data();
time_map.start();
// Compute the block size and grid size to keep all cores busy
int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
// Boxlo adjusted to be upper left brick and shift for even spline order
double shift=0.0;
if (_order % 2)
shift=0.5;
_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
_delxinv=delxinv;
_delyinv=delyinv;
_delzinv=delzinv;
// Inverse cell volume folded into the charge weight.
double delvolinv = delxinv*delyinv*delzinv;
grdtyp f_delvolinv = delvolinv;
// Reset per-grid-point atom counts before the mapping kernel.
device->zero(d_brick_counts,d_brick_counts.numel());
k_particle_map.set_size(GX,BX);
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
time_map.stop();
time_rho.start();
// One pencil of threads per (y,z) row of the brick.
BX=block_size();
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
_block_pencils));
k_make_rho.set_size(GX,BX);
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
&_nlocal_z, &_order_m_1, &_order, &_order2);
time_rho.stop();
time_out.start();
// Start async copies; spread() synchronizes on time_out later.
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
ucl_copy(h_error_flag,d_error_flag,true);
time_out.stop();
_precompute_done=true;
}
// ---------------------------------------------------------------------------
// Charge spreading stuff
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
int PPPMT::spread(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo,
const double delxinv, const double delyinv,
const double delzinv) {
if (_precompute_done==false) {
atom->acc_timers();
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
delyinv,delzinv);
}
device->stop_host_timer();
if (!success || nlocal==0)
return 0;
double t=MPI_Wtime();
time_out.sync_stop();
_cpu_idle_time+=MPI_Wtime()-t;
_precompute_done=false;
if (h_error_flag[0]==2) {
// Not enough storage for atoms on the brick
_max_brick_atoms*=2;
d_error_flag.zero();
d_brick_atoms.clear();
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
_max_bytes+=d_brick_atoms.row_bytes();
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
return h_error_flag[0];
}
// ---------------------------------------------------------------------------
// Interpolate grid values back to the atoms
// ---------------------------------------------------------------------------
// Interpolate per-grid-point values (h_vd_brick, filled by the host) back
// to per-atom answers on the device, scaled by qqrd2e_scale.  The answer
// copy is queued asynchronously and registered with the device manager.
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::interp(const grdtyp qqrd2e_scale) {
time_in.start();
// Upload the host-side field brick to the device (async).
ucl_copy(d_brick,h_vd_brick,true);
time_in.stop();
time_interp.start();
// Compute the block size and grid size to keep all cores busy
int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
k_interp.set_size(GX,BX);
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
time_interp.stop();
// Queue the device->host answer transfer and hand the answer object to
// the device manager for later accumulation.
ans->copy_answers(false,false,false,false);
device->add_ans_object(ans);
}
/// Estimate of host (CPU) memory consumed by this accelerator object,
/// including the shared atom storage managed by the device.
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
double PPPMT::host_memory_usage() const {
  const double atom_usage = device->atom.host_memory_usage();
  const double self_usage = sizeof(PPPM<numtyp,acctyp,grdtyp,grdtyp4>);
  return atom_usage + self_usage;
}
// Compile (or load) the PPPM kernels for the given device and resolve the
// kernel handles and textures.  For OpenCL the kernels are built from
// source with grdtyp defined on the compile line; for CUDA, precompiled
// PTX is selected by the grid precision.  No-op if already compiled or if
// a double grid is requested on a single-precision-only device.
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
#ifdef USE_OPENCL
// Inject the grid scalar/vector types into the OpenCL build options.
flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
ucl_template_name<grdtyp>()+"4";
#endif
pppm_program=new UCL_Program(dev);
#ifdef USE_OPENCL
pppm_program->load_string(pppm,flags.c_str());
#else
// CUDA path: pick the PTX matching the grid precision.
if (sizeof(grdtyp)==sizeof(float))
pppm_program->load_string(pppm_f,flags.c_str());
else
pppm_program->load_string(pppm_d,flags.c_str());
#endif
k_particle_map.set_function(*pppm_program,"particle_map");
k_make_rho.set_function(*pppm_program,"make_rho");
k_interp.set_function(*pppm_program,"interp");
pos_tex.get_texture(*pppm_program,"pos_tex");
q_tex.get_texture(*pppm_program,"q_tex");
_compiled=true;
}
// Explicit instantiations for single- and double-precision grid types.
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
Event Timeline
Log In to Comment