Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88069560
lal_neighbor.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Oct 16, 15:08
Size
19 KB
Mime Type
text/x-c
Expires
Fri, Oct 18, 15:08 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
21710996
Attached To
rLAMMPS lammps
lal_neighbor.cpp
View Options
/***************************************************************************
neighbor.cpp
-------------------
W. Michael Brown (ORNL)
Peng Wang (Nvidia)
Class for handling neighbor lists
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov, penwang@nvidia.com
***************************************************************************/
#include "lal_precision.h"
#include "lal_neighbor.h"
#include "lal_device.h"
#include "math.h"
using namespace LAMMPS_AL;
int Neighbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor==1)
return (max_nbors+2)*sizeof(int);
else if (_gpu_nbor==2)
return (max_nbors+3)*sizeof(int);
else if (_use_packing)
return ((max_nbors+2)*2)*sizeof(int);
else
return (max_nbors+3)*sizeof(int);
}
bool Neighbor::init(NeighborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const int gpu_nbor, const int gpu_host,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build,
const int threads_per_atom, const int warp_size,
const bool time_device,
const std::string compile_flags) {
clear();
_threads_per_atom=threads_per_atom;
_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_max_block_nbor_build=block_nbor_build;
_block_nbor_build=block_nbor_build;
_warp_size=warp_size;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
_time_device=time_device;
if (gpu_host==0)
_gpu_host=false;
else if (gpu_host==1)
_gpu_host=true;
else
// Not yet implemented
assert(0==1);
if (pre_cut || gpu_nbor==0)
_alloc_packed=true;
else
_alloc_packed=false;
if (pre_cut)
_packed_permissions=UCL_READ_WRITE;
else
_packed_permissions=UCL_READ_ONLY;
bool success=true;
// Initialize timers for the selected GPU
_nbor_time_avail=false;
time_nbor.init(*dev);
time_kernel.init(*dev);
time_hybrid1.init(*dev);
time_hybrid2.init(*dev);
time_transpose.init(*dev);
time_nbor.zero();
time_kernel.zero();
time_hybrid1.zero();
time_hybrid2.zero();
time_transpose.zero();
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (_max_atoms==0)
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
_maxspecial=maxspecial;
if (gpu_nbor==0)
_maxspecial=0;
if (gpu_nbor==0)
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor,compile_flags);
return success;
}
void Neighbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor>0)
success=success &&
(dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
_packed_permissions)==UCL_SUCCESS);
dev_acc.clear();
success=success && (dev_acc.alloc(_max_atoms,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
}
if (_max_host>0) {
nbor_host.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
UCL_READ_WRITE)==UCL_SUCCESS) && success;
success=success && (dev_numj_host.alloc(_max_host,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=nbor_host.host.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
} else {
// Some OpenCL implementations return errors for NULL pointers as args
nbor_host.device.view(dev_nbor);
dev_numj_host.view(dev_nbor);
}
if (_maxspecial>0) {
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
int at=_max_atoms+_max_host;
success=success && (dev_nspecial.alloc(3*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special.alloc(_maxspecial*at,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
dev_special_t.row_bytes();
}
_allocated=true;
}
void Neighbor::clear() {
_gpu_bytes=0.0;
_cell_bytes=0.0;
_c_bytes=0.0;
_bin_time=0.0;
if (_ncells>0) {
_ncells=0;
cell_counts.clear();
if (_gpu_nbor==2)
delete [] cell_iter;
}
if (_allocated) {
_allocated=false;
_nbor_time_avail=false;
host_packed.clear();
host_acc.clear();
dev_acc.clear();
dev_nbor.clear();
nbor_host.clear();
dev_packed.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
time_kernel.clear();
time_nbor.clear();
time_hybrid1.clear();
time_hybrid2.clear();
time_transpose.clear();
}
}
double Neighbor::host_memory_usage() const {
if (_gpu_nbor>0) {
if (_gpu_host)
return nbor_host.device.row_bytes()*nbor_host.rows()+
host_ilist.row_bytes()+host_jlist.row_bytes();
else
return 0;
} else
return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
sizeof(Neighbor);
}
void Neighbor::get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[ii]=nj;
host_acc[ii+inum]=acc_count;
acc_count+=nj;
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
UCL_H_Vec<int> host_view;
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
host_view[i] = ii;
}
ucl_copy(dev_acc,host_view,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
time_kernel.stop();
}
}
// This is the same as get host, but the requirement that ilist[i]=i and
// inum=nlocal is forced to be true to allow direct indexing of neighbors of
// neighbors
void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
if (nlist!=inum)
host_acc.zero(inum);
for (int ii=0; ii<nlist; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[i]=nj;
host_acc[i+inum]=acc_count;
acc_count+=nj;
}
for (int i=0; i<inum; i++) {
int nj=host_acc[i];
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
time_kernel.stop();
}
}
template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
mn=(mn/_threads_per_atom+1)*_threads_per_atom;
success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
int *ptr=nbor_host.host.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=nbor_host.row_bytes();
} else {
nbor_host.device.view(dev_nbor);
dev_numj_host.view(dev_nbor);
}
if (_alloc_packed) {
success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
_max_nbors=mn;
}
}
template <class numtyp, class acctyp>
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int nall, Atom<numtyp,acctyp> &atom,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, bool &success,
int &mn) {
_nbor_time_avail=true;
const int nt=inum+host_inum;
// Calculate number of cells and allocate storage for binning as necessary
int ncellx, ncelly, ncellz, ncell_3d;
int ghost_cells=2*_cells_in_cutoff;
ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
cell_counts.clear();
if (_gpu_nbor==2) {
if (_ncells>0)
delete [] cell_iter;
cell_iter = new int[ncell_3d+1];
cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
} else {
cell_counts.device.clear();
cell_counts.device.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=cell_counts.device.row_bytes();
}
const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial;
UCL_H_Vec<tagint> view_special, view_tag;
view_nspecial.view(nspecial[0],nt*3,*dev);
view_special.view(special[0],nt*_maxspecial,*dev);
view_tag.view(tag,nall,*dev);
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
ucl_copy(atom.dev_tag,view_tag,nall,false);
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
time_transpose.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
time_transpose.stop();
}
// If binning on CPU, do this now
if (_gpu_nbor==2) {
double stime = MPI_Wtime();
int *cell_id=atom.host_cell_id.begin();
int *particle_id=atom.host_particle_id.begin();
// Build cell list on CPU
cell_counts.host.zero();
double i_cell_size=1.0/_cell_size;
int offset_hi=_cells_in_cutoff+1;
for (int i=0; i<nt; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
int ix = static_cast<int>(px*i_cell_size+1);
ix = std::max(ix,_cells_in_cutoff);
ix = std::min(ix,ncellx-offset_hi);
int iy = static_cast<int>(py*i_cell_size+1);
iy = std::max(iy,_cells_in_cutoff);
iy = std::min(iy,ncelly-offset_hi);
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,_cells_in_cutoff);
iz = std::min(iz,ncellz-offset_hi);
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
cell_counts[id+1]++;
}
for (int i=nt; i<nall; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
int ix = static_cast<int>(px*i_cell_size+1);
ix = std::max(ix,0);
ix = std::min(ix,ncellx-1);
int iy = static_cast<int>(py*i_cell_size+1);
iy = std::max(iy,0);
iy = std::min(iy,ncelly-1);
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,0);
iz = std::min(iz,ncellz-1);
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
cell_counts[id+1]++;
}
mn=0;
for (int i=0; i<_ncells; i++)
mn=std::max(mn,cell_counts[i]);
mn*=8;
set_nbor_block_size(mn/2);
resize_max_neighbors<numtyp,acctyp>(mn,success);
if (!success)
return;
_total_atoms=nt;
cell_iter[0]=0;
for (int i=1; i<_ncells; i++) {
cell_counts[i]+=cell_counts[i-1];
cell_iter[i]=cell_counts[i];
}
time_hybrid1.start();
cell_counts.update_device(true);
time_hybrid1.stop();
for (int i=0; i<nall; i++) {
int celli=cell_id[i];
int ploc=cell_iter[celli];
cell_iter[celli]++;
particle_id[ploc]=i;
}
time_hybrid2.start();
ucl_copy(atom.dev_particle_id,atom.host_particle_id,true);
time_hybrid2.stop();
_bin_time+=MPI_Wtime()-stime;
}
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.x,4);
// If binning on GPU, do this now
if (_gpu_nbor==1) {
const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
&atom.dev_particle_id, &sublo0, &sublo1,
&sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
&nt, &nall, &_cells_in_cutoff);
atom.sort_neighbor(nall);
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
&ncell_3d);
}
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
(ncellz-ghost_cells),cell_block,1);
_shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
&cell_counts, &dev_nbor, &nbor_host,
&dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
&ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom, &_cells_in_cutoff);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
numj.view_offset(inum,dev_nbor,inum);
ucl_copy(host_acc,numj,inum,true);
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_numj_host,nt-inum,true);
}
if (_gpu_nbor!=2) {
host_acc.sync();
mn=host_acc[0];
for (int i=1; i<nt; i++)
mn=std::max(mn,host_acc[i]);
set_nbor_block_size(mn);
if (mn>_max_nbors) {
resize_max_neighbors<numtyp,acctyp>(mn,success);
if (!success)
return;
time_kernel.stop();
if (_time_device)
time_kernel.add_to_total();
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
nspecial, special, success, mn);
return;
}
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>
(nt*_threads_per_atom)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host,
&atom.dev_tag, &dev_nspecial, &dev_special,
&inum, &nt, &_max_nbors, &_threads_per_atom);
}
time_kernel.stop();
time_nbor.start();
if (inum<nt) {
nbor_host.update_host(true);
nbor_host.sync();
}
time_nbor.stop();
}
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
(double **x, const int inum, const int host_inum, const int nall,
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
tagint *, int **, tagint **, bool &success, int &mn);
Event Timeline
Log In to Comment