/***************************************************************************
                                 neighbor.cpp
                             -------------------
                            W. Michael Brown (ORNL)
                              Peng Wang (Nvidia)

  Class for handling neighbor lists

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov, penwang@nvidia.com
 ***************************************************************************/
#include "lal_precision.h"
#include "lal_neighbor.h"
#include "lal_device.h"
#include "math.h"
using namespace LAMMPS_AL;
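
// Device bytes required per atom for neighbor storage under the current
// neighbor-build mode.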
int Neighbor::bytes_per_atom(const int max_nbors) const {
  if (_gpu_nbor==1)
    return (max_nbors+2)*sizeof(int);
  else if (_gpu_nbor==2)
    return (max_nbors+3)*sizeof(int);
  else if (_use_packing)
    return ((max_nbors+2)*2)*sizeof(int);
  else
    return (max_nbors+3)*sizeof(int);
}
bool Neighbor::init(NeighborShared *shared, const int inum,
                    const int host_inum, const int max_nbors,
                    const int maxspecial, UCL_Device &devi,
                    const int gpu_nbor, const int gpu_host,
                    const bool pre_cut, const int block_cell_2d,
                    const int block_cell_id, const int block_nbor_build,
                    const int threads_per_atom, const bool time_device) {
  clear();

  _threads_per_atom=threads_per_atom;
  _block_cell_2d=block_cell_2d;
  _block_cell_id=block_cell_id;
  _block_nbor_build=block_nbor_build;
  _shared=shared;
  dev=&devi;
  _gpu_nbor=gpu_nbor;
  _time_device=time_device;
  if (gpu_host==0)
    _gpu_host=false;
  else if (gpu_host==1)
    _gpu_host=true;
  else
    // Not yet implemented
    assert(0==1);

  if (pre_cut || gpu_nbor==0)
    _alloc_packed=true;
  else
    _alloc_packed=false;

  bool success=true;

  // Initialize timers for the selected GPU
  _nbor_time_avail=false;
  time_nbor.init(*dev);
  time_kernel.init(*dev);
  time_hybrid1.init(*dev);
  time_hybrid2.init(*dev);
  time_nbor.zero();
  time_kernel.zero();
  time_hybrid1.zero();
  time_hybrid2.zero();

  _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
  if (_max_atoms==0)
    _max_atoms=1000;

  _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
  _max_nbors=max_nbors;

  _maxspecial=maxspecial;
  if (gpu_nbor==0)
    _maxspecial=0;

  if (gpu_nbor==0)
    success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
                                          UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
  alloc(success);
  if (!success)
    return false;

  if (_use_packing==false)
    _shared->compile_kernels(devi,gpu_nbor);

  return success;
}
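
// Allocate host and device storage for neighbor data, sized from the
// current _max_atoms, _max_host, _max_nbors, and _maxspecial limits.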
void Neighbor::alloc(bool &success) {
  dev_nbor.clear();
  host_acc.clear();
  int nt=_max_atoms+_max_host;
  if (_use_packing==false || _gpu_nbor>0)
    success=success &&
            (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
  else
    success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
  success=success && (host_acc.alloc(nt*2,*dev,
                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);

  _c_bytes=dev_nbor.row_bytes();
  if (_alloc_packed) {
    dev_packed.clear();
    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
                                         UCL_READ_ONLY)==UCL_SUCCESS);
    _c_bytes+=dev_packed.row_bytes();
  }
  if (_max_host>0) {
    host_nbor.clear();
    dev_host_nbor.clear();
    dev_host_numj.clear();
    host_ilist.clear();
    host_jlist.clear();
    success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
                                        UCL_RW_OPTIMIZED)==UCL_SUCCESS);
    success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
                                            *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
    success=success && (dev_host_numj.alloc(_max_host,*dev,
                                            UCL_WRITE_ONLY)==UCL_SUCCESS);
    success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
    if (!success)
      return;
    for (int i=0; i<nt; i++)
      host_ilist[i]=i;
    success=success && (host_jlist.alloc(_max_host,*dev,
                                         UCL_NOT_PINNED)==UCL_SUCCESS);
    if (!success)
      return;
    int *ptr=host_nbor.begin();
    for (int i=0; i<_max_host; i++) {
      host_jlist[i]=ptr;
      ptr+=_max_nbors;
    }
    _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
  } else {
    // Some OpenCL implementations return errors for NULL pointers as args
    dev_host_nbor.view(dev_nbor);
    dev_host_numj.view(dev_nbor);
  }
  if (_maxspecial>0) {
    dev_nspecial.clear();
    dev_special.clear();
    dev_special_t.clear();
    int at=_max_atoms+_max_host;
    success=success && (dev_nspecial.alloc(3*at,*dev,
                                           UCL_READ_ONLY)==UCL_SUCCESS);
    success=success && (dev_special.alloc(_maxspecial*at,*dev,
                                          UCL_READ_ONLY)==UCL_SUCCESS);
    success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
                                            UCL_READ_ONLY)==UCL_SUCCESS);
    _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
                dev_special_t.row_bytes();
  }
  _allocated=true;
}
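
// Release all host and device neighbor storage and reset byte and timing
// counters.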
void Neighbor::clear() {
  _gpu_bytes=0.0;
  _cell_bytes=0.0;
  _c_bytes=0.0;
  _bin_time=0.0;
  if (_ncells>0) {
    _ncells=0;
    dev_cell_counts.clear();
    if (_gpu_nbor==2) {
      host_cell_counts.clear();
      delete [] cell_iter;
    }
  }
  if (_allocated) {
    _allocated=false;
    _nbor_time_avail=false;

    host_packed.clear();
    host_acc.clear();
    dev_nbor.clear();
    dev_host_nbor.clear();
    dev_packed.clear();
    host_nbor.clear();
    dev_host_numj.clear();
    host_ilist.clear();
    host_jlist.clear();
    dev_nspecial.clear();
    dev_special.clear();
    dev_special_t.clear();

    time_kernel.clear();
    time_nbor.clear();
    time_hybrid1.clear();
    time_hybrid2.clear();
  }
}
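
// Return the bytes of host memory used for neighbor storage.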
double Neighbor::host_memory_usage() const {
  if (_gpu_nbor>0) {
    if (_gpu_host)
      return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
             host_jlist.row_bytes();
    else
      return 0;
  } else
    return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
           sizeof(Neighbor);
}
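
// Copy a host-built neighbor list to the device, repacking the j-indices.
// The packed indices are staged through host_packed, whose two IJ_SIZE
// halves are alternated so that filling one half can overlap the
// asynchronous copy of the other.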
void Neighbor::get_host(const int inum, int *ilist, int *numj,
                        int **firstneigh, const int block_size) {
  _nbor_time_avail=true;
  time_nbor.start();

  UCL_H_Vec<int> ilist_view;
  ilist_view.view(ilist,inum,*dev);
  ucl_copy(dev_nbor,ilist_view,false);

  UCL_D_Vec<int> nbor_offset;
  UCL_H_Vec<int> host_offset;

  int copy_count=0;
  int ij_count=0;
  int acc_count=0;
  int dev_count=0;
  int *h_ptr=host_packed.begin();
  _nbor_pitch=inum;

  for (int ii=0; ii<inum; ii++) {
    int i=ilist[ii];
    int nj=numj[i];
    host_acc[ii]=nj;
    host_acc[ii+inum]=acc_count;
    acc_count+=nj;

    int *jlist=firstneigh[i];
    for (int jj=0; jj<nj; jj++) {
      *h_ptr=jlist[jj];
      h_ptr++;
      ij_count++;

      if (ij_count==IJ_SIZE) {
        dev_nbor.sync();
        host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
        nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
        ucl_copy(nbor_offset,host_offset,true);
        copy_count++;
        ij_count=0;
        dev_count+=IJ_SIZE;
        h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
      }
    }
  }
  if (ij_count!=0) {
    dev_nbor.sync();
    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
    ucl_copy(nbor_offset,host_offset,true);
  }
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
  ucl_copy(acc_view,host_acc,true);
  time_nbor.stop();

  if (_use_packing==false) {
    time_kernel.start();
    int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
                                 block_size));
    _shared->k_nbor.set_size(GX,block_size);
    _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum,
                        &_threads_per_atom);
    time_kernel.stop();
  }
}
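
// Build the neighbor list on the device. Atoms are first binned into cells
// of edge length _cell_size, either on the CPU (_gpu_nbor==2) or on the GPU
// (_gpu_nbor==1); a cell-based kernel then builds per-atom neighbor lists
// from the cell-sorted data.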
template <class numtyp, class acctyp>
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
                               const int nall, Atom<numtyp,acctyp> &atom,
                               double *sublo, double *subhi, int *tag,
                               int **nspecial, int **special, bool &success,
                               int &mn) {
  _nbor_time_avail=true;
  const int nt=inum+host_inum;

  // Calculate number of cells and allocate storage for binning as necessary
  int ncellx, ncelly, ncellz, ncell_3d;
  ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
                                  2.0*_cell_size)/_cell_size));
  ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
                                  2.0*_cell_size)/_cell_size));
  ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
                                  2.0*_cell_size)/_cell_size));
  ncell_3d = ncellx * ncelly * ncellz;
  if (ncell_3d+1>_ncells) {
    dev_cell_counts.clear();
    dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
    if (_gpu_nbor==2) {
      if (_ncells>0) {
        host_cell_counts.clear();
        delete [] cell_iter;
      }
      cell_iter = new int[ncell_3d+1];
      host_cell_counts.alloc(ncell_3d+1,dev_nbor);
    }
    _ncells=ncell_3d+1;
    _cell_bytes=dev_cell_counts.row_bytes();
  }
  const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);

  // If binning on CPU, do this now
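  // (a counting sort: count atoms per cell, prefix-sum the counts into
  //  per-cell start offsets, then scatter particle ids into cell-sorted
  //  order; positions are clamped to within one cell of the subdomain
  //  bounds before cell ids are computed)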
  if (_gpu_nbor==2) {
    double stime = MPI_Wtime();
    int *cell_id=atom.host_cell_id.begin();
    int *particle_id=atom.host_particle_id.begin();

    // Build cell list on CPU
    host_cell_counts.zero();
    double m_cell_size=-_cell_size;
    double dx=subhi[0]-sublo[0]+_cell_size;
    double dy=subhi[1]-sublo[1]+_cell_size;
    double dz=subhi[2]-sublo[2]+_cell_size;
    for (int i=0; i<nall; i++) {
      double px, py, pz;
      px=x[i][0]-sublo[0];
      py=x[i][1]-sublo[1];
      pz=x[i][2]-sublo[2];
      if (px<m_cell_size) px=m_cell_size;
      if (py<m_cell_size) py=m_cell_size;
      if (pz<m_cell_size) pz=m_cell_size;
      if (px>dx) px=dx;
      if (py>dy) py=dy;
      if (pz>dz) pz=dz;

      int id=static_cast<int>(px/_cell_size + 1.0) +
             static_cast<int>(py/_cell_size + 1.0) * ncellx +
             static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
      cell_id[i]=id;
      host_cell_counts[id+1]++;
    }

    cell_iter[0]=0;
    for (int i=1; i<_ncells; i++) {
      host_cell_counts[i]+=host_cell_counts[i-1];
      cell_iter[i]=host_cell_counts[i];
    }
    time_hybrid1.start();
    ucl_copy(dev_cell_counts,host_cell_counts,true);
    time_hybrid1.stop();
    for (int i=0; i<nall; i++) {
      int celli=cell_id[i];
      int ploc=cell_iter[celli];
      cell_iter[celli]++;
      particle_id[ploc]=i;
    }
    time_hybrid2.start();
    ucl_copy(atom.dev_particle_id,atom.host_particle_id,true);
    time_hybrid2.stop();
    _bin_time+=MPI_Wtime()-stime;
  }
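
  // Copy the special-bond arrays and atom tags to the device, then transpose
  // the special-neighbor matrix into the layout the build kernels expect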
  if (_maxspecial>0) {
    time_nbor.start();
    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
    view_nspecial.view(nspecial[0],nt*3,*dev);
    view_special.view(special[0],nt*_maxspecial,*dev);
    view_tag.view(tag,nall,*dev);
    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
    ucl_copy(atom.dev_tag,view_tag,nall,false);
    time_nbor.stop();
    if (_time_device)
      time_nbor.add_to_total();
    time_kernel.start();
    const int b2x=_block_cell_2d;
    const int b2y=_block_cell_2d;
    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
                             &_maxspecial,&nt);
  } else
    time_kernel.start();
  _nbor_pitch=inum;
  _shared->neigh_tex.bind_float(atom.dev_x,4);

  // If binning on GPU, do this now
  if (_gpu_nbor==1) {
    const int neigh_block=_block_cell_id;
    const int GX=(int)ceil((float)nall/neigh_block);
    const numtyp sublo0=static_cast<numtyp>(sublo[0]);
    const numtyp sublo1=static_cast<numtyp>(sublo[1]);
    const numtyp sublo2=static_cast<numtyp>(sublo[2]);
    const numtyp subhi0=static_cast<numtyp>(subhi[0]);
    const numtyp subhi1=static_cast<numtyp>(subhi[1]);
    const numtyp subhi2=static_cast<numtyp>(subhi[2]);
    _shared->k_cell_id.set_size(GX,neigh_block);
    _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
                           &atom.dev_particle_id.begin(),
                           &sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
                           &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
    atom.sort_neighbor(nall);

    /* calculate cell count */
    _shared->k_cell_counts.set_size(GX,neigh_block);
    _shared->k_cell_counts.run(&atom.dev_cell_id.begin(),
                               &dev_cell_counts.begin(), &nall, &ncell_3d);
  }

  /* build the neighbor list */
  const int cell_block=_block_nbor_build;
  _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
  _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                            &dev_cell_counts.begin(), &dev_nbor.begin(),
                            &dev_host_nbor.begin(), &dev_host_numj.begin(),
                            &_max_nbors,&cell_size_cast,
                            &ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
                            &_threads_per_atom);
  /* Get the maximum number of nbors and realloc if necessary */
  UCL_D_Vec<int> numj;
  numj.view_offset(inum,dev_nbor,inum);
  ucl_copy(host_acc,numj,inum,false);
  if (nt>inum) {
    UCL_H_Vec<int> host_offset;
    host_offset.view_offset(inum,host_acc,nt-inum);
    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
  }
  mn=host_acc[0];
  for (int i=1; i<nt; i++)
    mn=std::max(mn,host_acc[i]);
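
  // If any atom exceeded the per-atom neighbor capacity, grow the buffers
  // 10% beyond the observed maximum and rebuild the list from scratch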
  if (mn>_max_nbors) {
    mn=static_cast<int>(static_cast<double>(mn)*1.10);
    dev_nbor.clear();
    success=success &&
            (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
    _gpu_bytes=dev_nbor.row_bytes();
    if (_max_host>0) {
      host_nbor.clear();
      dev_host_nbor.clear();
      success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
      success=success && (dev_host_nbor.alloc(mn*_max_host,dev_nbor,
                                              UCL_WRITE_ONLY)==UCL_SUCCESS);
      int *ptr=host_nbor.begin();
      for (int i=0; i<_max_host; i++) {
        host_jlist[i]=ptr;
        ptr+=mn;
      }
      _gpu_bytes+=dev_host_nbor.row_bytes();
    } else
      dev_host_nbor.view(dev_nbor);
    if (_alloc_packed) {
      dev_packed.clear();
      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
                                           UCL_READ_ONLY)==UCL_SUCCESS);
      _gpu_bytes+=dev_packed.row_bytes();
    }
    if (!success)
      return;
    _max_nbors=mn;
    time_kernel.stop();
    if (_time_device)
      time_kernel.add_to_total();
    build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
                    nspecial, special, success, mn);
    return;
  }
  if (_maxspecial>0) {
    const int GX2=static_cast<int>(ceil(static_cast<double>
                                        (nt*_threads_per_atom)/cell_block));
    _shared->k_special.set_size(GX2,cell_block);
    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
                           &dev_host_numj.begin(), &atom.dev_tag.begin(),
                           &dev_nspecial.begin(), &dev_special.begin(),
                           &inum, &nt, &_max_nbors, &_threads_per_atom);
  }
  time_kernel.stop();
  time_nbor.start();
  if (_gpu_host)
    ucl_copy(host_nbor,dev_host_nbor,false);
  time_nbor.stop();
}
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
  (double **x, const int inum, const int host_inum, const int nall,
   Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
   int *, int **, int **, bool &success, int &mn);
