pppm_gpu_memory.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Jun 30, 06:18

pppm_gpu_memory.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
	------------------------------------------------------------------------- */

	#ifdef USE_OPENCL
	#include "pppm_gpu_cl.h"
	#else
	#include "pppm_f_gpu_ptx.h"
	#include "pppm_d_gpu_ptx.h"
	#endif
	#include "pppm_gpu_memory.h"
	#include <cassert>

	#define PPPMGPUMemoryT PPPMGPUMemory<numtyp, acctyp, grdtyp, grdtyp4>

	extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;

	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	PPPMGPUMemoryT::PPPMGPUMemory() : _allocated(false), _compiled(false),
	_max_bytes(0) {
	device=&pair_gpu_device;
	ans=new PairGPUAns<numtyp,acctyp>();
	}

	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	PPPMGPUMemoryT::~PPPMGPUMemory() {
	clear(0.0);
	delete ans;
	}

	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	int PPPMGPUMemoryT::bytes_per_atom() const {
	return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
	}

	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
	const int order, const int nxlo_out,
	const int nylo_out, const int nzlo_out,
	const int nxhi_out, const int nyhi_out,
	const int nzhi_out, grdtyp **rho_coeff,
	grdtyp **vd_brick, const double slab_volfactor,
	const int nx_pppm, const int ny_pppm,
	const int nz_pppm, int &flag) {
	_max_bytes=10;
	screen=_screen;
	bool success=true;

	flag=device->init(*ans,nlocal,nall);
	if (flag!=0)
	return 0;
	if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
	flag=-5;
	return 0;
	}
	if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
	flag=-4;
	return 0;
	}

	ucl_device=device->gpu;
	atom=&device->atom;

	_block_size=device->pppm_block();
	_pencil_size=device->num_mem_threads();
	_block_pencils=_block_size/_pencil_size;

	compile_kernels(*ucl_device);

	// Initialize timers for the selected GPU
	time_in.init(*ucl_device);
	time_in.zero();
	time_out.init(*ucl_device);
	time_out.zero();
	time_map.init(*ucl_device);
	time_map.zero();
	time_rho.init(*ucl_device);
	time_rho.zero();
	time_interp.init(*ucl_device);
	time_interp.zero();

	pos_tex.bind_float(atom->dev_x,4);
	q_tex.bind_float(atom->dev_q,1);

	_allocated=true;
	_max_bytes=0;
	_max_an_bytes=ans->gpu_bytes();

	_order=order;
	_order_m_1=order-1;
	_order2=_order_m_1*_order;
	_nlower=-(_order-1)/2;
	_nupper=order/2;
	_nxlo_out=nxlo_out;
	_nylo_out=nylo_out;
	_nzlo_out=nzlo_out;
	_nxhi_out=nxhi_out;
	_nyhi_out=nyhi_out;
	_nzhi_out=nzhi_out;

	_slab_volfactor=slab_volfactor;
	_nx_pppm=nx_pppm;
	_ny_pppm=ny_pppm;
	_nz_pppm=nz_pppm;

	_max_brick_atoms=10;

	// Get rho_coeff on device
	int n2lo=(1-order)/2;
	int numel=order*( order/2 - n2lo + 1 );
	success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
	UCL_SUCCESS);
	UCL_H_Vec<grdtyp> view;
	view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
	ucl_copy(d_rho_coeff,view,true);
	_max_bytes+=d_rho_coeff.row_bytes();

	// Allocate storage for grid
	_npts_x=nxhi_out-nxlo_out+1;
	_npts_y=nyhi_out-nylo_out+1;
	_npts_z=nzhi_out-nzlo_out+1;
	_npts_yx=_npts_x*_npts_y;
	success=success && (d_brick.alloc(_npts_x_npts_y_npts_z4,ucl_device)==
	UCL_SUCCESS);
	success=success && (h_brick.alloc(_npts_x_npts_y_npts_z,*ucl_device)==
	UCL_SUCCESS);
	success=success && (h_vd_brick.alloc(_npts_x_npts_y_npts_z4,ucl_device)==
	UCL_SUCCESS);
	*vd_brick=h_vd_brick.begin();
	_max_bytes+=d_brick.row_bytes();

	// Allocate vector with count of atoms assigned to each grid point
	_nlocal_x=_npts_x+_nlower-_nupper;
	_nlocal_y=_npts_y+_nlower-_nupper;
	_nlocal_z=_npts_z+_nlower-_nupper;
	_nlocal_yx=_nlocal_x*_nlocal_y;
	_atom_stride=_nlocal_x_nlocal_y_nlocal_z;
	success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
	UCL_SUCCESS);
	_max_bytes+=d_brick_counts.row_bytes();

	// Allocate storage for atoms assigned to each grid point
	success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
	*ucl_device)==UCL_SUCCESS);
	_max_bytes+=d_brick_atoms.row_bytes();

	// Allocate error flags for checking out of bounds atoms
	success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
	success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
	UCL_SUCCESS);
	if (!success) {
	flag=-3;
	return 0;
	}

	d_error_flag.zero();
	_max_bytes+=1;

	_cpu_idle_time=0.0;

	return h_brick.begin();
	}

	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	void PPPMGPUMemoryT::clear(const double cpu_time) {
	if (!_allocated)
	return;
	_allocated=false;
	_precompute_done=false;

	d_brick.clear();
	h_brick.clear();
	h_vd_brick.clear();
	d_brick_counts.clear();
	h_error_flag.clear();
	d_error_flag.clear();
	d_brick_atoms.clear();

	acc_timers();
	device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
	*ans,_max_bytes+_max_an_bytes,cpu_time,
	_cpu_idle_time,screen);

	if (_compiled) {
	k_particle_map.clear();
	k_make_rho.clear();
	k_interp.clear();
	delete pppm_program;
	_compiled=false;
	}

	time_in.clear();
	time_out.clear();
	time_map.clear();
	time_rho.clear();
	time_interp.clear();

	device->clear();
	}

	// ---------------------------------------------------------------------------
	// Charge assignment that can be performed asynchronously
	// ---------------------------------------------------------------------------
	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	void PPPMGPUMemoryT::_precompute(const int ago, const int nlocal, const int nall,
	double *host_x, int host_type, bool &success,
	double host_q, double boxlo,
	const double delxinv, const double delyinv,
	const double delzinv) {
	acc_timers();
	if (nlocal==0) {
	zero_timers();
	return;
	}

	ans->inum(nlocal);

	if (ago==0) {
	resize_atom(nlocal,nall,success);
	resize_local(nlocal,success);
	if (!success)
	return;

	double bytes=ans->gpu_bytes();
	if (bytes>_max_an_bytes)
	_max_an_bytes=bytes;
	}

	atom->cast_x_data(host_x,host_type);
	atom->cast_q_data(host_q);
	atom->add_x_data(host_x,host_type);
	atom->add_q_data();

	time_map.start();

	// Compute the block size and grid size to keep all cores busy
	int BX=this->block_size();
	int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));

	int ainum=this->ans->inum();

	// Boxlo adjusted to be upper left brick and shift for even spline order
	double shift=0.0;
	if (_order % 2)
	shift=0.5;
	_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
	_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
	_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;

	_delxinv=delxinv;
	_delyinv=delyinv;
	_delzinv=delzinv;
	double delvolinv = delxinvdelyinvdelzinv;
	grdtyp f_delvolinv = delvolinv;

	device->zero(d_brick_counts,d_brick_counts.numel());
	k_particle_map.set_size(GX,BX);
	k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
	&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
	&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
	&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
	&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
	time_map.stop();

	time_rho.start();
	BX=block_size();

	GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
	_block_pencils));
	k_make_rho.set_size(GX,BX);
	k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
	&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
	&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
	&_nlocal_z, &_order_m_1, &_order, &_order2);
	time_rho.stop();

	time_out.start();
	ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
	ucl_copy(h_error_flag,d_error_flag,true);
	time_out.stop();

	_precompute_done=true;
	}

	// ---------------------------------------------------------------------------
	// Charge spreading stuff
	// ---------------------------------------------------------------------------
	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
	double *host_x, int host_type, bool &success,
	double host_q, double boxlo,
	const double delxinv, const double delyinv,
	const double delzinv) {
	if (_precompute_done==false) {
	atom->acc_timers();
	_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
	delyinv,delzinv);
	}

	device->stop_host_timer();

	if (!success \|\| nlocal==0)
	return 0;

	double t=MPI_Wtime();
	time_out.sync_stop();
	_cpu_idle_time+=MPI_Wtime()-t;

	_precompute_done=false;

	if (h_error_flag[0]==2) {
	// Not enough storage for atoms on the brick
	_max_brick_atoms*=2;
	d_error_flag.zero();
	d_brick_atoms.clear();
	d_brick_atoms.alloc(_atom_stride_max_brick_atoms,ucl_device);
	_max_bytes+=d_brick_atoms.row_bytes();
	return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
	delxinv,delyinv,delzinv);
	}

	return h_error_flag[0];
	}

	// ---------------------------------------------------------------------------
	// Charge spreading stuff
	// ---------------------------------------------------------------------------
	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	void PPPMGPUMemoryT::interp(const grdtyp qqrd2e_scale) {
	time_in.start();
	ucl_copy(d_brick,h_vd_brick,true);
	time_in.stop();

	time_interp.start();
	// Compute the block size and grid size to keep all cores busy
	int BX=this->block_size();
	int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));

	int ainum=this->ans->inum();

	k_interp.set_size(GX,BX);
	k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
	&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
	&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
	&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
	time_interp.stop();

	ans->copy_answers(false,false,false,false);
	device->add_ans_object(ans);
	}


	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	double PPPMGPUMemoryT::host_memory_usage() const {
	return device->atom.host_memory_usage()+
	sizeof(PPPMGPUMemory<numtyp,acctyp,grdtyp,grdtyp4>);
	}

	template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
	void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
	if (_compiled)
	return;

	if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
	return;

	std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
	std::string(OCL_PRECISION_COMPILE);
	#ifdef USE_OPENCL
	flags+=std::string(" -D grdtyp=")+ucl_template_name<grdtyp>()+" -D grdtyp4="+
	ucl_template_name<grdtyp>()+"4";
	#endif

	pppm_program=new UCL_Program(dev);

	#ifdef USE_OPENCL
	pppm_program->load_string(pppm_gpu_kernel,flags.c_str());
	#else
	if (sizeof(grdtyp)==sizeof(float))
	pppm_program->load_string(pppm_f_gpu_kernel,flags.c_str());
	else
	pppm_program->load_string(pppm_d_gpu_kernel,flags.c_str());
	#endif

	k_particle_map.set_function(*pppm_program,"particle_map");
	k_make_rho.set_function(*pppm_program,"make_rho");
	k_interp.set_function(*pppm_program,"interp");
	pos_tex.get_texture(*pppm_program,"pos_tex");
	q_tex.get_texture(*pppm_program,"q_tex");

	_compiled=true;
	}

	template class PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
	template class PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4>;

pppm_gpu_memory.cppNo OneTemporaryActions

File Metadata

pppm_gpu_memory.cppView Options

Event Timeline

pppm_gpu_memory.cpp
No OneTemporary
Actions

pppm_gpu_memory.cpp
View Options