gb_gpu_memory.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */

#ifdef USE_OPENCL
#include "gb_gpu_cl.h"
#include "gb_gpu_nbor_cl.h"
#else
#include "gb_gpu_ptx.h"
#endif

#include "gb_gpu_memory.h"
#include <cassert>

#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>

extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
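
// GB_GPU_Memory holds the per-GPU state for the Gay-Berne pair style:
// the device handle, neighbor and answer storage, packed per-type
// coefficient tables, and the compiled kernels. Resources are acquired
// in init() and released in clear().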
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
                                  _max_bytes(0.0) {
  device=&pair_gpu_device;
  ans=new PairGPUAns<numtyp,acctyp>();
  nbor=new PairGPUNbor;
}

template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
  clear();
  delete ans;
  delete nbor;
}
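
// Device bytes consumed per atom across atom data, the per-atom answer
// (force/torque/energy) buffers, and a neighbor list sized for
// max_nbors neighbors.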
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
         nbor->bytes_per_atom(max_nbors);
}
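
// One-time initialization: set up the device, compile the kernels, and
// pack and copy the per-type Gay-Berne and LJ coefficients to the GPU.
// Returns 0 on success or a non-zero device/allocation error code.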
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
                         const double upsilon, const double mu,
                         double **host_shape, double **host_well,
                         double **host_cutsq, double **host_sigma,
                         double **host_epsilon, double *host_lshape,
                         int **h_form, double **host_lj1, double **host_lj2,
                         double **host_lj3, double **host_lj4,
                         double **host_offset, const double *host_special_lj,
                         const int nlocal, const int nall,
                         const int max_nbors, const double cell_size,
                         const double gpu_split, FILE *_screen) {
  nbor_time_avail=false;
  screen=_screen;

  bool gpu_nbor=false;
  if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
    gpu_nbor=true;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
  if (host_nlocal>0)
    _gpu_host=1;

  _threads_per_atom=device->threads_per_atom();

  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
                           _gpu_host,max_nbors,cell_size,true);
  if (success!=0)
    return success;

  ucl_device=device->gpu;
  atom=&device->atom;

  _block_size=device->pair_block_size();
  compile_kernels(*ucl_device);

  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_nbor,gpu_split);

  // Initialize timers for the selected GPU
  time_pair.init(*ucl_device);
  time_pair.zero();

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=device->max_shared_types();
  if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;
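
  // Padding the tables up to max_shared_types presumably lets the fast
  // kernels stage a fixed-size coefficient block in shared memory
  // without per-pair bounds checks.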

  // Allocate a host write buffer for copying type data
  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
                               UCL_WRITE_OPTIMIZED);
  for (int i=0; i<lj_types*lj_types; i++)
    host_write[i]=0.0;
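
  // type_pack2/type_pack4 interleave two or four per-type-pair host
  // tables into one vector-typed device array, so all constants for a
  // pair arrive in a single fetch.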
  sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
                         host_sigma,host_epsilon);

  cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
                         host_cutsq,h_form);

  lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_cutsq,h_form);

  lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                         host_offset);

  dev_error.alloc(1,*ucl_device);
  dev_error.zero();
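
  // dev_error is a single device-side flag; the force kernels set it to
  // 2 on a failed matrix inversion and clear() reports it at shutdown.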
  _allocated=true;

  host_form=h_form;

  // Initialize timers for the selected GPU
  time_kernel.init(*ucl_device);
  time_gayberne.init(*ucl_device);
  time_kernel2.init(*ucl_device);
  time_gayberne2.init(*ucl_device);
  time_kernel.zero();
  time_gayberne.zero();
  time_kernel2.zero();
  time_gayberne2.zero();

  // Allocate, cast, and asynchronously copy constant data to the GPU:
  // the gamma/upsilon/mu exponents and the special LJ scaling factors
  gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
  host_write[0]=static_cast<numtyp>(gamma);
  host_write[1]=static_cast<numtyp>(upsilon);
  host_write[2]=static_cast<numtyp>(mu);
  host_write[3]=static_cast<numtyp>(host_special_lj[0]);
  host_write[4]=static_cast<numtyp>(host_special_lj[1]);
  host_write[5]=static_cast<numtyp>(host_special_lj[2]);
  host_write[6]=static_cast<numtyp>(host_special_lj[3]);
  ucl_copy(gamma_upsilon_mu,host_write,7,false);

  lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
  UCL_H_Vec<double> d_view;
  d_view.view(host_lshape,lshape.numel(),*ucl_device);
  ucl_copy(lshape,d_view,false);
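
  // lshape is transferred through a zero-copy view of the host array;
  // no staging through host_write is needed for double-precision data.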

  // Copy per-type shape and well-depth factors onto the GPU,
  // casting to numtyp through host_write if necessary
  shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
  for (int i=0; i<ntypes; i++) {
    host_write[i*4]=host_shape[i][0];
    host_write[i*4+1]=host_shape[i][1];
    host_write[i*4+2]=host_shape[i][2];
  }
  UCL_H_Vec<numtyp4> view4;
  view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
  ucl_copy(shape,view4,false);

  well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
  for (int i=0; i<ntypes; i++) {
    host_write[i*4]=host_well[i][0];
    host_write[i*4+1]=host_well[i][1];
    host_write[i*4+2]=host_well[i][2];
  }
  view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
  ucl_copy(well,view4,false);

  // Check whether any type pair uses a GB-sphere or sphere-sphere form
  // in addition to the fast ellipse-ellipse path
  multiple_forms=false;
  for (int i=1; i<ntypes; i++)
    for (int j=i; j<ntypes; j++)
      if (host_form[i][j]!=ELLIPSE_ELLIPSE)
        multiple_forms=true;
  if (multiple_forms && host_nlocal>0) {
    std::cerr << "Cannot use Gay-Berne with multiple forms and GPU "
              << "neighboring.\n";
    exit(1);
  }
  if (multiple_forms)
    ans->dev_ans.zero();

  _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

  // Memory for ilist ordered by particle type
  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
    return 0;
  else
    return -3;
}

template <class numtyp, class acctyp>
void GB_GPU_MemoryT::estimate_gpu_overhead() {
  device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
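
// Free device resources, report any device-side error flag, and print
// accumulated timing statistics, reduced over the replica communicator.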
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
  if (!_allocated)
    return;

  UCL_H_Vec<int> err_flag(1,*ucl_device);
  ucl_copy(err_flag,dev_error,false);
  if (err_flag[0] == 2)
    std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
  err_flag.clear();

  _allocated=false;

  // Output any timing information
  acc_timers();
  double single[9], times[9];

  single[0]=atom->transfer_time()+ans->transfer_time();
  single[1]=nbor->time_nbor.total_seconds();
  single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
            nbor->time_kernel.total_seconds();
  single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
  if (multiple_forms)
    single[4]=time_pair.total_seconds();
  else
    single[4]=0;
  single[5]=atom->cast_time()+ans->cast_time();
  single[6]=_gpu_overhead;
  single[7]=_driver_overhead;
  single[8]=ans->cpu_idle_time();

  MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
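
  // times[] now holds per-category sums over the ranks sharing this
  // replica; the averages printed below divide by replica_size.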
  double avg_split=hd_balancer.all_avg_split();

  _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
              sigma_epsilon.row_bytes()+cut_form.row_bytes()+
              shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
              gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
  double mpi_max_bytes;
  MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
             device->replica());
  double max_mb=mpi_max_bytes/(1024*1024);

  if (device->replica_me()==0)
    if (screen && times[3]>0.0) {
      int replica_size=device->replica_size();

      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
      fprintf(screen,"      GPU Time Info (average): ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");

      if (device->procs_per_gpu()==1) {
        fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/replica_size);
        fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[5]/replica_size);
        fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/replica_size);
        if (nbor->gpu_nbor())
          fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/replica_size);
        else
          fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
        fprintf(screen,"Force calc:      %.4f s.\n",times[3]/replica_size);
        fprintf(screen,"LJ calc:         %.4f s.\n",times[4]/replica_size);
      }
      fprintf(screen,"GPU Overhead:    %.4f s.\n",times[6]/replica_size);
      fprintf(screen,"Average split:   %.4f.\n",avg_split);
      fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
      fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
      fprintf(screen,"CPU Idle_Time:   %.4f s.\n",times[8]/replica_size);
      fprintf(screen,"-------------------------------------");
      fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
    }

  _max_bytes=0.0;

  dev_error.clear();
  lj1.clear();
  lj3.clear();
  sigma_epsilon.clear();
  cut_form.clear();
  shape.clear();
  well.clear();
  lshape.clear();
  gamma_upsilon_mu.clear();
  host_olist.clear();

  time_kernel.clear();
  time_gayberne.clear();
  time_kernel2.clear();
  time_gayberne2.clear();
  time_pair.clear();
  hd_balancer.clear();

  if (_compiled) {
    k_gb_nbor_fast.clear();
    k_gb_nbor.clear();
    k_gayberne.clear();
    k_sphere_gb.clear();
    k_lj_fast.clear();
    k_lj.clear();
    delete pair_program;
    delete gb_program;
    delete gb_lj_program;
    _compiled=false;
  }

  nbor->clear();
  ans->clear();
  device->clear();
}
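
// Approximate host memory consumed by this object, including the
// type-ordered neighbor list (host_olist) allocation.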
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
         4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
         nbor->max_atoms()*sizeof(int);
}
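
// Build the neighbor, Gay-Berne, and LJ programs and bind their kernel
// entry points; with USE_OPENCL the sources come from the *_cl.h
// headers, otherwise from the pre-built PTX strings.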
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
  if (_compiled)
    return;

  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                    std::string(OCL_PRECISION_COMPILE);

  pair_program=new UCL_Program(dev);
  pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
  k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
  k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");

  gb_program=new UCL_Program(dev);
  gb_program->load_string(gb_gpu_kernel,flags.c_str());
  k_gayberne.set_function(*gb_program,"kernel_gayberne");

  gb_lj_program=new UCL_Program(dev);
  gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
  k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
  k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
  k_lj.set_function(*gb_lj_program,"kernel_lj");

  _compiled=true;
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
