pair_gpu_atom.h
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Oct 7, 00:48

pair_gpu_atom.h
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
	------------------------------------------------------------------------- */

	#ifndef PAIR_GPU_ATOM_H
	#define PAIR_GPU_ATOM_H

	#include <math.h>
	#include "mpi.h"

	#ifdef USE_OPENCL

	#include "geryon/ocl_timer.h"
	#include "geryon/ocl_mat.h"
	#include "geryon/ocl_kernel.h"
	using namespace ucl_opencl;

	#else

	#include "cudpp.h"
	#include "geryon/nvd_timer.h"
	#include "geryon/nvd_mat.h"
	#include "geryon/nvd_kernel.h"
	using namespace ucl_cudadr;

	#endif

	#include "pair_gpu_precision.h"

	/// Host/device storage and transfer bookkeeping for per-atom data
	/** Owns the device vectors and host staging buffers declared in the DATA
	  * section below (positions+types, charges, quaternions, and the id/tag
	  * vectors used for device neighbor builds), plus the timers and the
	  * "already copied this timestep" flags consulted by the cast/copy routines.
	  * \tparam numtyp Element type of the position/charge/quaternion buffers
	  * \tparam acctyp Not referenced by any declaration in this header **/
	template <class numtyp, class acctyp>
	class PairGPUAtom {
	 public:
	  PairGPUAtom();
	  /// Releases everything through clear()
	  ~PairGPUAtom() { clear(); }

	  /// Maximum number of atoms that can be stored with current allocation
	  inline int max_atoms() const { return _max_atoms; }
	  /// Current number of local+ghost atoms stored
	  inline int nall() const { return _nall; }

	  /// Set number of local+ghost atoms for future copy operations
	  inline void nall(const int n) { _nall=n; }

	  /// Memory usage per atom in this class
	  int bytes_per_atom() const;

	  /// Clear any previous data and set up for a new LAMMPS run
	  /** \param rot True if atom storage needs quaternions
	    * \param gpu_nbor True if neighboring will be performed on device **/
	  bool init(const int nall, const bool charge, const bool rot,
	            UCL_Device &dev, const bool gpu_nbor=false,
	            const bool bonds=false);

	  /// Check if we have enough device storage and realloc if not
	  /** Records nall as the current atom count.  When nall exceeds
	    * max_atoms(), calls clear_resize() and then alloc(nall), AND-ing the
	    * allocation result into success.  Because of the short-circuit &&,
	    * alloc() is not attempted when success is already false on entry.
	    * \return true if resized with any call during this timestep (the flag
	    *         is only reset by data_unavail()) **/
	  inline bool resize(const int nall, bool &success) {
	    _nall=nall;
	    if (nall>_max_atoms) {
	      clear_resize();
	      success = success && alloc(nall);
	      _resized=true;
	    }
	    return _resized;
	  }

	  /// If already initialized by another LAMMPS style, add fields as necessary
	  /** \param rot True if atom storage needs quaternions
	    * \param gpu_nbor True if neighboring will be performed on device **/
	  bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
	                  const bool bonds);

	  /// Returns true if GPU is using charges
	  bool charge() const { return _charge; }

	  /// Returns true if GPU is using quaternions
	  bool quat() const { return _rot; }

	  /// Only free matrices of length inum or nall for resizing
	  void clear_resize();

	  /// Free all memory on host and device
	  void clear();

	  /// Return the total amount of host memory used by class in bytes
	  double host_memory_usage() const;

	  /// Sort arrays for neighbor list calculation on device
	  void sort_neighbor(const int num_atoms);

	  /// Add copy times to timers
	  /** The charge and quaternion timers are only touched when the
	    * corresponding field is in use. **/
	  inline void acc_timers() {
	    time_pos.add_to_total();
	    if (_charge)
	      time_q.add_to_total();
	    if (_rot)
	      time_quat.add_to_total();
	  }

	  /// Zero the copy timers
	  /** The charge and quaternion timers are only touched when the
	    * corresponding field is in use. **/
	  inline void zero_timers() {
	    time_pos.zero();
	    if (_charge)
	      time_q.zero();
	    if (_rot)
	      time_quat.zero();
	  }

	  /// Return the total time for host/device data transfer
	  /** Zeros the total so that the atom times are only included once **/
	  inline double transfer_time() {
	    double total=time_pos.total_seconds();
	    time_pos.zero_total();
	    if (_charge) {
	      total+=time_q.total_seconds();
	      time_q.zero_total();
	    }
	    if (_rot) {
	      // Read the quaternion timer here: reading time_q instead would drop
	      // the quaternion transfer time (time_q was just zeroed above, or is
	      // unused when charges are off).
	      total+=time_quat.total_seconds();
	      time_quat.zero_total();
	    }

	    return total;
	  }

	  /// Return the total time for data cast/pack
	  /** Zeros the time so that atom times are only included once **/
	  inline double cast_time()
	    { double t=_time_cast; _time_cast=0.0; return t; }

	  /// Pack LAMMPS atom type constants into matrix and copy to device
	  /** Writes one[i][j] for 0<=i,j<n into buffer with a row stride of
	    * m_size, then copies m_size*m_size dev_typ elements, viewed in place
	    * over buffer, into dev_v.  Entries of buffer outside the n x n block
	    * are not written here.
	    * \param n      Number of rows/columns read from the table
	    * \param m_size Row stride (and row count) of the packed matrix **/
	  template <class dev_typ, class t1>
	  inline void type_pack1(const int n, const int m_size,
	                         UCL_D_Vec<dev_typ> &dev_v,
	                         UCL_H_Vec<numtyp> &buffer,
	                         t1 **one) {
	    int ii=0;
	    for (int i=0; i<n; i++) {
	      for (int j=0; j<n; j++) {
	        buffer[ii]=static_cast<numtyp>(one[i][j]);
	        ii++;
	      }
	      ii+=m_size-n;   // skip the padding at the end of each row
	    }
	    UCL_H_Vec<dev_typ> view;
	    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
	    ucl_copy(dev_v,view,false);
	  }

	  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
	  /** Interleaves one[i][j] and two[i][j] as consecutive numtyp values for
	    * each (i,j) of the n x n block (row stride m_size entries), then copies
	    * m_size*m_size dev_typ elements viewed over buffer into dev_v.
	    * one and two must be indexable as [i][j].
	    * NOTE(review): presumably dev_typ is a 2-component numtyp vector type
	    * so that the view lines up with the interleaving - confirm at callers. **/
	  template <class dev_typ, class t1, class t2>
	  inline void type_pack2(const int n, const int m_size,
	                         UCL_D_Vec<dev_typ> &dev_v,
	                         UCL_H_Vec<numtyp> &buffer,
	                         t1 one, t2 two) {
	    int ii=0;
	    for (int i=0; i<n; i++) {
	      for (int j=0; j<n; j++) {
	        buffer[ii*2]=static_cast<numtyp>(one[i][j]);
	        buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
	        ii++;
	      }
	      ii+=m_size-n;   // skip the padding at the end of each row
	    }
	    UCL_H_Vec<dev_typ> view;
	    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
	    ucl_copy(dev_v,view,false);
	  }

	  /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
	  /** Same layout as the four-table overload, with 4 numtyp slots per
	    * (i,j) entry, but only slots 0..2 are written; slot 3 keeps whatever
	    * buffer already held.  one and two must be indexable as [i][j].
	    * NOTE(review): presumably dev_typ is a 4-component numtyp vector type
	    * - confirm at callers. **/
	  template <class dev_typ, class t1, class t2, class t3>
	  inline void type_pack4(const int n, const int m_size,
	                         UCL_D_Vec<dev_typ> &dev_v,
	                         UCL_H_Vec<numtyp> &buffer,
	                         t1 one, t2 two, t3 **three) {
	    int ii=0;
	    for (int i=0; i<n; i++) {
	      for (int j=0; j<n; j++) {
	        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
	        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
	        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
	        ii++;
	      }
	      ii+=m_size-n;   // skip the padding at the end of each row
	    }
	    UCL_H_Vec<dev_typ> view;
	    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
	    ucl_copy(dev_v,view,false);
	  }

	  /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
	  /** Interleaves one, two, three and four as 4 consecutive numtyp values
	    * for each (i,j) of the n x n block (row stride m_size entries), then
	    * copies m_size*m_size dev_typ elements viewed over buffer into dev_v.
	    * All four tables must be indexable as [i][j]. **/
	  template <class dev_typ, class t1, class t2, class t3, class t4>
	  inline void type_pack4(const int n, const int m_size,
	                         UCL_D_Vec<dev_typ> &dev_v,
	                         UCL_H_Vec<numtyp> &buffer,
	                         t1 one, t2 two, t3 three, t4 four) {
	    int ii=0;
	    for (int i=0; i<n; i++) {
	      for (int j=0; j<n; j++) {
	        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
	        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
	        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
	        buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
	        ii++;
	      }
	      ii+=m_size-n;   // skip the padding at the end of each row
	    }
	    UCL_H_Vec<dev_typ> view;
	    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
	    ucl_copy(dev_v,view,false);
	  }

	  /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
	  /** Interleaves the diagonal entries one[i][i] and two[i][i] for
	    * 0<=i<n, then copies n dev_typ elements viewed over buffer into dev_v.
	    * one and two must be indexable as [i][i]. **/
	  template <class dev_typ, class t1, class t2>
	  inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
	                         UCL_H_Vec<numtyp> &buffer, t1 one, t2 two) {
	    for (int i=0; i<n; i++) {
	      buffer[i*2]=static_cast<numtyp>(one[i][i]);
	      buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
	    }
	    UCL_H_Vec<dev_typ> view;
	    view.view((dev_typ*)buffer.begin(),n,*dev);
	    ucl_copy(dev_v,view,false);
	  }

	  // -------------------------COPY TO GPU ----------------------------------

	  /// Signal that we need to transfer atom data for next timestep
	  /** Also clears the "resized this timestep" flag returned by resize(). **/
	  inline void data_unavail()
	    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }

	  /// Cast positions and types to write buffer
	  /** Does nothing if positions were already copied this timestep.  The
	    * elapsed wall time is added to the value reported by cast_time().
	    * \param host_ptr  Per-atom coordinates, indexed [atom][0..2]
	    * \param host_type Per-atom types, indexed [atom] **/
	  inline void cast_x_data(double **host_ptr, const int *host_type) {
	    if (_x_avail==false) {
	      double t=MPI_Wtime();
	      #ifdef GPU_CAST
	      // Raw copies; conversion to numtyp is done by k_cast_x in add_x_data.
	      // NOTE(review): reading _nall*3 doubles from host_ptr[0] assumes the
	      // coordinate rows are contiguous in memory - confirm against caller.
	      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
	      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
	      #else
	      // Pack 4 numtyp values per atom: x, y, z, type
	      numtyp *_write_loc=host_x.begin();
	      for (int i=0; i<_nall; i++) {
	        *_write_loc=host_ptr[i][0];
	        _write_loc++;
	        *_write_loc=host_ptr[i][1];
	        _write_loc++;
	        *_write_loc=host_ptr[i][2];
	        _write_loc++;
	        *_write_loc=host_type[i];
	        _write_loc++;
	      }
	      #endif
	      _time_cast+=MPI_Wtime()-t;
	    }
	  }

	  /// Copy positions and types to device asynchronously
	  /** Copies nall() elements.  Does the copy only if positions have not
	    * been copied since the last data_unavail(); the time_pos timer is
	    * started and stopped regardless.  The arguments are not read here;
	    * the data comes from the buffers filled by cast_x_data(). **/
	  inline void add_x_data(double **host_ptr, int *host_type) {
	    time_pos.start();
	    if (_x_avail==false) {
	      #ifdef GPU_CAST
	      ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
	      ucl_copy(dev_type_cast,host_type_cast,_nall,true);
	      int block_size=64;
	      // Enough blocks of block_size to cover _nall atoms
	      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
	      k_cast_x.set_size(GX,block_size);
	      k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
	                   &_nall);
	      #else
	      ucl_copy(dev_x,host_x,_nall*4,true);
	      #endif
	      _x_avail=true;
	    }
	    time_pos.stop();
	  }

	  /// Calls cast_x_data and add_x_data and times the routines
	  inline void cast_copy_x(double **host_ptr, int *host_type) {
	    cast_x_data(host_ptr,host_type);
	    add_x_data(host_ptr,host_type);
	  }

	  /// Cast charges to write buffer
	  /** Does nothing if charges were already copied this timestep.  When
	    * the device is a UCL_CPU and numtyp has the size of double, the host
	    * and device vectors are made views of host_ptr instead of copying;
	    * otherwise the nall() values are copied (memcpy) or converted
	    * element-wise into host_q.
	    * NOTE(review): the view/memcpy paths reinterpret host_ptr as numtyp,
	    * which presumes cpytyp is double - confirm against callers. **/
	  template<class cpytyp>
	  inline void cast_q_data(cpytyp *host_ptr) {
	    if (_q_avail==false) {
	      double t=MPI_Wtime();
	      if (dev->device_type()==UCL_CPU) {
	        if (sizeof(numtyp)==sizeof(double)) {
	          host_q.view((numtyp*)host_ptr,_nall,*dev);
	          dev_q.view(host_q);
	        } else
	          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
	      } else {
	        if (sizeof(numtyp)==sizeof(double))
	          memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
	        else
	          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
	      }
	      _time_cast+=MPI_Wtime()-t;
	    }
	  }

	  /// Copy charges to device asynchronously
	  /** Copies nall() elements unless already copied this timestep **/
	  inline void add_q_data() {
	    if (_q_avail==false) {
	      ucl_copy(dev_q,host_q,_nall,true);
	      _q_avail=true;
	    }
	  }

	  /// Cast quaternions to write buffer
	  /** Same strategy as cast_q_data(), with 4 values per atom
	    * (nall()*4 elements in total). **/
	  template<class cpytyp>
	  inline void cast_quat_data(cpytyp *host_ptr) {
	    if (_quat_avail==false) {
	      double t=MPI_Wtime();
	      if (dev->device_type()==UCL_CPU) {
	        if (sizeof(numtyp)==sizeof(double)) {
	          host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
	          dev_quat.view(host_quat);
	        } else
	          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
	      } else {
	        if (sizeof(numtyp)==sizeof(double))
	          memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
	        else
	          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
	      }
	      _time_cast+=MPI_Wtime()-t;
	    }
	  }

	  /// Copy quaternions to device
	  /** Copies nall()*4 elements unless already copied this timestep **/
	  inline void add_quat_data() {
	    if (_quat_avail==false) {
	      ucl_copy(dev_quat,host_quat,_nall*4,true);
	      _quat_avail=true;
	    }
	  }

	  /// Return number of bytes used on device
	  /** Resets the stored value to zero after reading it **/
	  inline double max_gpu_bytes()
	    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

	  // ------------------------------ DATA ----------------------------------

	  /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
	  UCL_D_Vec<numtyp> dev_x;
	  /// Charges
	  UCL_D_Vec<numtyp> dev_q;
	  /// Quaternions
	  UCL_D_Vec<numtyp> dev_quat;

	  #ifdef GPU_CAST
	  /// Uncast (double/int) copies of positions and types, converted on device
	  UCL_D_Vec<double> dev_x_cast;
	  UCL_D_Vec<int> dev_type_cast;
	  UCL_H_Vec<double> host_x_cast;
	  UCL_H_Vec<int> host_type_cast;
	  #endif

	  /// Buffer for moving positions to device
	  UCL_H_Vec<numtyp> host_x;
	  /// Buffer for moving charge data to GPU
	  UCL_H_Vec<numtyp> host_q;
	  /// Buffer for moving quat data to GPU
	  UCL_H_Vec<numtyp> host_quat;

	  /// Cell list identifiers for device nbor builds
	  UCL_D_Vec<unsigned> dev_cell_id;
	  /// Particle identifiers for device nbor builds
	  UCL_D_Vec<int> dev_particle_id;
	  /// Atom tag information for device nbor builds
	  UCL_D_Vec<int> dev_tag;

	  /// Device timers
	  UCL_Timer time_pos, time_q, time_quat;

	  /// Geryon device
	  UCL_Device *dev;

	 private:
	  #ifdef GPU_CAST
	  UCL_Program *atom_program;
	  UCL_Kernel k_cast_x;
	  void compile_kernels(UCL_Device &dev);
	  #endif

	  bool _compiled;

	  // True if data has been copied to device already
	  bool _x_avail, _q_avail, _quat_avail, _resized;

	  bool alloc(const int nall);

	  bool _allocated, _rot, _charge, _other;
	  int _max_atoms, _nall;
	  bool _gpu_nbor, _bonds;
	  // Accumulated wall time spent in the cast_*_data routines
	  double _time_cast;

	  double _max_gpu_bytes;

	  #ifndef USE_OPENCL
	  CUDPPConfiguration sort_config;
	  CUDPPHandle sort_plan;
	  #endif
	};

	#endif

pair_gpu_atom.hNo OneTemporaryActions

File Metadata

pair_gpu_atom.hView Options

Event Timeline

pair_gpu_atom.h
No OneTemporary
Actions

pair_gpu_atom.h
View Options