Page MenuHomec4science

lal_eam_lj.cpp
No OneTemporary

File Metadata

Created
Tue, Sep 10, 10:15

lal_eam_lj.cpp

/***************************************************************************
lal_eam_lj.cpp
-------------------
W. Michael Brown, Trung Dac Nguyen (ORNL)
Class for acceleration of the eam pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "eam_lj_cl.h"
#else
#include "eam_lj_ptx.h"
#endif
#include "lal_eam_lj.h"
#include <cassert>
using namespace LAMMPS_AL;
#define EAMLJT EAMLJ<numtyp, acctyp>
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MAX(A,B) ((A) > (B) ? (A) : (B))
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
// Constructor.  All device allocation is deferred to init(); here we only
// mark the energy kernels as not yet compiled and the buffers as not
// yet allocated so clear() and the destructor are safe before init().
EAMLJT::EAMLJ() : BaseAtomic<numtyp,acctyp>(),
_compiled_energy(false), _allocated(false) {
}
template <class numtyp, class acctyp>
// Destructor.  Releases all device/host buffers, kernels, and timers via
// clear() (a no-op if init() never completed).
EAMLJT::~EAMLJ() {
clear();
}
template <class numtyp, class acctyp>
// Initialize the accelerated eam/lj pair style for the selected device.
//
// Copies the host-side EAM spline tables (rhor, z2r, frho), the per-type
// lookup tables (type2rhor/type2z2r/type2frho), and the LJ coefficient
// tables into device memory, allocates the embedding-force (fp)
// communication buffers, compiles/fetches the extra energy kernels, and
// initializes the timers used by the two-pass EAM computation.
//
// Returns 0 on success, otherwise the nonzero error code from
// init_atomic().  Must be called before any compute()/compute2().
int EAMLJT::init(const int ntypes, double host_cutforcesq,
int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
double ***host_rhor_spline, double ***host_z2r_spline,
double ***host_frho_spline, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj,
double rdr, double rdrho, int nrhor, int nrho,
int nz2r, int nfrho, int nr,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen)
{
int success;
// Base-class setup: device program compilation, neighbor storage, and
// host/device load balancing for the eam_lj kernel source.
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,eam_lj);
if (success!=0)
return success;
// allocate fp
// On a CPU device the "device" fp buffer is just a view of the host
// buffer, avoiding a redundant copy.
bool cpuview=false;
if (this->ucl_device->device_type()==UCL_CPU)
cpuview=true;
// If the initial atom count is unknown (0), start from a default
// capacity; either way reserve 10% headroom to limit reallocations.
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (cpuview)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
// Fetch the extra kernels for the first (density/embedding) pass and
// bind the fp buffer to its texture for cached reads.
k_energy.set_function(*(this->pair_program),"kernel_energy");
k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
fp_tex.get_texture(*(this->pair_program),"fp_tex");
fp_tex.bind_float(dev_fp,1);
_compiled_energy = true;
// Initialize timers for selected GPU
time_pair2.init(*(this->ucl_device));
time_pair2.zero();
time_fp1.init(*(this->ucl_device));
time_fp1.zero();
time_fp2.init(*(this->ucl_device));
time_fp2.zero();
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
// Pad the type tables up to max_shared_types so the fast kernel can
// index shared memory with a fixed stride.
lj_types=max_shared_types;
shared_types=true;
}
// Cache scalar table parameters for the kernel launches.
_ntypes=lj_types;
_cutforcesq=host_cutforcesq;
_rdr=rdr;
_rdrho = rdrho;
_nrhor=nrhor;
_nrho=nrho;
_nz2r=nz2r;
_nfrho=nfrho;
_nr=nr;
// EAM allocation
UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
// Zero-fill so the padding entries beyond ntypes are defined.
for (int i=0; i<lj_types*lj_types; i++) {
dview_type[i].x=0; dview_type[i].y=0;
}
// pack type2rhor and type2z2r
type2rhor_z2r.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
for (int j=0; j<ntypes; j++) {
dview_type[i*lj_types+j].x=host_type2rhor[i][j];
dview_type[i*lj_types+j].y=host_type2z2r[i][j];
}
}
ucl_copy(type2rhor_z2r,dview_type,false);
// pack type2frho
UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
// NOTE(review): only the first ntypes entries are written; when
// lj_types was padded up to max_shared_types above, entries
// [ntypes, lj_types) are copied uninitialized.  Presumably the kernels
// never index those entries — verify against the kernel source.
for (int i=0; i<ntypes; i++)
dview_type2frho[i]=host_type2frho[i];
ucl_copy(type2frho,dview_type2frho,false);
// pack frho_spline
// Each spline row holds 7 coefficients; they are split across two
// float4 device arrays (coeffs 0-2 + pad, then coeffs 3-6), each bound
// to its own texture for cached fetches.
UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int ix=0; ix<nfrho; ix++)
for (int iy=0; iy<nrho+1; iy++) {
dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][0];
dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][1];
dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][2];
dview_frho_spline[ix*(nrho+1)+iy].w=0;
}
frho_spline1.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(frho_spline1,dview_frho_spline,false);
frho_spline1_tex.get_texture(*(this->pair_program),"frho_sp1_tex");
frho_spline1_tex.bind_float(frho_spline1,4);
// Reuse the same staging buffer for the second half of the row.
for (int ix=0; ix<nfrho; ix++)
for (int iy=0; iy<nrho+1; iy++) {
dview_frho_spline[ix*(nrho+1)+iy].x=host_frho_spline[ix][iy][3];
dview_frho_spline[ix*(nrho+1)+iy].y=host_frho_spline[ix][iy][4];
dview_frho_spline[ix*(nrho+1)+iy].z=host_frho_spline[ix][iy][5];
dview_frho_spline[ix*(nrho+1)+iy].w=host_frho_spline[ix][iy][6];
}
frho_spline2.alloc(nfrho*(nrho+1),*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(frho_spline2,dview_frho_spline,false);
frho_spline2_tex.get_texture(*(this->pair_program),"frho_sp2_tex");
frho_spline2_tex.bind_float(frho_spline2,4);
// pack rhor_spline
// Same split-into-two-float4-arrays layout as frho above.
UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int ix=0; ix<nrhor; ix++)
for (int iy=0; iy<nr+1; iy++) {
dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][0];
dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][1];
dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][2];
dview_rhor_spline[ix*(nr+1)+iy].w=(numtyp)0;
}
rhor_spline1.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(rhor_spline1,dview_rhor_spline,false);
rhor_spline1_tex.get_texture(*(this->pair_program),"rhor_sp1_tex");
rhor_spline1_tex.bind_float(rhor_spline1,4);
for (int ix=0; ix<nrhor; ix++)
for (int iy=0; iy<nr+1; iy++) {
dview_rhor_spline[ix*(nr+1)+iy].x=host_rhor_spline[ix][iy][3];
dview_rhor_spline[ix*(nr+1)+iy].y=host_rhor_spline[ix][iy][4];
dview_rhor_spline[ix*(nr+1)+iy].z=host_rhor_spline[ix][iy][5];
dview_rhor_spline[ix*(nr+1)+iy].w=host_rhor_spline[ix][iy][6];
}
rhor_spline2.alloc(nrhor*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(rhor_spline2,dview_rhor_spline,false);
rhor_spline2_tex.get_texture(*(this->pair_program),"rhor_sp2_tex");
rhor_spline2_tex.bind_float(rhor_spline2,4);
// pack z2r_spline
// Same split layout again for the pair-interaction (z2r) splines.
UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int ix=0; ix<nz2r; ix++)
for (int iy=0; iy<nr+1; iy++) {
dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][0];
dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][1];
dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][2];
dview_z2r_spline[ix*(nr+1)+iy].w=(numtyp)0;
}
z2r_spline1.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(z2r_spline1,dview_z2r_spline,false);
z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex");
z2r_spline1_tex.bind_float(z2r_spline1,4);
for (int ix=0; ix<nz2r; ix++)
for (int iy=0; iy<nr+1; iy++) {
dview_z2r_spline[ix*(nr+1)+iy].x=host_z2r_spline[ix][iy][3];
dview_z2r_spline[ix*(nr+1)+iy].y=host_z2r_spline[ix][iy][4];
dview_z2r_spline[ix*(nr+1)+iy].z=host_z2r_spline[ix][iy][5];
dview_z2r_spline[ix*(nr+1)+iy].w=host_z2r_spline[ix][iy][6];
}
z2r_spline2.alloc(nz2r*(nr+1),*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(z2r_spline2,dview_z2r_spline,false);
z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex");
z2r_spline2_tex.bind_float(z2r_spline2,4);
// LJ allocation
// Allocate a host write buffer for data initialization
// NOTE(review): the buffer is sized lj_types*lj_types*32 but only the
// first lj_types*lj_types entries are zeroed below; presumably
// type_pack4 overwrites whatever it needs — confirm against its
// implementation.
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
// lj1 packs (lj1, lj2, cutsq); lj3 packs (lj3, lj4, offset) per pair.
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
// Copy the four special-bond LJ scaling factors to the device.
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
// Record the device memory footprint of everything allocated here.
this->_max_bytes=type2rhor_z2r.row_bytes()
+ type2frho.row_bytes()
+ rhor_spline1.row_bytes()
+ rhor_spline2.row_bytes()
+ frho_spline1.row_bytes()
+ frho_spline2.row_bytes()
+ z2r_spline1.row_bytes()
+ z2r_spline2.row_bytes()
+ dev_fp.row_bytes()
+ lj1.row_bytes()
+ lj3.row_bytes()
+ sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void EAMLJT::clear() {
  // Releasing is only needed (and only safe) if init() ran to completion.
  if (_allocated) {
    _allocated = false;

    // Per-type lookup and spline coefficient tables.
    type2rhor_z2r.clear();
    type2frho.clear();
    rhor_spline1.clear();
    rhor_spline2.clear();
    frho_spline1.clear();
    frho_spline2.clear();
    z2r_spline1.clear();
    z2r_spline2.clear();

    // Embedding-force communication buffers.
    host_fp.clear();
    dev_fp.clear();

    // LJ coefficient tables and special-bond factors.
    lj1.clear();
    lj3.clear();
    sp_lj.clear();

    // Timers owned by this pair style.
    time_pair2.clear();
    time_fp1.clear();
    time_fp2.clear();

    // Energy kernels fetched in init().
    if (_compiled_energy) {
      k_energy_fast.clear();
      k_energy.clear();
      _compiled_energy = false;
    }

    // Base-class teardown last.
    this->clear_atomic();
  }
}
template <class numtyp, class acctyp>
double EAMLJT::host_memory_usage() const {
  // Host-side footprint: base-class bookkeeping plus this object itself.
  const double base_bytes = this->host_memory_usage_atomic();
  return base_bytes + sizeof(EAMLJ<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then compute atom energies/forces
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
// First-pass compute using a host-built neighbor list: casts/copies atom
// data to the device, runs the density/embedding kernel (loop), and copies
// the per-atom embedding forces (fp) back to the host for communication.
// On return, *fp_ptr points at the host fp buffer and host_start is the
// index of the first atom left for the CPU by the load balancer.
void EAMLJT::compute(const int f_ago, const int inum_full, const int nlocal,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, void **fp_ptr) {
this->acc_timers();
if (this->device->time_device()) {
// Put time from the second part to the total time_pair
this->time_pair.add_time_to_total(time_pair2.time());
// Add transfer time from device -> host after part 1
this->atom->add_transfer_time(time_fp1.time());
// Add transfer time from host -> device before part 2
this->atom->add_transfer_time(time_fp2.time());
}
// ------------------- Resize FP Array for EAM --------------------
// Grow (with 10% headroom) when the total atom count exceeds capacity,
// and rebind the texture since the device buffer was reallocated.
if (nall>_max_fp_size) {
dev_fp.clear();
host_fp.clear();
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU)
dev_fp.view(host_fp);
else
// NOTE(review): init() allocates dev_fp with UCL_WRITE_ONLY but this
// resize path uses the default flag — confirm whether intentional.
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
fp_tex.bind_float(dev_fp,1);
}
*fp_ptr=host_fp.begin();
// ----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
// Split the work between host and device based on past timings.
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
host_start=inum;
// -----------------------------------------------------------------
// Refresh the device neighbor list only when the host reneighbored.
if (ago==0) {
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
this->atom->cast_x_data(host_x,host_type);
this->atom->add_x_data(host_x,host_type);
// First EAM pass: per-atom densities and embedding energies.
loop(eflag,vflag);
// copy fp from device to host for comm
_nlocal=nlocal;
time_fp1.start();
ucl_copy(host_fp,dev_fp,nlocal,false);
time_fp1.stop();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU and then compute per-atom densities
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
// First-pass compute that builds the neighbor list on the GPU when needed
// (ago==0), runs the density/embedding kernel (loop), and copies the
// embedding forces (fp) back to the host.  Returns a pointer into the
// device-built jlist (offset by host_start) so the caller can access the
// per-atom neighbor lists, or NULL when there is no work or the neighbor
// build failed.  *fp_ptr is set to the host fp buffer; inum receives the
// atom count assigned to the device.
int** EAMLJT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
int &inum, void **fp_ptr) {
this->acc_timers();
if (this->device->time_device()) {
// Put time from the second part to the total time_pair
this->time_pair.add_time_to_total(time_pair2.time());
// Add transfer time from device -> host after part 1
this->atom->add_transfer_time(time_fp1.time());
// Add transfer time from host -> device before part 2
this->atom->add_transfer_time(time_fp2.time());
}
// ------------------- Resize FP Array for EAM --------------------
// Grow (with 10% headroom) when the total atom count exceeds capacity,
// and rebind the texture since the device buffer was reallocated.
if (nall>_max_fp_size) {
dev_fp.clear();
host_fp.clear();
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU)
dev_fp.view(host_fp);
else
// NOTE(review): init() allocates dev_fp with UCL_WRITE_ONLY but this
// resize path uses the default flag — confirm whether intentional.
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
fp_tex.bind_float(dev_fp,1);
}
*fp_ptr=host_fp.begin();
// -----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
// load balance, returning the atom count on the device (inum)
this->hd_balancer.balance(cpu_time);
inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
} else {
this->atom->cast_x_data(host_x,host_type);
this->atom->add_x_data(host_x,host_type);
}
// Expose the device-built per-atom neighbor bookkeeping to the caller.
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// First EAM pass: per-atom densities and embedding energies.
loop(eflag,vflag);
// copy fp from device to host for comm
// Unlike the host-neighbor overload (which copies nlocal values), this
// path records and copies inum_full entries.
_nlocal=inum_full;
time_fp1.start();
ucl_copy(host_fp,dev_fp,inum_full,false);
time_fp1.stop();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void EAMLJT::compute2(int *ilist, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom) {
  // No atoms were assigned to the device this step: nothing to do.
  if (this->ans->inum() == 0)
    return;

  this->hd_balancer.start_timer();

  // Push the communicated embedding forces to the device before pass 2.
  time_fp2.start();
  this->add_fp_data();
  time_fp2.stop();

  // Second EAM pass: pair forces, energies, and virials.
  loop2(eflag, vflag);

  // With an explicit ilist, answers are gathered through that mapping.
  if (ilist != NULL)
    this->ans->copy_answers(eflag, vflag, eatom, vatom, ilist);
  else
    this->ans->copy_answers(eflag, vflag, eatom, vatom);

  this->device->add_ans_object(this->ans);
  this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies and forces
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Calculate per-atom energies and forces (first EAM pass)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void EAMLJT::loop(const bool _eflag, const bool _vflag) {
  // Compute the block size and grid size to keep all cores busy.
  const int BX=this->block_size();

  // The energy kernels take only an energy flag as an int argument; the
  // virial flag is not part of their signature, so _vflag is deliberately
  // unused here (virials are accumulated in loop2()).  The previously
  // computed-but-unused local vflag has been removed.
  (void)_vflag;
  int eflag = _eflag ? 1 : 0;

  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

  if (shared_types) {
    // All per-type constants fit in shared memory: use the fast kernel.
    this->k_energy_fast.set_size(GX,BX);
    this->k_energy_fast.run(&this->atom->dev_x.begin(),
                            &type2rhor_z2r.begin(), &type2frho.begin(),
                            &rhor_spline2.begin(),
                            &frho_spline1.begin(),&frho_spline2.begin(),
                            &this->nbor->dev_nbor.begin(),
                            &this->_nbor_data->begin(),
                            &dev_fp.begin(),
                            &this->ans->dev_engv.begin(),
                            &eflag, &ainum,
                            &nbor_pitch,
                            &_ntypes, &_cutforcesq,
                            &_rdr, &_rdrho,
                            &_nrho, &_nr,
                            &this->_threads_per_atom);
  } else {
    this->k_energy.set_size(GX,BX);
    this->k_energy.run(&this->atom->dev_x.begin(),
                       &type2rhor_z2r.begin(), &type2frho.begin(),
                       &rhor_spline2.begin(),
                       &frho_spline1.begin(),&frho_spline2.begin(),
                       &this->nbor->dev_nbor.begin(),
                       &this->_nbor_data->begin(),
                       &dev_fp.begin(),
                       &this->ans->dev_engv.begin(),
                       &eflag, &ainum,
                       &nbor_pitch,
                       &_ntypes, &_cutforcesq,
                       &_rdr, &_rdrho,
                       &_nrho, &_nr,
                       &this->_threads_per_atom);
  }
  this->time_pair.stop();
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques (second EAM pass)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void EAMLJT::loop2(const bool _eflag, const bool _vflag) {
  // Launch geometry: enough blocks of size block_x to cover every atom,
  // with _threads_per_atom device threads cooperating per atom.
  const int block_x = this->block_size();
  const int grid_x = static_cast<int>(
      ceil(static_cast<double>(this->ans->inum()) /
           (block_x / this->_threads_per_atom)));

  // Kernels expect the flags as ints.
  int eflag = _eflag ? 1 : 0;
  int vflag = _vflag ? 1 : 0;

  int n_atoms = this->ans->inum();
  int pitch = this->nbor->nbor_pitch();

  this->time_pair2.start();
  if (shared_types) {
    // Fast variant: per-type constants live in shared memory.
    this->k_pair_fast.set_size(grid_x, block_x);
    this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
                          &type2rhor_z2r.begin(),
                          &rhor_spline1.begin(),
                          &z2r_spline1.begin(),
                          &z2r_spline2.begin(),
                          &lj1.begin(), &lj3.begin(),
                          &sp_lj.begin(),
                          &this->nbor->dev_nbor.begin(),
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                          &n_atoms, &pitch, &_cutforcesq, &_rdr, &_nr,
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(grid_x, block_x);
    this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
                     &type2rhor_z2r.begin(),
                     &rhor_spline1.begin(),
                     &z2r_spline1.begin(),
                     &z2r_spline2.begin(),
                     &lj1.begin(), &lj3.begin(),
                     &sp_lj.begin(),
                     &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(),
                     &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag,
                     &n_atoms, &pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
                     &this->_threads_per_atom);
  }
  this->time_pair2.stop();
}
// Explicit instantiation for the precision pair configured at build time.
template class EAMLJ<PRECISION,ACC_PRECISION>;

Event Timeline