cuda.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Jul 4, 07:04

cuda.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

	Original Version:
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	See the README file in the top-level LAMMPS directory.

	-----------------------------------------------------------------------

	USER-CUDA Package and associated modifications:
	https://sourceforge.net/projects/lammpscuda/

	Christian Trott, christian.trott@tu-ilmenau.de
	Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
	Theoretical Physics II, University of Technology Ilmenau, Germany

	See the README file in the USER-CUDA directory.

	This software is distributed under the GNU General Public License.
	------------------------------------------------------------------------- */

	#include <cstdlib>
	#include <cstdio>
	#include <cstring>
	#include "user_cuda.h"
	#include "atom.h"
	#include "domain.h"
	#include "force.h"
	#include "pair.h"
	#include "update.h"
	#include "neighbor.h"
	#include "neigh_list.h"
	#include "universe.h"
	#include "input.h"
	#include "atom_masks.h"
	#include "error.h"

	#include "cuda_neigh_list.h"
	//#include "pre_binning_cu.h"
	//#include "reverse_binning_cu.h"
	#include <ctime>
	#include <cmath>
	#include "cuda_pair_cu.h"
	#include "cuda_cu.h"

	using namespace LAMMPS_NS;

	/* ---------------------------------------------------------------------- */

	// Construct the USER-CUDA interface object.
	//
	// Registers this object as lmp->cuda, fetches the compile settings of the
	// CUDA library via Cuda_Cuda_GetCompileSettings() and, on rank 0, warns
	// about every setting that differs from the one this file was built with.
	// All mirror pointers and flags are then put into a defined "nothing
	// allocated" state. CudaWrapper_Init() is not called here but in activate().
	Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp)
	{
	  cuda_exists = true;
	  lmp->cuda = this;

	  if (universe->me == 0) printf("# Using LAMMPS_CUDA \n");

	  shared_data.me = universe->me;

	  device_set = false;
	  devicelist = NULL;

	  Cuda_Cuda_GetCompileSettings(&shared_data);

	  if (universe->me == 0) {

	    // precisions are compared in units of 4 bytes

	    if(shared_data.compile_settings.prec_glob != sizeof(CUDA_CFLOAT) / 4)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.prec_glob, (int) sizeof(CUDA_CFLOAT) / 4);

	    if(shared_data.compile_settings.prec_x != sizeof(X_CFLOAT) / 4)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.prec_x, (int) sizeof(X_CFLOAT) / 4);

	    if(shared_data.compile_settings.prec_v != sizeof(V_CFLOAT) / 4)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.prec_v, (int) sizeof(V_CFLOAT) / 4);

	    if(shared_data.compile_settings.prec_f != sizeof(F_CFLOAT) / 4)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.prec_f, (int) sizeof(F_CFLOAT) / 4);

	    if(shared_data.compile_settings.prec_pppm != sizeof(PPPM_CFLOAT) / 4)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.prec_pppm, (int) sizeof(PPPM_CFLOAT) / 4);

	    if(shared_data.compile_settings.prec_fft != sizeof(FFT_CFLOAT) / 4)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.prec_fft, (int) sizeof(FFT_CFLOAT) / 4);

	#ifdef FFT_CUFFT
	    if(shared_data.compile_settings.cufft != 1)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: cufft: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.cufft, 1);
	#else
	    if(shared_data.compile_settings.cufft != 0)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: cufft: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.cufft, 0);
	#endif

	    // report the arch value itself (the cufft flag was printed here before)
	    if(shared_data.compile_settings.arch != CUDA_ARCH)
	      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
	             " # CUDA WARNING: arch: cuda %i cpp %i\n\n",
	             shared_data.compile_settings.arch, CUDA_ARCH);
	  }

	  // per-atom mirrors, created later in checkResize()

	  cu_x = 0;
	  cu_v = 0;
	  cu_f = 0;
	  cu_tag = 0;
	  cu_type = 0;
	  cu_mask = 0;
	  cu_image = 0;
	  cu_xhold = 0;
	  cu_q = 0;
	  cu_rmass = 0;
	  cu_mass = 0;
	  cu_virial = 0;
	  cu_eatom = 0;
	  cu_vatom = 0;
	  cu_radius = 0;
	  cu_density = 0;
	  cu_omega = 0;
	  cu_torque = 0;

	  // created in allocate(); the destructor deletes them unconditionally,
	  // so they must be null when activate() was never reached

	  cu_eng_vdwl = 0;
	  cu_eng_coul = 0;
	  cu_extent = 0;
	  cu_debugdata = 0;

	  cu_special = 0;
	  cu_nspecial = 0;

	  cu_molecule = 0;

	  cu_x_type = 0;
	  x_type = 0;
	  cu_v_radius = 0;
	  v_radius = 0;
	  cu_omega_rmass = 0;
	  omega_rmass = 0;

	  binned_id = 0;
	  cu_binned_id = 0;
	  binned_idnew = 0;
	  cu_binned_idnew = 0;

	  cu_map_array = 0;

	  copy_buffer = 0;
	  copy_buffersize = 0;

	  neighbor_decide_by_integrator = 0;
	  pinned = true;

	  debugdata = 0;

	  finished_setup = false;
	  begin_setup = false;
	  finished_run = false;

	  setSharedDataZero();

	  uploadtime = 0;
	  downloadtime = 0;
	  dotiming = false;

	  dotestatom = false;
	  testatom = 0;
	  oncpu = true;

	  self_comm = 0;
	  MYDBG(printf("# CUDA: Cuda::Cuda Done...\n");)
	}

	/* ---------------------------------------------------------------------- */

	// Print the optional timing report, then free the device list, every
	// mirror object owned by this class and all registered CudaNeighList
	// objects. Deleting a null pointer is a no-op, so members that were never
	// allocated are harmless as long as they were initialised to 0.
	// NOTE(review): shared_data.flag obtained from CudaWrapper_AllocCudaData()
	// in allocate() is not released here - confirm whether the wrapper cleans
	// it up elsewhere.
	Cuda::~Cuda()
	{
	  print_timings();

	  if (universe->me == 0) printf("# CUDA: Free memory...\n");

	  delete [] devicelist;

	  delete cu_q;
	  delete cu_x;
	  delete cu_v;
	  delete cu_f;
	  delete cu_tag;
	  delete cu_type;
	  delete cu_mask;
	  delete cu_image;
	  delete cu_xhold;
	  delete cu_mass;
	  delete cu_rmass;
	  delete cu_virial;
	  delete cu_eng_vdwl;
	  delete cu_eng_coul;
	  delete cu_extent;
	  delete cu_eatom;
	  delete cu_vatom;
	  delete cu_radius;
	  delete cu_density;
	  delete cu_omega;
	  delete cu_torque;
	  delete cu_molecule;

	  // allocated in checkResize() but previously never released
	  delete cu_special;
	  delete cu_nspecial;

	  delete cu_x_type;
	  delete [] x_type;
	  delete cu_v_radius;
	  delete [] v_radius;
	  delete cu_omega_rmass;
	  delete [] omega_rmass;

	  delete cu_debugdata;
	  delete[] debugdata;

	  delete cu_map_array;

	  // the map stores owning pointers to the CUDA-side neighbor lists
	  for (std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
	       p != neigh_lists.end(); ++p)
	    delete p->second;
	}

	/* ----------------------------------------------------------------------
	package cuda command
	can be invoked multiple times: -c on, -pk, package command
	can only init GPUs once in activate(), so just store params here
	------------------------------------------------------------------------- */

	void Cuda::accelerator(int narg, char **arg)
	{
	// this error should not happen

	if (device_set) error->all(FLERR,"USER-CUDA device is already activated");

	// pppn = # of GPUs/node

	pppn = force->inumeric(FLERR,arg[0]);
	if (pppn <= 0) error->all(FLERR,"Illegal package cuda command");

	// optional args

	delete [] devicelist;
	devicelist = NULL;
	int newtonflag = 0;

	int iarg = 1;
	while (iarg < narg) {
	if (strcmp(arg[iarg],"newton") == 0) {
	if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
	if (strcmp(arg[iarg+1],"off") == 0) newtonflag = 0;
	else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
	else error->all(FLERR,"Illegal package cuda command");
	} else if (strcmp(arg[iarg],"gpuID") == 0) {
	if (iarg+pppn+1 > narg) error->all(FLERR,"Illegal package cuda command");
	devicelist = new int[pppn];
	for (int k = 0; k < pppn; k++)
	devicelist[k] = force->inumeric(FLERR,arg[iarg+k+1]);
	iarg += pppn + 1;
	} else if (strcmp(arg[iarg],"timing") == 0) {
	dotiming = true;
	iarg++;
	} else if (strcmp(arg[iarg],"test") == 0) {
	if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
	testatom = force->numeric(FLERR,arg[iarg+1]);
	dotestatom = true;
	iarg += 2;
	} else if (strcmp(arg[iarg],"thread") == 0) {
	if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
	if (strcmp(arg[iarg+1],"auto") == 0)
	shared_data.pair.override_block_per_atom = -1;
	else if (strcmp(arg[iarg+1],"tpa") == 0)
	shared_data.pair.override_block_per_atom = 0;
	else if (strcmp(arg[iarg+1],"bpa") == 0)
	shared_data.pair.override_block_per_atom = 1;
	else error->all(FLERR,"Illegal package cuda command");
	iarg += 2;
	}

	// undocumented options

	else if (strcmp(arg[iarg],"suffix") == 0) {
	if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
	strcpy(lmp->suffix,arg[iarg+1]);
	iarg += 2;
	} else if (strcmp(arg[iarg],"overlap_comm") == 0) {
	shared_data.overlap_comm = 1;
	iarg++;
	} else if (strcmp(arg[iarg],"pinned") == 0) {
	if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
	pinned = force->inumeric(FLERR,arg[iarg+1]) == 0 ? false : true;
	if ((pinned == false) && (universe->me == 0))
	printf(" #CUDA: Pinned memory is not used for communication\n");
	iarg += 2;
	} else error->all(FLERR,"Illegal package cuda command");
	}

	// set newton flags

	force->newton = force->newton_pair = force->newton_bond = newtonflag;
	}

	/* ----------------------------------------------------------------------
	activate the GPUs
	only done once with whatever settings used by the last package command
	------------------------------------------------------------------------- */

	// One-shot GPU bring-up: calls CudaWrapper_Init() with the stored pppn and
	// devicelist, adds three streams, resets the mirror pointers and finally
	// runs allocate(). Every call after the first returns immediately.
	void Cuda::activate()
	{
	  if (!device_set) {
	    device_set = true;

	    if (universe->me == 0) printf("# CUDA: Activate GPU \n");

	    CudaWrapper_Init(0, (char**)0, universe->me, pppn, devicelist);

	    // streams are added unconditionally (the overlap_comm test is disabled)
	    CudaWrapper_AddStreams(3);

	    // core per-atom mirrors
	    cu_x = 0;
	    cu_v = 0;
	    cu_f = 0;
	    cu_xhold = 0;
	    cu_tag = 0;
	    cu_type = 0;
	    cu_mask = 0;
	    cu_image = 0;

	    // optional per-atom and per-type properties
	    cu_q = 0;
	    cu_mass = 0;
	    cu_rmass = 0;
	    cu_radius = 0;
	    cu_density = 0;
	    cu_omega = 0;
	    cu_torque = 0;
	    cu_molecule = 0;
	    cu_special = 0;
	    cu_nspecial = 0;

	    // accumulators
	    cu_virial = 0;
	    cu_eatom = 0;
	    cu_vatom = 0;

	    // packed arrays
	    cu_x_type = 0;
	    cu_v_radius = 0;
	    cu_omega_rmass = 0;

	    // binning
	    cu_binned_id = 0;
	    cu_binned_idnew = 0;

	    allocate();
	  }
	}

	/* ---------------------------------------------------------------------- */

	void Cuda::setSharedDataZero()
	{
	MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
	shared_data.atom.nlocal = 0;
	shared_data.atom.nghost = 0;
	shared_data.atom.nall = 0;
	shared_data.atom.nmax = 0;
	shared_data.atom.ntypes = 0;
	shared_data.atom.q_flag = 0;
	shared_data.atom.need_eatom = 0;
	shared_data.atom.need_vatom = 0;
	shared_data.atom.update_nmax = 1;
	shared_data.atom.update_nlocal = 1;
	shared_data.atom.update_neigh = 1;

	shared_data.pair.cudable_force = 0;
	shared_data.pair.collect_forces_later = 0;
	shared_data.pair.use_block_per_atom = 0;
	shared_data.pair.override_block_per_atom = -1;
	shared_data.pair.cut = 0;
	shared_data.pair.cutsq = 0;
	shared_data.pair.cut_inner = 0;
	shared_data.pair.cut_coul = 0;
	shared_data.pair.special_lj = 0;
	shared_data.pair.special_coul = 0;

	shared_data.pair.neighall = false;

	shared_data.pppm.cudable_force = 0;

	shared_data.buffersize = 0;
	shared_data.buffer_new = 1;
	shared_data.buffer = NULL;

	shared_data.comm.comm_phase = 0;
	shared_data.overlap_comm = 0;

	shared_data.comm.buffer = NULL;
	shared_data.comm.buffer_size = 0;
	shared_data.comm.overlap_split_ratio = 0;
	// setTimingsZero();
	}

	void Cuda::allocate()
	{
	MYDBG(printf("# CUDA: Cuda::allocate ...\n");)

	if(not cu_virial) {
	cu_virial = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.virial , 6);
	cu_eng_vdwl = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.eng_vdwl , 1);
	cu_eng_coul = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.eng_coul , 1);
	cu_extent = new cCudaData<double, double, x> (extent, 6);
	shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));
	int size = 2 * CUDA_MAX_DEBUG_SIZE;
	debugdata = new int[size];
	cu_debugdata = new cCudaData<int, int, x > (debugdata , size);
	shared_data.debugdata = cu_debugdata->dev_data();
	}

	checkResize();
	setSystemParams();
	MYDBG(printf("# CUDA: Cuda::allocate done...\n");)
	}

	void Cuda::setSystemParams()
	{
	MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");)
	shared_data.atom.nlocal = atom->nlocal;
	shared_data.atom.nghost = atom->nghost;
	shared_data.atom.nall = atom->nlocal + atom->nghost;
	shared_data.atom.ntypes = atom->ntypes;
	shared_data.atom.q_flag = atom->q_flag;
	shared_data.atom.rmass_flag = atom->rmass_flag;
	MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");)
	}

	void Cuda::setDomainParams()
	{
	MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");)
	cuda_shared_domain* cu_domain = &shared_data.domain;

	cu_domain->triclinic = domain->triclinic;

	for(short i = 0; i < 3; ++i) {
	cu_domain->periodicity[i] = domain->periodicity[i];
	cu_domain->sublo[i] = domain->sublo[i];
	cu_domain->subhi[i] = domain->subhi[i];
	cu_domain->boxlo[i] = domain->boxlo[i];
	cu_domain->boxhi[i] = domain->boxhi[i];
	cu_domain->prd[i] = domain->prd[i];
	}

	if(domain->triclinic) {
	for(short i = 0; i < 3; ++i) {
	cu_domain->boxlo_lamda[i] = domain->boxlo_lamda[i];
	cu_domain->boxhi_lamda[i] = domain->boxhi_lamda[i];
	cu_domain->prd_lamda[i] = domain->prd_lamda[i];
	cu_domain->sublo[i] = domain->sublo_lamda[i];
	cu_domain->subhi[i] = domain->subhi_lamda[i];
	}

	cu_domain->xy = domain->xy;
	cu_domain->xz = domain->xz;
	cu_domain->yz = domain->yz;
	}

	for(int i = 0; i < 6; i++) {
	cu_domain->h[i] = domain->h[i];
	cu_domain->h_inv[i] = domain->h_inv[i];
	cu_domain->h_rate[i] = domain->h_rate[i];
	}

	cu_domain->update = 2;
	MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");)
	}

	void Cuda::checkResize()
	{
	MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
	cuda_shared_atom* cu_atom = & shared_data.atom;
	cu_atom->q_flag = atom->q_flag;
	cu_atom->rmass_flag = atom->rmass ? 1 : 0;
	cu_atom->nall = atom->nlocal + atom->nghost;
	cu_atom->nlocal = atom->nlocal;
	cu_atom->nghost = atom->nghost;

	// do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
	if(atom->nmax > cu_atom->nmax \|\| cu_tag == NULL) {
	delete cu_x;
	cu_x = new cCudaData<double, X_CFLOAT, yx> ((double*)atom->x , & cu_atom->x , atom->nmax, 3, 0, true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
	delete cu_v;
	cu_v = new cCudaData<double, V_CFLOAT, yx> ((double*)atom->v, & cu_atom->v , atom->nmax, 3);
	delete cu_f;
	cu_f = new cCudaData<double, F_CFLOAT, yx> ((double*)atom->f, & cu_atom->f , atom->nmax, 3, 0, true);
	delete cu_tag;
	cu_tag = new cCudaData<int , int , x > (atom->tag , & cu_atom->tag , atom->nmax, 0, true);
	delete cu_type;
	cu_type = new cCudaData<int , int , x > (atom->type , & cu_atom->type , atom->nmax, 0, true);
	delete cu_mask;
	cu_mask = new cCudaData<int , int , x > (atom->mask , & cu_atom->mask , atom->nmax, 0, true);
	delete cu_image;
	cu_image = new cCudaData<int , int , x > (atom->image , & cu_atom->image , atom->nmax, 0, true);

	if(atom->rmass) {
	delete cu_rmass;
	cu_rmass = new cCudaData<double, V_CFLOAT, x > (atom->rmass , & cu_atom->rmass , atom->nmax);
	}

	if(cu_atom->q_flag) {
	delete cu_q;
	cu_q = new cCudaData<double, F_CFLOAT, x > ((double*)atom->q, & cu_atom->q , atom->nmax, 0 , true);
	}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}

	if(atom->radius) {
	delete cu_radius;
	cu_radius = new cCudaData<double, X_CFLOAT, x > (atom->radius , & cu_atom->radius , atom->nmax);
	delete cu_v_radius;
	cu_v_radius = new cCudaData<V_CFLOAT, V_CFLOAT, x> (v_radius , & cu_atom->v_radius , atom->nmax * 4);
	delete cu_omega_rmass;
	cu_omega_rmass = new cCudaData<V_CFLOAT, V_CFLOAT, x> (omega_rmass , & cu_atom->omega_rmass , atom->nmax * 4);
	}

	if(atom->omega) {
	delete cu_omega;
	cu_omega = new cCudaData<double, V_CFLOAT, yx > (((double*) atom->omega) , & cu_atom->omega , atom->nmax, 3);
	}

	if(atom->torque) {
	delete cu_torque;
	cu_torque = new cCudaData<double, F_CFLOAT, yx > (((double*) atom->torque) , & cu_atom->torque , atom->nmax, 3);
	}

	if(atom->special) {
	delete cu_special;
	cu_special = new cCudaData<int, int, yx > (((int*) & (atom->special[0][0])) , & cu_atom->special , atom->nmax, atom->maxspecial, 0 , true);
	shared_data.atom.maxspecial = atom->maxspecial;
	}

	if(atom->nspecial) {
	delete cu_nspecial;
	cu_nspecial = new cCudaData<int, int, yx > (((int*) atom->nspecial) , & cu_atom->nspecial , atom->nmax, 3, 0, true);
	}

	if(atom->molecule) {
	delete cu_molecule;
	cu_molecule = new cCudaData<int, int, x > (((int*) atom->molecule) , & cu_atom->molecule , atom->nmax, 0 , true);
	}

	shared_data.atom.special_flag = neighbor->special_flag;
	shared_data.atom.molecular = atom->molecular;

	cu_atom->update_nmax = 2;
	cu_atom->nmax = atom->nmax;

	delete cu_x_type;
	cu_x_type = new cCudaData<X_CFLOAT, X_CFLOAT, x> (x_type , & cu_atom->x_type , atom->nmax * 4);
	}

	if(((cu_xhold == NULL) \|\| (cu_xhold->get_dim()[0] < neighbor->maxhold)) && neighbor->xhold) {
	delete cu_xhold;
	cu_xhold = new cCudaData<double, X_CFLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3);
	shared_data.atom.maxhold = neighbor->maxhold;
	}

	if(atom->mass && !cu_mass) {
	cu_mass = new cCudaData<double, V_CFLOAT, x > (atom->mass , & cu_atom->mass , atom->ntypes + 1);
	}

	cu_atom->mass_host = atom->mass;

	if(atom->map_style == 1) {
	if((cu_map_array == NULL)) {
	cu_map_array = new cCudaData<int, int, x > (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size());
	} else if(cu_map_array->dev_size() / sizeof(int) < atom->get_map_size()) {
	delete cu_map_array;
	cu_map_array = new cCudaData<int, int, x > (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size());
	}
	}


	// if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer
	if(cu_x ->get_host_data() != atom->x) cu_x ->set_host_data((double*)(atom->x));

	if(cu_v ->get_host_data() != atom->v) cu_v ->set_host_data((double*)(atom->v));

	if(cu_f ->get_host_data() != atom->f) cu_f ->set_host_data((double*)(atom->f));

	if(cu_tag ->get_host_data() != atom->tag) cu_tag ->set_host_data(atom->tag);

	if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type);

	if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask);

	if(cu_image->get_host_data() != atom->image) cu_mask->set_host_data(atom->image);

	if(cu_xhold)
	if(cu_xhold->get_host_data() != neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold));

	if(atom->rmass)
	if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*)(atom->rmass));

	if(cu_atom->q_flag)
	if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*)(atom->q));

	if(atom->radius)
	if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*)(atom->radius));

	if(atom->omega)
	if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*)(atom->omega));

	if(atom->torque)
	if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*)(atom->torque));

	if(atom->special)
	if(cu_special->get_host_data() != atom->special) {
	delete cu_special;
	cu_special = new cCudaData<int, int, yx > (((int*) atom->special) , & cu_atom->special , atom->nmax, atom->maxspecial);
	shared_data.atom.maxspecial = atom->maxspecial;
	}

	if(atom->nspecial)
	if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*)(atom->nspecial));

	if(atom->molecule)
	if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*)(atom->molecule));

	if(force)
	if(cu_virial ->get_host_data() != force->pair->virial) cu_virial ->set_host_data(force->pair->virial);

	if(force)
	if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl) cu_eng_vdwl ->set_host_data(&force->pair->eng_vdwl);

	if(force)
	if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul) cu_eng_coul ->set_host_data(&force->pair->eng_coul);

	cu_atom->update_nlocal = 2;
	MYDBG(printf("# CUDA: Cuda::checkResize done...\n");)
	}

	// Prepare the per-atom energy and/or virial mirrors for a force call.
	//   eflag_atom  non-zero: set up cu_eatom
	//   vflag_atom  non-zero: set up cu_vatom
	// A mirror is created on first use, re-attached to the pair style's host
	// array and cleared on the device. If its first dimension differs from
	// atom->nmax only update_nmax is set to 2 (re-allocation is disabled).
	void Cuda::evsetup_eatom_vatom(int eflag_atom, int vflag_atom)
	{
	  if (eflag_atom) {
	    if (!cu_eatom) {
	      cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > (force->pair->eatom, & (shared_data.atom.eatom) , atom->nmax);
	    }

	    const bool eatom_size_differs = (cu_eatom->get_dim()[0] != atom->nmax);
	    if (eatom_size_differs) shared_data.atom.update_nmax = 2;

	    cu_eatom->set_host_data(force->pair->eatom);
	    cu_eatom->memset_device(0);
	  }

	  if (vflag_atom) {
	    if (!cu_vatom) {
	      // six virial components per atom
	      cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax , 6);
	    }

	    const bool vatom_size_differs = (cu_vatom->get_dim()[0] != atom->nmax);
	    if (vatom_size_differs) shared_data.atom.update_nmax = 2;

	    cu_vatom->set_host_data((double*)force->pair->vatom);
	    cu_vatom->memset_device(0);
	  }
	}

	void Cuda::uploadAll()
	{
	MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)
	my_times starttime;
	my_times endtime;

	if(atom->nmax != shared_data.atom.nmax) checkResize();

	my_gettime(CLOCK_REALTIME, &starttime);
	cu_x ->upload();
	cu_v ->upload();
	cu_f ->upload();
	cu_tag ->upload();
	cu_type->upload();
	cu_mask->upload();
	cu_image->upload();

	if(shared_data.atom.q_flag) cu_q ->upload();

	if(atom->rmass) cu_rmass->upload();

	if(atom->radius) cu_radius->upload();

	if(atom->omega) cu_omega->upload();

	if(atom->torque) cu_torque->upload();

	if(atom->special) cu_special->upload();

	if(atom->nspecial) cu_nspecial->upload();

	if(atom->molecule) cu_molecule->upload();

	if(cu_eatom) cu_eatom->upload();

	if(cu_vatom) cu_vatom->upload();

	my_gettime(CLOCK_REALTIME, &endtime);
	uploadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
	CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
	CUDA_IF_BINNING(Cuda_Binning(& shared_data);)

	shared_data.atom.triggerneighsq = neighbor->triggersq;
	MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");)
	}

	void Cuda::downloadAll()
	{
	MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
	my_times starttime;
	my_times endtime;

	if(atom->nmax != shared_data.atom.nmax) checkResize();

	CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);)
	my_gettime(CLOCK_REALTIME, &starttime);
	cu_x ->download();
	cu_v ->download();
	cu_f ->download();
	cu_type->download();
	cu_tag ->download();
	cu_mask->download();
	cu_image->download();

	//if(shared_data.atom.need_eatom) cu_eatom->download();
	//if(shared_data.atom.need_vatom) cu_vatom->download();

	if(shared_data.atom.q_flag) cu_q ->download();

	if(atom->rmass) cu_rmass->download();

	if(atom->radius) cu_radius->download();

	if(atom->omega) cu_omega->download();

	if(atom->torque) cu_torque->download();

	if(atom->special) cu_special->download();

	if(atom->nspecial) cu_nspecial->download();

	if(atom->molecule) cu_molecule->download();

	if(cu_eatom) cu_eatom->download();

	if(cu_vatom) cu_vatom->download();

	my_gettime(CLOCK_REALTIME, &endtime);
	downloadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
	MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
	}

	void Cuda::upload(int datamask)
	{
	MYDBG(printf("# CUDA: Cuda::upload() ... start\n");)
	my_times starttime;
	my_times endtime;

	if(atom->nmax != shared_data.atom.nmax) checkResize();

	my_gettime(CLOCK_REALTIME, &starttime);
	if(X_MASK & datamask) cu_x ->upload();
	if(V_MASK & datamask) cu_v ->upload();
	if(F_MASK & datamask) cu_f ->upload();
	if(TYPE_MASK & datamask) cu_type->upload();
	if(TAG_MASK & datamask) cu_tag ->upload();
	if(MASK_MASK & datamask) cu_mask->upload();
	if(IMAGE_MASK & datamask) cu_image->upload();

	//if(shared_data.atom.need_eatom) cu_eatom->upload();
	//if(shared_data.atom.need_vatom) cu_vatom->upload();

	if(shared_data.atom.q_flag)
	if(Q_MASK & datamask) cu_q ->upload();

	if(atom->rmass)
	if(RMASS_MASK & datamask) cu_rmass->upload();

	if(atom->radius)
	if(RADIUS_MASK & datamask) cu_radius->upload();

	if(atom->omega)
	if(OMEGA_MASK & datamask) cu_omega->upload();

	if(atom->torque)
	if(TORQUE_MASK & datamask) cu_torque->upload();

	if(atom->special)
	if(SPECIAL_MASK & datamask) cu_special->upload();

	if(atom->nspecial)
	if(SPECIAL_MASK & datamask) cu_nspecial->upload();

	if(atom->molecule)
	if(MOLECULE_MASK & datamask) cu_molecule->upload();

	if(cu_eatom) cu_eatom->upload();

	if(cu_vatom) cu_vatom->upload();

	my_gettime(CLOCK_REALTIME, &endtime);
	uploadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
	MYDBG(printf("# CUDA: Cuda::upload() ... end\n");)
	}

	void Cuda::download(int datamask)
	{
	MYDBG(printf("# CUDA: Cuda::download() ... start\n");)
	my_times starttime;
	my_times endtime;

	if(atom->nmax != shared_data.atom.nmax) checkResize();

	CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);)
	my_gettime(CLOCK_REALTIME, &starttime);
	if(X_MASK & datamask) cu_x ->download();
	if(V_MASK & datamask) cu_v ->download();
	if(F_MASK & datamask) cu_f ->download();
	if(TYPE_MASK & datamask) cu_type->download();
	if(TAG_MASK & datamask) cu_tag ->download();
	if(MASK_MASK & datamask) cu_mask->download();
	if(IMAGE_MASK & datamask) cu_image->download();

	//if(shared_data.atom.need_eatom) cu_eatom->download();
	//if(shared_data.atom.need_vatom) cu_vatom->download();

	if(shared_data.atom.q_flag)
	if(Q_MASK & datamask) cu_q ->download();

	if(atom->rmass)
	if(RMASS_MASK & datamask) cu_rmass->download();

	if(atom->radius)
	if(RADIUS_MASK & datamask) cu_radius->download();

	if(atom->omega)
	if(OMEGA_MASK & datamask) cu_omega->download();

	if(atom->torque)
	if(TORQUE_MASK & datamask) cu_torque->download();

	if(atom->special)
	if(SPECIAL_MASK & datamask) cu_special->download();

	if(atom->nspecial)
	if(SPECIAL_MASK & datamask) cu_nspecial->download();

	if(atom->molecule)
	if(MOLECULE_MASK & datamask) cu_molecule->download();

	if(cu_eatom) cu_eatom->download();

	if(cu_vatom) cu_vatom->download();

	my_gettime(CLOCK_REALTIME, &endtime);
	downloadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
	MYDBG(printf("# CUDA: Cuda::download() ... end\n");)
	}

	void Cuda::downloadX()
	{
	Cuda_Pair_RevertXType(& this->shared_data);
	cu_x->download();
	}

	// Return the CudaNeighList associated with neigh_list, creating and
	// registering a new one on first request. Ownership of the created
	// object stays with neigh_lists.
	//   neigh_list  host neighbor list used as lookup key
	CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list)
	{
	  MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");)

	  // key and mapped value are pointers: the key is looked up by address
	  // and the mapped value is returned as CudaNeighList*
	  std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(neigh_list);
	  CudaNeighList* neigh_list_cuda;

	  if(p != neigh_lists.end()) {
	    neigh_list_cuda = p->second;
	  } else {
	    neigh_list_cuda = new CudaNeighList(lmp, neigh_list);
	    neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(neigh_list, neigh_list_cuda));
	  }

	  // single exit point so that the closing debug message is reachable
	  MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");)
	  return neigh_list_cuda;
	}

	// Call nl_upload() on every registered neighbor list. For lists whose
	// build_cuda flag is not set, sneighlist.maxneighbors is raised to the
	// largest numneigh[i] over the first atom->nlocal entries.
	void Cuda::uploadAllNeighborLists()
	{
	  MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");)

	  // the map stores pointers (entries are used through -> below)
	  for (std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
	       p != neigh_lists.end(); ++p) {
	    CudaNeighList* cnl = p->second;
	    cnl->nl_upload();

	    if(not(cnl->neigh_list->cuda_list->build_cuda))
	      for(int i = 0; i < atom->nlocal; i++)
	        cnl->sneighlist.maxneighbors = MAX(cnl->neigh_list->numneigh[i], cnl->sneighlist.maxneighbors) ;
	  }

	  MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");)
	}

	// Call nl_download() on every registered neighbor list.
	void Cuda::downloadAllNeighborLists()
	{
	  MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");)

	  // the map stores pointers (entries are used through -> below)
	  for (std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
	       p != neigh_lists.end(); ++p)
	    p->second->nl_download();
	}

	// Copy the current device positions (cu_x) into the device xhold mirror.
	//   maxhold  in/out: capacity of the host xhold array; set to atom->nmax
	//            when the device mirror has to be (re)created
	//   xhold    host array the (re)created mirror is attached to
	// The mirror is (re)created when it does not exist yet or when the
	// recorded capacity is smaller than atom->nmax.
	void Cuda::update_xhold(int &maxhold, double* xhold)
	{
	  // cu_xhold may still be null here: checkResize() only creates it once
	  // neighbor->xhold exists, and the recorded maxhold is not initialised by
	  // setSharedDataZero(), so the size test alone cannot be relied upon
	  if(this->cu_xhold == NULL || this->shared_data.atom.maxhold < atom->nmax) {
	    maxhold = atom->nmax;
	    delete this->cu_xhold;
	    this->cu_xhold = new cCudaData<double, X_CFLOAT, yx> ((double*)xhold, & this->shared_data.atom.xhold , maxhold, 3);
	  }

	  this->shared_data.atom.maxhold = maxhold;

	  // device-to-device copy of 3 * nmax coordinates
	  CudaWrapper_CopyData(this->cu_xhold->dev_data(), this->cu_x->dev_data(), 3 * atom->nmax * sizeof(X_CFLOAT));
	}

	// Reset every counter in shared_data.cuda_timings to zero and call the
	// four CudaWrapper_Check*Time() functions with argument true.
	// NOTE(review): passing true presumably resets the wrapper's transfer
	// timers - confirm against the CudaWrapper implementation.
	void Cuda::setTimingsZero()
	{
	  shared_data.cuda_timings.test1 = 0;
	  shared_data.cuda_timings.test2 = 0;

	  //communication
	  shared_data.cuda_timings.comm_forward_total = 0;
	  shared_data.cuda_timings.comm_forward_mpi_upper = 0;
	  shared_data.cuda_timings.comm_forward_mpi_lower = 0;
	  shared_data.cuda_timings.comm_forward_kernel_pack = 0;
	  shared_data.cuda_timings.comm_forward_kernel_unpack = 0;
	  // reported by print_timings(), so it has to be reset with the others
	  shared_data.cuda_timings.comm_forward_kernel_self = 0;
	  shared_data.cuda_timings.comm_forward_upload = 0;
	  shared_data.cuda_timings.comm_forward_download = 0;

	  shared_data.cuda_timings.comm_exchange_total = 0;
	  shared_data.cuda_timings.comm_exchange_mpi = 0;
	  shared_data.cuda_timings.comm_exchange_kernel_pack = 0;
	  shared_data.cuda_timings.comm_exchange_kernel_unpack = 0;
	  shared_data.cuda_timings.comm_exchange_kernel_fill = 0;
	  shared_data.cuda_timings.comm_exchange_cpu_pack = 0;
	  shared_data.cuda_timings.comm_exchange_upload = 0;
	  shared_data.cuda_timings.comm_exchange_download = 0;

	  shared_data.cuda_timings.comm_border_total = 0;
	  shared_data.cuda_timings.comm_border_mpi = 0;
	  shared_data.cuda_timings.comm_border_kernel_pack = 0;
	  shared_data.cuda_timings.comm_border_kernel_unpack = 0;
	  shared_data.cuda_timings.comm_border_kernel_buildlist = 0;
	  shared_data.cuda_timings.comm_border_kernel_self = 0;
	  shared_data.cuda_timings.comm_border_upload = 0;
	  shared_data.cuda_timings.comm_border_download = 0;

	  //pair forces
	  shared_data.cuda_timings.pair_xtype_conversion = 0;
	  shared_data.cuda_timings.pair_kernel = 0;
	  shared_data.cuda_timings.pair_virial = 0;
	  shared_data.cuda_timings.pair_force_collection = 0;

	  //neighbor
	  shared_data.cuda_timings.neigh_bin = 0;
	  shared_data.cuda_timings.neigh_build = 0;
	  shared_data.cuda_timings.neigh_special = 0;

	  //PPPM
	  shared_data.cuda_timings.pppm_particle_map = 0;
	  shared_data.cuda_timings.pppm_make_rho = 0;
	  shared_data.cuda_timings.pppm_brick2fft = 0;
	  shared_data.cuda_timings.pppm_poisson = 0;
	  shared_data.cuda_timings.pppm_fillbrick = 0;
	  shared_data.cuda_timings.pppm_fieldforce = 0;
	  shared_data.cuda_timings.pppm_compute = 0;

	  CudaWrapper_CheckUploadTime(true);
	  CudaWrapper_CheckDownloadTime(true);
	  CudaWrapper_CheckCPUBufUploadTime(true);
	  CudaWrapper_CheckCPUBufDownloadTime(true);
	}

	// Write the timing report to stdout. Only rank 0 prints, and only when
	// timing output was requested (dotiming). The PPPM section is emitted
	// only if a kspace style is defined.
	void Cuda::print_timings()
	{
	  if (universe->me != 0 || !dotiming) return;

	  // ---- host <-> device transfers ----
	  printf("\n # CUDA: Special timings\n\n");
	  printf("\n Transfer Times\n");
	  printf(" PCIe Upload: \t %lf s\n", CudaWrapper_CheckUploadTime());
	  printf(" PCIe Download:\t %lf s\n", CudaWrapper_CheckDownloadTime());
	  printf(" CPU Tempbbuf Upload: \t %lf \n", CudaWrapper_CheckCPUBufUploadTime());
	  printf(" CPU Tempbbuf Download: \t %lf \n", CudaWrapper_CheckCPUBufDownloadTime());

	  // ---- communication: forward ----
	  printf("\n Communication \n");

	  printf(" Forward Total \t %lf \n", shared_data.cuda_timings.comm_forward_total);
	  printf(" Forward MPI Upper Bound \t %lf \n", shared_data.cuda_timings.comm_forward_mpi_upper);
	  printf(" Forward MPI Lower Bound \t %lf \n", shared_data.cuda_timings.comm_forward_mpi_lower);
	  printf(" Forward Kernel Pack \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_pack);
	  printf(" Forward Kernel Unpack \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_unpack);
	  printf(" Forward Kernel Self \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_self);
	  printf(" Forward Upload \t %lf \n", shared_data.cuda_timings.comm_forward_upload);
	  printf(" Forward Download \t %lf \n", shared_data.cuda_timings.comm_forward_download);
	  printf(" Forward Overlap Split Ratio\t %lf \n", shared_data.comm.overlap_split_ratio);
	  printf("\n");

	  // ---- communication: exchange ----
	  printf(" Exchange Total \t %lf \n", shared_data.cuda_timings.comm_exchange_total);
	  printf(" Exchange MPI \t %lf \n", shared_data.cuda_timings.comm_exchange_mpi);
	  printf(" Exchange Kernel Pack \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_pack);
	  printf(" Exchange Kernel Unpack \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_unpack);
	  printf(" Exchange Kernel Fill \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_fill);
	  printf(" Exchange CPU Pack \t %lf \n", shared_data.cuda_timings.comm_exchange_cpu_pack);
	  printf(" Exchange Upload \t %lf \n", shared_data.cuda_timings.comm_exchange_upload);
	  printf(" Exchange Download \t %lf \n", shared_data.cuda_timings.comm_exchange_download);
	  printf("\n");

	  // ---- communication: border ----
	  printf(" Border Total \t %lf \n", shared_data.cuda_timings.comm_border_total);
	  printf(" Border MPI \t %lf \n", shared_data.cuda_timings.comm_border_mpi);
	  printf(" Border Kernel Pack \t %lf \n", shared_data.cuda_timings.comm_border_kernel_pack);
	  printf(" Border Kernel Unpack \t %lf \n", shared_data.cuda_timings.comm_border_kernel_unpack);
	  printf(" Border Kernel Self \t %lf \n", shared_data.cuda_timings.comm_border_kernel_self);
	  printf(" Border Kernel BuildList \t %lf \n", shared_data.cuda_timings.comm_border_kernel_buildlist);
	  printf(" Border Upload \t %lf \n", shared_data.cuda_timings.comm_border_upload);
	  printf(" Border Download \t %lf \n", shared_data.cuda_timings.comm_border_download);
	  printf("\n");

	  // ---- pair forces ----
	  printf(" Pair XType Conversion \t %lf \n", shared_data.cuda_timings.pair_xtype_conversion);
	  printf(" Pair Kernel \t %lf \n", shared_data.cuda_timings.pair_kernel);
	  printf(" Pair Virial \t %lf \n", shared_data.cuda_timings.pair_virial);
	  printf(" Pair Force Collection \t %lf \n", shared_data.cuda_timings.pair_force_collection);
	  printf("\n");

	  // ---- neighbor lists ----
	  printf(" Neighbor Binning \t %lf \n", shared_data.cuda_timings.neigh_bin);
	  printf(" Neighbor Build \t %lf \n", shared_data.cuda_timings.neigh_build);
	  printf(" Neighbor Special \t %lf \n", shared_data.cuda_timings.neigh_special);
	  printf("\n");

	  // ---- PPPM, only with a kspace style ----
	  if (force->kspace) {
	    printf(" PPPM Total \t %lf \n", shared_data.cuda_timings.pppm_compute);
	    printf(" PPPM Particle Map \t %lf \n", shared_data.cuda_timings.pppm_particle_map);
	    printf(" PPPM Make Rho \t %lf \n", shared_data.cuda_timings.pppm_make_rho);
	    printf(" PPPM Brick2fft \t %lf \n", shared_data.cuda_timings.pppm_brick2fft);
	    printf(" PPPM Poisson \t %lf \n", shared_data.cuda_timings.pppm_poisson);
	    printf(" PPPM Fillbrick \t %lf \n", shared_data.cuda_timings.pppm_fillbrick);
	    printf(" PPPM Fieldforce \t %lf \n", shared_data.cuda_timings.pppm_fieldforce);
	    printf("\n");
	  }

	  // ---- debug counters ----
	  printf(" Debug Test 1 \t %lf \n", shared_data.cuda_timings.test1);
	  printf(" Debug Test 2 \t %lf \n", shared_data.cuda_timings.test2);

	  printf("\n");
	}

cuda.cppNo OneTemporaryActions

File Metadata

cuda.cppView Options

Event Timeline

cuda.cpp
No OneTemporary
Actions

cuda.cpp
View Options