verlet_cuda.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Jul 25, 17:04

verlet_cuda.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

	Original Version:
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	See the README file in the top-level LAMMPS directory.

	-----------------------------------------------------------------------

	USER-CUDA Package and associated modifications:
	https://sourceforge.net/projects/lammpscuda/

	Christian Trott, christian.trott@tu-ilmenau.de
	Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
	Theoretical Physics II, University of Technology Ilmenau, Germany

	See the README file in the USER-CUDA directory.

	This software is distributed under the GNU General Public License.
	------------------------------------------------------------------------- */


	#include <cstdlib>
	#include <cstdio>
	#include <cstring>
	#include "verlet_cuda.h"
	#include "neighbor.h"
	#include "domain.h"
	#include "comm.h"
	#include "atom.h"
	#include "atom_vec.h"
	#include "force.h"
	#include "pair.h"
	#include "bond.h"
	#include "angle.h"
	#include "dihedral.h"
	#include "improper.h"
	#include "kspace.h"
	#include "output.h"
	#include "update.h"
	#include "modify_cuda.h"
	#include "compute.h"
	#include "fix.h"
	#include "timer.h"
	#include "memory.h"
	#include "error.h"
	#include "cuda_wrapper_cu.h"
	#include "thermo.h"
	#include "cuda_pair_cu.h"
	#include "cuda.h"
	#include <ctime>
	#include <cmath>

	using namespace LAMMPS_NS;

	#define MAKETIMEING


	VerletCuda::VerletCuda(LAMMPS lmp, int narg, char *arg) : Verlet(lmp, narg, arg) {
	cuda = lmp->cuda;
	if(cuda == NULL)
	error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");

	modify_cuda=(ModifyCuda*) modify;
	}

	/* ----------------------------------------------------------------------
	setup before run
	------------------------------------------------------------------------- */

	void VerletCuda::setup()
	{
	//debug related variables
	cuda->debugdata[0]=0;
	cuda->cu_debugdata->upload();
	dotestatom=cuda->dotestatom;
	int testatom=cuda->testatom;//48267;


	MYDBG(printf("# CUDA VerletCuda::setup start\n"); )

	cuda->oncpu = true;
	cuda->begin_setup = true;
	cuda->finished_run = false;

	time_pair=0;
	time_kspace=0;
	time_comm=0;
	time_modify=0;
	time_fulliterate=0;

	atom->setup();

	cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
	cuda_shared_pair* cu_pair = & cuda->shared_data.pair;
	cu_atom->update_nlocal=1;
	cu_atom->update_nmax=1;

	if(atom->molecular\|\|(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;

	cuda->setDomainParams();


	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
	if(cuda->shared_data.me==0)
	printf("# CUDA: Using precision: Global: %u X: %u V: %u F: %u PPPM: %u \n", CUDA_PRECISION==1?4:8,sizeof(X_FLOAT),sizeof(V_FLOAT),sizeof(F_FLOAT),sizeof(PPPM_FLOAT));
	cuda->allocate();


	if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");

	// setup domain, communication and neighboring
	// acquire ghosts
	// build neighbor lists

	if (triclinic) domain->x2lamda(atom->nlocal);
	domain->pbc();
	domain->reset_box();
	comm->setup();
	if (neighbor->style) neighbor->setup_bins();
	comm->exchange();
	if (atom->sortfreq > 0) atom->sort();
	comm->borders();
	if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
	cuda->setSystemParams();
	cuda->checkResize();

	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Upload data...\n");
	cuda->uploadAll();
	neighbor->build();
	neighbor->ncalls = 0;

	if(atom->mass)
	cuda->cu_mass->upload();

	if(cuda->cu_map_array)
	cuda->cu_map_array->upload();

	// compute all forces

	ev_set(update->ntimestep);
	if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
	if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
	if(elist_atom\|\|vlist_atom) cuda->checkResize();

	int test_BpA_vs_TpA = true;
	timespec starttime;
	timespec endtime;
	#ifdef NO_PREC_TIMING
	double startsec,endsec;
	#endif
	//if(atom->molecular\|\|(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false;
	if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair &&(cuda->shared_data.pair.override_block_per_atom<0))
	{
	int StyleLoops=10;
	if(cuda->shared_data.me==0)
	printf("Test TpA\n");
	cuda->shared_data.pair.use_block_per_atom = 0;
	neighbor->build();
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
	force->pair->compute(eflag,vflag);
	CudaWrapper_Sync();
	#ifdef NO_PREC_TIMING
	startsec = 1.0*clock()/CLOCKS_PER_SEC;
	#endif
	clock_gettime(CLOCK_REALTIME,&starttime);
	for(int i=0;i<StyleLoops;i++)
	{
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
	force->pair->compute(eflag,vflag);
	CudaWrapper_Sync();
	}
	clock_gettime(CLOCK_REALTIME,&endtime);

	double TpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	#ifdef NO_PREC_TIMING
	endsec = 1.0*clock()/CLOCKS_PER_SEC;
	TpAtime = endsec - startsec;
	#endif
	if(cuda->shared_data.me==0)
	printf("Test BpA\n");
	cuda->shared_data.pair.use_block_per_atom = 1;
	neighbor->build();
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
	force->pair->compute(eflag,vflag);
	CudaWrapper_Sync();

	clock_gettime(CLOCK_REALTIME,&starttime);
	#ifdef NO_PREC_TIMING
	startsec = 1.0*clock()/CLOCKS_PER_SEC;
	#endif
	for(int i=0;i<StyleLoops;i++)
	{
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
	force->pair->compute(eflag,vflag);
	CudaWrapper_Sync();
	}
	clock_gettime(CLOCK_REALTIME,&endtime);
	double BpAtime=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	#ifdef NO_PREC_TIMING
	endsec = 1.0*clock()/CLOCKS_PER_SEC;
	BpAtime = endsec - startsec;
	#endif

	if(cuda->shared_data.me==0)
	printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n",StyleLoops);
	if(cuda->shared_data.me==0)
	printf("# CUDA: BpA TpA\n %lf %lf\n",BpAtime,TpAtime);
	if(BpAtime>TpAtime) cuda->shared_data.pair.use_block_per_atom = 0;
	}
	else
	cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom;
	//cuda->shared_data.pair.use_block_per_atom = 0;
	if(atom->molecular\|\|(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
	neighbor->build();
	neighbor->ncalls = 0;

	force_clear();

	cuda->cu_f->download();
	if(cuda->cu_torque)
	cuda->cu_torque->download();

	//printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

	MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )

	test_atom(testatom,"pre pair force");

	if(cuda->shared_data.pair.cudable_force)
	{
	cuda->uploadAll();
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
	}

	if (force->pair) force->pair->compute(eflag,vflag);

	if(cuda->shared_data.pair.cudable_force)
	{
	if(cuda->shared_data.pair.collect_forces_later)
	{
	if(eflag) cuda->cu_eng_vdwl->upload();
	if(eflag) cuda->cu_eng_coul->upload();
	if(vflag) cuda->cu_virial->upload();
	Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	if(eflag) cuda->cu_eng_vdwl->download();
	if(eflag) cuda->cu_eng_coul->download();
	if(vflag) cuda->cu_virial->download();
	}
	cuda->downloadAll();
	}

	test_atom(testatom,"post pair force");

	MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
	//printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

	if (atom->molecular) {
	if (force->bond) force->bond->compute(eflag,vflag);
	if (force->angle) force->angle->compute(eflag,vflag);
	if (force->dihedral) force->dihedral->compute(eflag,vflag);
	if (force->improper) force->improper->compute(eflag,vflag);
	}


	if(cuda->shared_data.pppm.cudable_force)
	{
	cuda->cu_tag ->upload();
	cuda->cu_type->upload();
	cuda->cu_x ->upload();
	cuda->cu_v ->upload();
	cuda->cu_f ->upload();
	if(cu_atom->q_flag) cuda->cu_q->upload();
	}
	if (force->kspace) {
	force->kspace->setup();
	force->kspace->compute(eflag,vflag);
	}
	if(cuda->shared_data.pppm.cudable_force)
	{
	cuda->cu_f ->download();
	}

	test_atom(testatom,"post kspace");

	cuda->uploadAll();
	if (force->newton) comm->reverse_comm();
	cuda->downloadAll();

	test_atom(testatom,"post reverse comm");

	if(cuda->shared_data.me==0)
	printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);

	MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
	modify->setup(vflag);

	MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup done\n"); )
	output->setup(1);

	test_atom(testatom,"post setup");

	MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
	cuda->finished_setup = true;
	cuda->oncpu = false;
	}


	//this routine is in a messy state
	void VerletCuda::setup_minimal(int flag)
	{


	dotestatom=0;
	int testatom=104;
	cuda->oncpu = true;
	cuda->begin_setup = true;
	cuda->finished_run = false;
	MYDBG(printf("# CUDA VerletCuda::setup start\n"); )
	time_pair=0;
	time_kspace=0;
	time_comm=0;
	time_modify=0;
	time_fulliterate=0;

	//cuda->allocate();

	cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;
	cuda_shared_pair* cu_pair = & cuda->shared_data.pair;
	cu_atom->update_nlocal=1;
	cu_atom->update_nmax=1;

	if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true;

	cuda->setDomainParams();



	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
	cuda->allocate();




	// setup domain, communication and neighboring
	// acquire ghosts
	// build neighbor lists

	if (flag) {
	if (triclinic) domain->x2lamda(atom->nlocal);
	domain->pbc();
	domain->reset_box();
	comm->setup();
	if (neighbor->style) neighbor->setup_bins();
	comm->exchange();
	comm->borders();
	if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
	cuda->setSystemParams();
	cuda->checkResize();
	neighbor->build();
	neighbor->ncalls = 0;
	}

	if(cuda->shared_data.me==0)
	printf("# CUDA: VerletCuda::setup: Upload data...\n");
	cuda->uploadAll();
	cuda->uploadAllNeighborLists();
	if(atom->mass)
	cuda->cu_mass->upload();

	if(cuda->cu_map_array)
	cuda->cu_map_array->upload();

	// compute all forces

	ev_set(update->ntimestep);
	if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
	if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
	if(elist_atom\|\|vlist_atom) cuda->checkResize();

	force_clear();
	cuda->cu_f->download();

	//printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

	cuda->cu_mass->upload();
	MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute\n"); )

	test_atom(testatom,"pre pair force");

	if(cuda->shared_data.pair.cudable_force)
	{
	cuda->uploadAll();
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
	}

	if (force->pair) force->pair->compute(eflag,vflag);

	if(cuda->shared_data.pair.cudable_force)
	{
	if(cuda->shared_data.pair.collect_forces_later)
	{
	if(eflag) cuda->cu_eng_vdwl->upload();
	if(eflag) cuda->cu_eng_coul->upload();
	if(vflag) cuda->cu_virial->upload();
	Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	if(eflag) cuda->cu_eng_vdwl->download();
	if(eflag) cuda->cu_eng_coul->download();
	if(vflag) cuda->cu_virial->download();
	}
	cuda->downloadAll();
	}

	test_atom(testatom,"post pair force");

	MYDBG( printf("# CUDA: VerletCuda::setup: initial force compute done\n"); )
	//printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);

	if (atom->molecular) {
	if (force->bond) force->bond->compute(eflag,vflag);
	if (force->angle) force->angle->compute(eflag,vflag);
	if (force->dihedral) force->dihedral->compute(eflag,vflag);
	if (force->improper) force->improper->compute(eflag,vflag);
	}


	if(cuda->shared_data.pppm.cudable_force)
	{
	cuda->cu_tag ->upload();
	cuda->cu_type->upload();
	cuda->cu_x ->upload();
	cuda->cu_v ->upload();
	cuda->cu_f ->upload();
	if(cu_atom->q_flag) cuda->cu_q->upload();
	}
	if (force->kspace) {
	force->kspace->setup();
	force->kspace->compute(eflag,vflag);
	}
	if(cuda->shared_data.pppm.cudable_force)
	{
	cuda->cu_f ->download();
	}

	test_atom(testatom,"post kspace");

	cuda->uploadAll();
	if (force->newton) comm->reverse_comm();
	cuda->downloadAll();

	test_atom(testatom,"post reverse comm");

	if(cuda->shared_data.me==0)
	printf("# CUDA: Total Device Memory useage post setup: %lf MB\n",1.0*CudaWrapper_CheckMemUseage()/1024/1024);

	MYDBG( printf("# CUDA: VerletCuda::setup: call modify setup\n"); )
	modify->setup(vflag);

	MYDBG( printf("# CUDA: VerletCuda::setup: done\n"); )
	cuda->finished_setup=true;
	cuda->oncpu=false;
	}

	//#define TESTATOM
	/* ----------------------------------------------------------------------
	iterate for n steps
	------------------------------------------------------------------------- */

	void VerletCuda::run(int n)
	{
	dotestatom=cuda->dotestatom;
	int testatom=cuda->testatom;//48267;


	timespec starttime;
	timespec endtime;
	timespec starttotal;
	timespec endtotal;

	cuda->setTimingsZero();

	static double testtime=0.0;
	// clock_gettime(CLOCK_REALTIME,&starttime);
	// clock_gettime(CLOCK_REALTIME,&endtime);
	// testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	// printf("Time: %lf\n",testtime);*/


	cuda_shared_domain* cu_domain = & cuda->shared_data.domain;

	int nflag,ntimestep,sortflag;

	int n_initial_integrate = modify_cuda->n_initial_integrate;
	int n_post_integrate = modify_cuda->n_post_integrate;
	int n_final_integrate = modify_cuda->n_final_integrate;
	int n_pre_exchange = modify_cuda->n_pre_exchange;
	int n_pre_neighbor = modify_cuda->n_pre_neighbor;
	int n_pre_force = modify_cuda->n_pre_force;
	int n_post_force = modify_cuda->n_post_force;
	int n_end_of_step = modify_cuda->n_end_of_step;
	MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n",
	n_initial_integrate,n_post_integrate,n_final_integrate,n_pre_exchange,n_pre_neighbor,n_pre_force,n_post_force,n_end_of_step);)

	if (atom->sortfreq > 0) sortflag = 1;
	else sortflag = 0;


	if(cuda->shared_data.me==0)
	{
	if((not cuda->shared_data.pair.cudable_force)&&(force->pair))
	error->warning(FLERR,"# CUDA: You asked for a Verlet integration using Cuda, "
	"but selected a pair force which has not yet been ported to Cuda");
	if((not cuda->shared_data.pppm.cudable_force)&&(force->kspace))
	error->warning(FLERR,"# CUDA: You asked for a Verlet integration using Cuda, "
	"but selected a kspace force which has not yet been ported to Cuda");
	if(modify_cuda->n_post_integrate_host+modify_cuda->n_pre_exchange_host+modify_cuda->n_pre_neighbor_host+modify_cuda->n_pre_force_host+modify_cuda->n_post_force_host+modify_cuda->n_end_of_step_host+modify_cuda->n_initial_integrate_host+modify_cuda->n_final_integrate_host)
	error->warning(FLERR,"# CUDA: You asked for a Verlet integration using Cuda, "
	"but several fixes have not yet been ported to Cuda.\n"
	"This can cause a severe speed penalty due to frequent data synchronization between host and GPU.");
	if(atom->firstgroupname)
	error->warning(FLERR,"Warning: firstgroupname is used, this will cause additional data transfers.");
	}
	cuda->uploadAll();

	if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold)
	{
	const int n=cuda->shared_data.atom.maxhold;
	CudaWrapper_CopyData(cuda->cu_xhold->dev_data(),cuda->cu_x->dev_data(),n*sizeof(X_FLOAT));
	CudaWrapper_CopyData((void) &((X_FLOAT)cuda->cu_xhold->dev_data())[n],(void) &((X_FLOAT)cuda->cu_x->dev_data())[atom->nmax],n*sizeof(X_FLOAT));
	CudaWrapper_CopyData((void) &((X_FLOAT)cuda->cu_xhold->dev_data())[2n],(void) &((X_FLOAT)cuda->cu_x->dev_data())[2atom->nmax],n*sizeof(X_FLOAT));
	}

	cuda->shared_data.atom.reneigh_flag=0;
	cuda->shared_data.atom.update_nlocal=1;
	cuda->shared_data.atom.update_nmax=1;
	cuda->shared_data.atom.update_neigh=1;
	cuda->shared_data.domain.update=1;
	cuda->shared_data.buffer_new=1;
	cuda->uploadtime=0;
	cuda->downloadtime=0;
	int firstreneigh=1;

	for(int i = 0; i < n; i++)
	{
	ntimestep = ++update->ntimestep;
	ev_set(ntimestep);

	// initial time integration

	test_atom(testatom,"Pre initial");

	MYDBG( printf("# CUDA VerletCuda::iterate: before initial_integrate\n"); )

	modify->initial_integrate(vflag);

	MYDBG( printf("# CUDA VerletCuda::iterate: after initial_integrate\n"); )

	if(n_post_integrate) modify->post_integrate();



	// regular communication vs neighbor list rebuild

	test_atom(testatom,"Pre Exchange");

	MYDBG( printf("# CUDA VerletCuda::iterate: before neighbor decide\n"); )
	nflag = neighbor->decide();
	if(nflag == 0)
	{
	MYDBG( printf("# CUDA VerletCuda::iterate: communicate\n"); )
	timer->stamp();

	if((not (eflag\|\|vflag))&&(cuda->shared_data.overlap_comm))
	{
	//overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
	//build communication buffers
	// printf("Pre forward_comm(1)\n");
	clock_gettime(CLOCK_REALTIME,&starttotal);
	cuda->shared_data.atom.reneigh_flag=0;
	clock_gettime(CLOCK_REALTIME,&starttime);
	timer->stamp();
	comm->forward_comm(1);
	timer->stamp(TIME_COMM);
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.comm_forward_total+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

	//prepare force calculation
	// printf("Pre force_clear\n");
	force_clear();
	// printf("Pre Generate XType\n");
	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

	//start force calculation asynchronus
	cuda->shared_data.comm.comm_phase=1;
	force->pair->compute(eflag, vflag);
	timer->stamp(TIME_PAIR);
	//CudaWrapper_Sync();

	//download comm buffers from GPU, perform MPI communication and upload buffers again
	clock_gettime(CLOCK_REALTIME,&starttime);
	comm->forward_comm(2);
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.comm_forward_total+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	timer->stamp(TIME_COMM);

	//wait for force calculation
	CudaWrapper_Sync();
	timer->stamp(TIME_PAIR);

	//unpack communication buffers
	clock_gettime(CLOCK_REALTIME,&starttime);
	comm->forward_comm(3);
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.comm_forward_total+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

	timer->stamp(TIME_COMM);
	MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
	cuda->shared_data.cuda_timings.test1+=
	endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000;
	}
	else
	{
	//perform standard forward communication
	clock_gettime(CLOCK_REALTIME,&starttime);
	comm->forward_comm();
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.comm_forward_total+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	timer->stamp(TIME_COMM);
	MYDBG( printf("# CUDA VerletCuda::iterate: communicate done\n"); )
	}
	}
	else
	{
	int nlocalold=cuda->shared_data.atom.nlocal;
	if(firstreneigh)
	{
	cuda->shared_data.atom.update_nlocal=1;
	cuda->shared_data.atom.update_nmax=1;
	firstreneigh=0;
	}
	cuda->shared_data.buffer_new=1;
	MYDBG( printf("# CUDA VerletCuda::iterate: neighbor\n"); )
	cuda->setDomainParams();
	if(n_pre_exchange) modify->pre_exchange();
	if(atom->nlocal!=cuda->shared_data.atom.nlocal) //did someone add atoms during pre_exchange?
	{
	cuda->checkResize();
	cuda->uploadAll();
	}

	//check domain changes
	if(domain->triclinic) domain->x2lamda(atom->nlocal);
	MYDBG( printf("# CUDA VerletCuda::iterate: neighbor pbc\n"); )
	domain->pbc();
	if(domain->box_change)
	{
	domain->reset_box();
	comm->setup();
	if (neighbor->style) neighbor->setup_bins();

	}
	timer->stamp();
	MYDBG( printf("# CUDA VerletCuda::iterate: neighbor exchange\n"); )

	//perform exchange of local atoms
	clock_gettime(CLOCK_REALTIME,&starttime);
	comm->exchange();
	clock_gettime(CLOCK_REALTIME,&endtime);

	//special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future
	if(comm->nprocs>1)
	{
	clock_gettime(CLOCK_REALTIME,&starttime);
	if(atom->special)
	cuda->cu_special->upload();
	if(atom->nspecial)
	cuda->cu_nspecial->upload();
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.test1+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	}

	cuda->shared_data.cuda_timings.comm_exchange_total+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

	if(nlocalold!=cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal=2;

	//sort atoms
	if (sortflag && ntimestep >= atom->nextsort) atom->sort();
	MYDBG( printf("# CUDA VerletCuda::iterate: neighbor borders\n"); )

	//generate ghost atom lists, and transfer ghost atom data
	clock_gettime(CLOCK_REALTIME,&starttime);
	comm->borders();
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.comm_border_total+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

	clock_gettime(CLOCK_REALTIME,&starttime);
	//atom index maps are generated on CPU, and need to be transfered to GPU if they are used
	if(cuda->cu_map_array)
	cuda->cu_map_array->upload();


	if(domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost);

	if(n_pre_neighbor) modify->pre_neighbor();

	cuda->shared_data.buffer_new=2;
	if(atom->molecular) cuda->cu_molecule->download();
	MYDBG( printf("# CUDA VerletCuda::iterate: neighbor build\n"); )
	timer->stamp(TIME_COMM);
	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.test2+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;

	//rebuild neighbor list
	test_atom(testatom,"Pre Neighbor");
	neighbor->build();
	timer->stamp(TIME_NEIGHBOR);
	MYDBG( printf("# CUDA VerletCuda::iterate: neighbor done\n"); )

	//if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU
	if(cuda->shared_data.pair.collect_forces_later)
	{
	if(cuda->cu_molecule) cuda->cu_molecule->download();
	cuda->cu_tag->download();
	cuda->cu_type->download();
	cuda->cu_mask->download();
	if(cuda->cu_q) cuda->cu_q->download();
	}
	cuda->shared_data.comm.comm_phase=3;
	}

	test_atom(testatom,"Post Exchange");

	// force computations

	//only do force_clear if it has not been done during overlap of communication with local interactions
	if(not((not (eflag\|\|vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)))
	force_clear();

	if(n_pre_force) modify->pre_force(vflag);

	timer->stamp();

	//if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions
	/* if(cuda->shared_data.pair.collect_forces_later)
	{
	cuda->cu_x->downloadAsync(2);
	cuda->cu_f->downloadAsync(2);
	}*/

	if(force->pair)
	{
	if((not (eflag\|\|vflag))&&(cuda->shared_data.overlap_comm)&&(cuda->shared_data.comm.comm_phase<3)&&cuda->shared_data.pair.cudable_force)
	{
	//second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now
	//regenerate data layout for force computations, its actually only needed for the ghost atoms
	cuda->shared_data.comm.comm_phase=2;

	timespec atime1,atime2;
	clock_gettime(CLOCK_REALTIME,&atime1);

	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

	clock_gettime(CLOCK_REALTIME,&atime2);
	cuda->shared_data.cuda_timings.pair_xtype_conversion+=
	atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
	force->pair->compute(eflag, vflag);

	}
	else
	{
	//calculate complete pair interactions
	if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
	else
	{
	//regenerate data layout for force computations, its actually only needed for the ghost atoms
	timespec atime1,atime2;
	clock_gettime(CLOCK_REALTIME,&atime1);

	Cuda_Pair_GenerateXType(&cuda->shared_data);
	if(cuda->cu_v_radius)
	Cuda_Pair_GenerateVRadius(&cuda->shared_data);
	if(cuda->cu_omega_rmass)
	Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);

	clock_gettime(CLOCK_REALTIME,&atime2);
	cuda->shared_data.cuda_timings.pair_xtype_conversion+=
	atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
	}
	cuda->shared_data.comm.comm_phase=0;
	force->pair->compute(eflag, vflag);
	}

	if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll();

	//wait for force calculation in case of not using overlap with bonded interactions
	if(not cuda->shared_data.pair.collect_forces_later)
	CudaWrapper_Sync();

	timer->stamp(TIME_PAIR);
	}

	//calculate bonded interactions
	if(atom->molecular)
	{
	cuda->cu_x->downloadAsync(2);
	if(n_pre_force==0) Verlet::force_clear();
	else cuda->cu_f->downloadAsync(2);

	timer->stamp(TIME_PAIR);

	test_atom(testatom,"pre bond force");
	if(force->bond) force->bond->compute(eflag, vflag);
	if(force->angle) force->angle->compute(eflag, vflag);
	if(force->dihedral) force->dihedral->compute(eflag, vflag);
	if(force->improper) force->improper->compute(eflag, vflag);
	timer->stamp(TIME_BOND);
	}

	//collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
	if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&(not (force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
	{
	clock_gettime(CLOCK_REALTIME,&starttime);
	cuda->cu_f->uploadAsync(2);

	test_atom(testatom,"post molecular force");


	if(eflag) cuda->cu_eng_vdwl->upload();
	if(eflag) cuda->cu_eng_coul->upload();
	if(vflag) cuda->cu_virial->upload();
	Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	if(eflag) cuda->cu_eng_vdwl->download();
	if(eflag) cuda->cu_eng_coul->download();
	if(vflag) cuda->cu_virial->download();
	timer->stamp(TIME_PAIR);

	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.pair_force_collection+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	}

	//compute kspace force
	if(force->kspace)
	{
	if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
	cuda->downloadAll();
	if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular))
	{
	cuda->cu_x->downloadAsync(2);
	if(n_pre_force==0) Verlet::force_clear();
	else cuda->cu_f->downloadAsync(2);

	timer->stamp(TIME_PAIR);
	}

	force->kspace->compute(eflag,vflag);
	if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
	cuda->uploadAll();
	timer->stamp(TIME_KSPACE);
	}

	//collect forces in case pair forces and kspace was overlaped
	if(cuda->shared_data.pair.collect_forces_later&&cuda->shared_data.pair.cudable_force&&((force->kspace&&(not cuda->shared_data.pppm.cudable_force))))
	{
	cuda->cu_f->uploadAsync(2);

	clock_gettime(CLOCK_REALTIME,&starttime);

	if(eflag) cuda->cu_eng_vdwl->upload();
	if(eflag) cuda->cu_eng_coul->upload();
	if(vflag) cuda->cu_virial->upload();
	Cuda_Pair_CollectForces(&cuda->shared_data,eflag,vflag);
	if(eflag) cuda->cu_eng_vdwl->download();
	if(eflag) cuda->cu_eng_coul->download();
	if(vflag) cuda->cu_virial->download();
	timer->stamp(TIME_PAIR);

	clock_gettime(CLOCK_REALTIME,&endtime);
	cuda->shared_data.cuda_timings.pair_force_collection+=
	endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
	}

	//send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN
	if(force->newton)
	{
	comm->reverse_comm();
	timer->stamp(TIME_COMM);
	}
	test_atom(testatom,"post force");
	// force modifications, final time integration, diagnostics

	if(n_post_force) modify->post_force(vflag);

	test_atom(testatom,"pre final");

	modify->final_integrate();

	test_atom(testatom,"post final");

	if(n_end_of_step) modify->end_of_step();

	// all output

	test_atom(testatom,"pre output");

	if(ntimestep == output->next)
	{
	if(not output->thermo->cudable)
	cuda->downloadAll();
	timer->stamp();
	output->write(ntimestep);
	timer->stamp(TIME_OUTPUT);
	}


	test_atom(testatom,"post output");

	if(cuda->shared_data.atom.update_nlocal>0)
	cuda->shared_data.atom.update_nlocal--;
	if(cuda->shared_data.atom.update_nmax>0)
	cuda->shared_data.atom.update_nmax--;
	if(cuda->shared_data.atom.update_neigh>0)
	cuda->shared_data.atom.update_neigh--;
	if(cuda->shared_data.domain.update>0)
	cuda->shared_data.domain.update--;
	if(cuda->shared_data.buffer_new>0)
	cuda->shared_data.buffer_new--;
	cuda->shared_data.atom.reneigh_flag=0;
	}


	cuda->downloadAll();
	cuda->downloadAllNeighborLists();
	cuda->shared_data.atom.update_nlocal=1;
	cuda->shared_data.atom.update_nmax=1;
	cuda->shared_data.atom.update_neigh=1;
	cuda->shared_data.buffer_new=1;
	cuda->shared_data.domain.update=1;
	cuda->oncpu = true;
	cuda->finished_run = true;
	}


	/* ----------------------------------------------------------------------
	clear force on own & ghost atoms
	setup and clear other arrays as needed
	------------------------------------------------------------------------- */

	void VerletCuda::force_clear()
	{
	cuda->cu_f->memset_device(0);
	if(cuda->cu_torque) cuda->cu_torque->memset_device(0);
	return;

	//The rest should not be necessary
	int i;
	for(i=0;i<atom->nlocal;i++)
	{
	atom->f[i][0]=0.0;
	atom->f[i][1]=0.0;
	atom->f[i][2]=0.0;
	}
	// clear force on all particles
	// if either newton flag is set, also include ghosts

	if (neighbor->includegroup == 0) {
	int nall;
	if (force->newton) nall = atom->nlocal + atom->nghost;
	else nall = atom->nlocal;
	if (torqueflag) {
	double **torque = atom->torque;
	for (i = 0; i < nall; i++) {
	torque[i][0] = 0.0;
	torque[i][1] = 0.0;
	torque[i][2] = 0.0;
	}
	}

	// neighbor includegroup flag is set
	// clear force only on initial nfirst particles
	// if either newton flag is set, also include ghosts

	} else {
	int nall = atom->nfirst;


	if (torqueflag) {
	double **torque = atom->torque;
	for (i = 0; i < nall; i++) {
	torque[i][0] = 0.0;
	torque[i][1] = 0.0;
	torque[i][2] = 0.0;
	}
	}

	if (force->newton) {
	nall = atom->nlocal + atom->nghost;

	if (torqueflag) {
	double **torque = atom->torque;
	for (i = atom->nlocal; i < nall; i++) {
	torque[i][0] = 0.0;
	torque[i][1] = 0.0;
	torque[i][2] = 0.0;
	}
	}
	}
	}
	}

	void VerletCuda::test_atom(int aatom, char* string) //printing properties of one atom for test purposes
	{
	if(not dotestatom) return;
	bool check=false;
	if(cuda->finished_setup) cuda->downloadAll();
	for(int i=0;i<atom->nlocal+atom->nghost;i++)
	{
	if((atom->tag[i]==aatom)&&(i<atom->nlocal))
	{

	printf("%i # CUDA %s: %i %i %e %e %e %i ",comm->me,string,update->ntimestep,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
	if(atom->molecular && (i<atom->nlocal))
	{
	printf(" // %i %i %i ",atom->num_bond[i],atom->num_angle[i],atom->num_dihedral[i]);
	for(int k=0;k<atom->num_bond[i];k++)
	printf("// %i %i ",atom->bond_type[i][k],atom->bond_atom[i][k]);
	}
	printf("\n");
	}
	if(i<atom->nlocal)
	{
	if((atom->v[i][0]<-100\|\|atom->v[i][0]>100)\|\|
	(atom->v[i][1]<-100\|\|atom->v[i][1]>100)\|\|
	(atom->v[i][2]<-100\|\|atom->v[i][2]>100)\|\|
	(atom->v[i][0]!=atom->v[i][0])\|\|
	(atom->v[i][1]!=atom->v[i][1])\|\|
	(atom->v[i][2]!=atom->v[i][2]))
	{printf("%i # CUDA %s velocity: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;}
	if((atom->f[i][0]<-10000\|\|atom->f[i][0]>10000)\|\|
	(atom->f[i][1]<-10000\|\|atom->f[i][1]>10000)\|\|
	(atom->f[i][2]<-10000\|\|atom->f[i][2]>10000)\|\|
	(atom->f[i][0]!=atom->f[i][0])\|\|
	(atom->f[i][1]!=atom->f[i][1])\|\|
	(atom->f[i][2]!=atom->f[i][2]))
	{printf("%i # CUDA %s force: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i); check=true;}
	if(atom->tag[i]<=0)
	printf("%i # CUDA %s tag: %i %e %e %e %i\n",comm->me,string,atom->tag[i],atom->x[i][0],atom->v[i][0],atom->f[i][0],i);
	}
	}
	if(check) exit(0);
	}

verlet_cuda.cppNo OneTemporaryActions

File Metadata

verlet_cuda.cppView Options

Event Timeline

verlet_cuda.cpp
No OneTemporary
Actions

verlet_cuda.cpp
View Options