comm_cuda.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Jul 4, 22:21

comm_cuda.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing author (triclinic) : Pieter in 't Veld (SNL)
	------------------------------------------------------------------------- */

	#include "mpi.h"
	#include <cmath>
	#include <cstring>
	#include <cstdio>
	#include <cstdlib>
	#include "comm_cuda.h"
	#include "atom.h"
	#include "atom_vec.h"
	#include "force.h"
	#include "pair.h"
	#include "domain.h"
	#include "neighbor.h"
	#include "modify.h"
	#include "fix.h"
	#include "group.h"
	#include "compute.h"
	#include "cuda.h"
	#include "error.h"
	#include "memory.h"
	#include "comm_cuda_cu.h"

	using namespace LAMMPS_NS;

	#define BUFFACTOR 1.5
	#define BUFMIN 1000
	#define BUFEXTRA 1000



	#define BIG 1.0e20

	enum{SINGLE,MULTI};

	/* ----------------------------------------------------------------------
	setup MPI and allocate buffer space
	------------------------------------------------------------------------- */

	CommCuda::CommCuda(LAMMPS *lmp):Comm(lmp)
	{
	cuda = lmp->cuda;
	if(cuda == NULL)
	error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");

	cu_pbc=NULL;
	cu_slablo=NULL;
	cu_slabhi=NULL;
	cu_multilo=NULL;
	cu_multihi=NULL;
	cu_sendlist=NULL;


	memory->sfree(buf_send);
	memory->sfree(buf_recv);
	buf_send = NULL;
	buf_recv = NULL;

	Comm::free_swap();
	allocate_swap(maxswap);
	}

	/* ---------------------------------------------------------------------- */

	CommCuda::~CommCuda()
	{
	delete cu_sendlist;
	if(cuda->pinned)
	{
	CudaWrapper_FreePinnedHostData((void*)buf_send);
	CudaWrapper_FreePinnedHostData((void*)buf_recv);
	}
	else
	{
	memory->sfree(buf_send);
	memory->sfree(buf_recv);
	}
	buf_send=NULL;
	buf_recv=NULL;
	}

	/* ---------------------------------------------------------------------- */

	void CommCuda::init()
	{
	int factor = 1;
	if(cuda->shared_data.overlap_comm) factor=maxswap;
	if(not buf_send)
	grow_send(maxsend,0);
	if(not buf_recv)
	grow_recv(maxrecv);
	if(not cu_sendlist)
	{
	cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,maxswap,BUFMIN);
	cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
	cuda->shared_data.comm.maxswap=maxswap;
	cuda->shared_data.comm.maxlistlength=BUFMIN;
	cu_sendlist->upload();
	}
	delete cu_pbc;
	cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,cuda->shared_data.comm.maxswap,6);
	cu_pbc->upload();

	delete cu_slablo;
	cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
	cu_slablo->upload();

	delete cu_slabhi;
	cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
	cu_slabhi->upload();

	cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
	cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
	cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();

	Comm::init();
	}

	/* ----------------------------------------------------------------------
	setup spatial-decomposition communication patterns
	function of neighbor cutoff(s) & cutghostuser & current box size
	single style sets slab boundaries (slablo,slabhi) based on max cutoff
	multi style sets type-dependent slab boundaries (multilo,multihi)
	------------------------------------------------------------------------- */

	void CommCuda::setup()
	{
	if(cuda->shared_data.pair.neighall) cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser);
	Comm::setup();

	//upload changed geometry to device
	if(style == SINGLE)
	{
	if(cu_slablo) cu_slablo->upload();
	if(cu_slabhi) cu_slabhi->upload();
	}
	else
	{
	if(cu_multilo) cu_multilo->upload();
	if(cu_multihi) cu_multihi->upload();
	}
	}

	/* ----------------------------------------------------------------------
	forward communication of atom coords every timestep
	other per-atom attributes may also be sent via pack/unpack routines
	------------------------------------------------------------------------- */

	void CommCuda::forward_comm(int mode)
	{
	if(mode==0) return forward_comm_cuda();
	if(mode==1) return forward_comm_pack_cuda();
	if(mode==2) return forward_comm_transfer_cuda();
	if(mode==3) return forward_comm_unpack_cuda();
	}


	void CommCuda::forward_comm_cuda()
	{
	static int count=0;
	static double kerneltime=0.0;
	static double copytime=0.0;
	timespec time1,time2,time3;

	int n;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	double **x = atom->x;

	cuda->shared_data.domain.xy=domain->xy;
	cuda->shared_data.domain.xz=domain->xz;
	cuda->shared_data.domain.yz=domain->yz;
	cuda->shared_data.domain.prd[0]=domain->prd[0];
	cuda->shared_data.domain.prd[1]=domain->prd[1];
	cuda->shared_data.domain.prd[2]=domain->prd[2];
	cuda->shared_data.domain.triclinic=domain->triclinic;
	if(not comm_x_only && not avec->cudable)
	{
	cuda->downloadAll();
	Comm::forward_comm();
	cuda->uploadAll();
	return;
	}

	// exchange data with another proc
	// if other proc is self, just copy
	// if comm_x_only set, exchange or copy directly to x, don't unpack

	for (int iswap = 0; iswap < nswap; iswap++) {
	if (sendproc[iswap] != me)
	{
	if (comm_x_only)
	{

	int size_forward_recv_now=0;

	if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
	size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
	else
	size_forward_recv_now=size_forward_recv[iswap];
	clock_gettime(CLOCK_REALTIME,&time1);

	MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
	recvproc[iswap],0,world,&request);
	n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) buf_send,pbc[iswap],pbc_flag[iswap]);

	clock_gettime(CLOCK_REALTIME,&time2);

	if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
	n=(n+1)*sizeof(X_FLOAT)/sizeof(double);

	//printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now,n);
	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);

	clock_gettime(CLOCK_REALTIME,&time3);
	cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
	time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
	cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
	time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;

	Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)buf_recv,iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]

	}
	else if (ghost_velocity)
	{
	MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
	recvproc[iswap],0,world,&request);

	if(avec->cudable)
	n = avec->pack_comm_vel(sendnum[iswap],&iswap,
	buf_send,pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);

	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
	}
	else
	{
	MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
	recvproc[iswap],0,world,&request);

	if(avec->cudable)
	n = avec->pack_comm(sendnum[iswap],&iswap,
	buf_send,pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);

	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
	}

	}
	else //sendproc == me
	{
	cuda->self_comm=1;
	if (comm_x_only)
	{
	if (sendnum[iswap])
	{
	n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
	if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
	if((sizeof(X_FLOAT)!=sizeof(double)) && n)
	n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
	}
	}
	else if (ghost_velocity)
	{
	n = avec->pack_comm_vel(sendnum[iswap],&iswap,
	(double*) firstrecv,pbc_flag[iswap],pbc[iswap]);
	//avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],(double*) firstrecv);
	}
	else
	{
	n = avec->pack_comm(sendnum[iswap],&iswap,
	(double*) firstrecv,pbc_flag[iswap],pbc[iswap]);
	//avec->unpack_comm(recvnum[iswap],firstrecv[iswap],(double*) firstrecv);
	}
	cuda->self_comm=0;
	}
	}
	}

	void CommCuda::forward_comm_pack_cuda()
	{
	static int count=0;
	static double kerneltime=0.0;
	static double copytime=0.0;
	timespec time1,time2,time3;
	int n; // initialize comm buffers & exchange memory

	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	double **x = atom->x;

	cuda->shared_data.domain.xy=domain->xy;
	cuda->shared_data.domain.xz=domain->xz;
	cuda->shared_data.domain.yz=domain->yz;
	cuda->shared_data.domain.prd[0]=domain->prd[0];
	cuda->shared_data.domain.prd[1]=domain->prd[1];
	cuda->shared_data.domain.prd[2]=domain->prd[2];
	cuda->shared_data.domain.triclinic=domain->triclinic;
	if(not comm_x_only && not avec->cudable) cuda->downloadAll(); //if not comm_x_only the communication routine of the atom_vec style class is used

	// exchange data with another proc
	// if other proc is self, just copy
	// if comm_x_only set, exchange or copy directly to x, don't unpack

	for (int iswap = 0; iswap < nswap; iswap++) {
	if (sendproc[iswap] != me)
	{
	if (comm_x_only)
	{


	clock_gettime(CLOCK_REALTIME,&time1);

	// n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) cuda->shared_data.comm.buf_send[iswap],pbc[iswap],pbc_flag[iswap]);
	n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*)buf_send,pbc[iswap],pbc_flag[iswap]);

	clock_gettime(CLOCK_REALTIME,&time2);

	if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
	n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
	cuda->shared_data.comm.send_size[iswap]=n;
	}
	else if (ghost_velocity)
	{
	clock_gettime(CLOCK_REALTIME,&time1);

	// n = Cuda_CommCuda_PackComm_Vel(&cuda->shared_data,sendnum[iswap],iswap,(void) &buf_send[iswapmaxsend],pbc[iswap],pbc_flag[iswap]);

	clock_gettime(CLOCK_REALTIME,&time2);

	if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
	n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
	cuda->shared_data.comm.send_size[iswap]=n;
	}
	else
	{
	MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
	recvproc[iswap],0,world,&request);

	if(avec->cudable)
	n = avec->pack_comm(sendnum[iswap],&iswap,
	cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);

	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
	}

	}
	else //sendproc == me
	{
	if (comm_x_only)
	{
	if (sendnum[iswap])
	{
	n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
	if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
	if((sizeof(X_FLOAT)!=sizeof(double)) && n)
	n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
	}
	}
	else if (ghost_velocity)
	{
	n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);
	avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send);
	}
	else
	{
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
	}
	}
	}
	if(not comm_x_only && not avec->cudable) cuda->uploadAll();
	}

	void CommCuda::forward_comm_transfer_cuda()
	{
	static int count=0;
	static double kerneltime=0.0;
	static double copytime=0.0;
	timespec time1,time2,time3;
	int n;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	double **x = atom->x;
	cuda->shared_data.domain.xy=domain->xy;
	cuda->shared_data.domain.xz=domain->xz;
	cuda->shared_data.domain.yz=domain->yz;
	cuda->shared_data.domain.prd[0]=domain->prd[0];
	cuda->shared_data.domain.prd[1]=domain->prd[1];
	cuda->shared_data.domain.prd[2]=domain->prd[2];
	cuda->shared_data.domain.triclinic=domain->triclinic;
	if(not comm_x_only && not avec->cudable) cuda->downloadAll(); //if not comm_x_only the communication routine of the atom_vec style class is used
	//printf("A\n");
	// exchange data with another proc
	// if other proc is self, just copy
	// if comm_x_only set, exchange or copy directly to x, don't unpack

	for (int iswap = 0; iswap < nswap; iswap++) {
	if (sendproc[iswap] != me)
	{
	if (comm_x_only)
	{

	int size_forward_recv_now=0;

	if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
	size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
	else
	size_forward_recv_now=size_forward_recv[iswap];

	//printf("A: %i \n",size_forward_recv_now/1024*4);
	//MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
	// recvproc[iswap],0,world,&request);
	MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
	recvproc[iswap],0,world,&request);
	//printf("%p %p %i\n",buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double));
	//memcpy(buf_send,cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap]*sizeof(double));
	// CudaWrapper_SyncStream(1);
	//printf("B: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
	CudaWrapper_DownloadCudaDataAsync((void) buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]sizeof(double),2);
	//MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
	clock_gettime(CLOCK_REALTIME,&time1);
	CudaWrapper_SyncStream(2);
	//printf("C: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
	clock_gettime(CLOCK_REALTIME,&time2);
	cuda->shared_data.cuda_timings.comm_forward_download+=
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
	MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	//printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
	CudaWrapper_UploadCudaDataAsync((void) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_nowsizeof(double),2);
	clock_gettime(CLOCK_REALTIME,&time1);
	CudaWrapper_SyncStream(2);
	//printf("E: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
	//memcpy(cuda->shared_data.comm.buf_recv[iswap],buf_recv,size_forward_recv_now*sizeof(double));
	//printf("RecvSize: %i SendSize: %i\n",size_forward_recv_nowsizeof(double),cuda->shared_data.comm.send_size[iswap]sizeof(double));
	clock_gettime(CLOCK_REALTIME,&time3);
	cuda->shared_data.cuda_timings.comm_forward_upload+=
	time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
	cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
	time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;
	clock_gettime(CLOCK_REALTIME,&time3);
	cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
	time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
	}
	else if (ghost_velocity)
	{
	/* int size_forward_recv_now=0;

	if((sizeof(X_FLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
	size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
	else
	size_forward_recv_now=size_forward_recv[iswap];

	clock_gettime(CLOCK_REALTIME,&time1);

	MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
	recvproc[iswap],0,world,&request);

	clock_gettime(CLOCK_REALTIME,&time2);

	MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);

	clock_gettime(CLOCK_REALTIME,&time3);
	cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
	time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
	cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
	time3.tv_sec-time2.tv_sec+1.0(time3.tv_nsec-time2.tv_nsec)/1000000000;/

	}
	else
	{
	MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
	recvproc[iswap],0,world,&request);

	if(avec->cudable)
	n = avec->pack_comm(sendnum[iswap],&iswap,
	buf_send,pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);

	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
	}

	}
	else //sendproc == me
	{
	if (comm_x_only)
	{
	if (sendnum[iswap])
	{
	}
	}
	else if (ghost_velocity)
	{
	}
	else
	{
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
	}
	}
	}
	if(not comm_x_only && not avec->cudable) cuda->uploadAll();
	}

	void CommCuda::forward_comm_unpack_cuda()
	{
	static int count=0;
	static double kerneltime=0.0;
	static double copytime=0.0;
	timespec time1,time2,time3;
	int n;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	double **x = atom->x;

	cuda->shared_data.domain.xy=domain->xy;
	cuda->shared_data.domain.xz=domain->xz;
	cuda->shared_data.domain.yz=domain->yz;
	cuda->shared_data.domain.prd[0]=domain->prd[0];
	cuda->shared_data.domain.prd[1]=domain->prd[1];
	cuda->shared_data.domain.prd[2]=domain->prd[2];
	cuda->shared_data.domain.triclinic=domain->triclinic;
	if(not comm_x_only && not avec->cudable) cuda->downloadAll(); //if not comm_x_only the communication routine of the atom_vec style class is used

	// exchange data with another proc
	// if other proc is self, just copy
	// if comm_x_only set, exchange or copy directly to x, don't unpack

	for (int iswap = 0; iswap < nswap; iswap++) {
	if (sendproc[iswap] != me)
	{
	if (comm_x_only)
	{

	//Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],cuda->shared_data.comm.buf_recv[iswap],iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
	Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],buf_recv,iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]

	}
	else if (ghost_velocity)
	{
	//Cuda_CommCuda_UnpackComm_Vel(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void)&buf_recv[iswapmaxrecv]); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
	}
	else
	{
	MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
	recvproc[iswap],0,world,&request);

	if(avec->cudable)
	n = avec->pack_comm(sendnum[iswap],&iswap,
	buf_send,pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);

	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
	}

	}
	else //sendproc == me
	{
	if (comm_x_only)
	{
	if (sendnum[iswap])
	{
	}
	}
	else if (ghost_velocity)
	{
	}
	else
	{
	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);
	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
	}
	}
	}
	if(not comm_x_only && not avec->cudable) cuda->uploadAll();
	}

	void CommCuda::forward_comm_pair(Pair *pair)
	{
	if(not cuda->shared_data.pair.cudable_force)
	{
	return Comm::forward_comm_pair(pair);
	}

	int iswap,n;
	double *buf;
	MPI_Request request;
	MPI_Status status;

	for (iswap = 0; iswap < nswap; iswap++) {

	// pack buffer

	n = pair->pack_comm(sendnum[iswap],&iswap,
	buf_send,pbc_flag[iswap],pbc[iswap]);
	int nrecv = recvnum[iswap]*n;
	if(nrecv<0) nrecv=-(nrecv+1)/2;
	int nsend = sendnum[iswap]*n;
	if(nsend<0) nsend=-(nsend+1)/2;

	// exchange with another proc
	// if self, set recv buffer to send buffer

	if (sendproc[iswap] != me) {
	MPI_Irecv(buf_recv,nrecv,MPI_DOUBLE,recvproc[iswap],0,
	world,&request);
	MPI_Send(buf_send,nsend,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	buf = buf_recv;
	} else buf = buf_send;

	// unpack buffer

	pair->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
	}
	}

	/* ----------------------------------------------------------------------
	reverse communication of forces on atoms every timestep
	other per-atom attributes may also be sent via pack/unpack routines
	------------------------------------------------------------------------- */

	void CommCuda::reverse_comm()
	{
	int n;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	double **f = atom->f;
	double *buf;

	if(not comm_f_only && not avec->cudable) cuda->downloadAll(); //not yet implemented in CUDA but only needed for non standard atom styles

	// exchange data with another proc
	// if other proc is self, just copy
	// if comm_f_only set, exchange or copy directly from f, don't pack

	for (int iswap = nswap-1; iswap >= 0; iswap--) {
	if (sendproc[iswap] != me) {
	if (comm_f_only) {

	int size_recv_now=size_reverse_recv[iswap];
	if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
	size_recv_now=(size_recv_now+1)*sizeof(F_FLOAT)/sizeof(double);
	MPI_Irecv(buf_recv,size_recv_now,MPI_DOUBLE,
	sendproc[iswap],0,world,&request);

	buf=buf_send;
	if (size_reverse_send[iswap])
	{
	Cuda_CommCuda_PackReverse(&cuda->shared_data,size_reverse_send[iswap]/3,firstrecv[iswap],buf);
	}
	else buf=NULL;
	int size_reverse_send_now=size_reverse_send[iswap];
	if((sizeof(F_FLOAT)!=sizeof(double))&& size_reverse_send[iswap])
	size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_FLOAT)/sizeof(double);
	MPI_Send(buf,size_reverse_send_now,MPI_DOUBLE,
	recvproc[iswap],0,world);
	MPI_Wait(&request,&status);
	Cuda_CommCuda_UnpackReverse(&cuda->shared_data,sendnum[iswap],iswap,buf_recv);

	} else {
	MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
	sendproc[iswap],0,world,&request);
	n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
	MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
	MPI_Wait(&request,&status);

	avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv);
	}

	} else {
	if (comm_f_only) {
	if (sendnum[iswap])
	Cuda_CommCuda_UnpackReverse_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap]);
	} else {
	n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
	avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_send);
	}
	}
	}
	if(not comm_f_only && not avec->cudable) cuda->uploadAll(); //not yet implemented in CUDA but only needed for non standard atom styles
	}

	/* ----------------------------------------------------------------------
	exchange: move atoms to correct processors
	atoms exchanged with all 6 stencil neighbors
	send out atoms that have left my box, receive ones entering my box
	atoms will be lost if not inside some proc's box
	can happen if atom moves outside of non-periodic bounary
	or if atom moves more than one proc away
	this routine called before every reneighboring
	for triclinic, atoms must be in lamda coords (0-1) before exchange is called
	------------------------------------------------------------------------- */

	void CommCuda::exchange()
	{
	AtomVec *avec = atom->avec;

	if(not cuda->oncpu && avec->cudable)
	return exchange_cuda();

	if(not cuda->oncpu) cuda->downloadAll();

	Comm::exchange();
	}


	void CommCuda::exchange_cuda()
	{
	int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal;
	double lo,hi,value;
	double **x;
	double sublo,subhi,*buf;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	timespec time1,time2,time3;

	// clear global->local map for owned and ghost atoms
	// b/c atoms migrate to new procs in exchange() and
	// new ghosts are created in borders()
	// map_set() is done at end of borders()


	if(map_style) cuda->cu_tag->download();

	if (map_style) atom->map_clear();

	// subbox bounds for orthogonal or triclinic

	if (triclinic == 0) {
	sublo = domain->sublo;
	subhi = domain->subhi;
	} else {
	sublo = domain->sublo_lamda;
	subhi = domain->subhi_lamda;
	}

	// loop over dimensions

	for (int dim = 0; dim < 3; dim++) {
	// fill buffer with atoms leaving my box, using < and >=
	// when atom is deleted, fill it in with last atom

	cuda->shared_data.exchange_dim=dim;

	nlocal = atom->nlocal;
	avec->maxsend=&maxsend;
	nsend=avec->pack_exchange(dim,(double*) &buf_send);
	nlocal = atom->nlocal;


	atom->nlocal = nlocal;

	// send/recv atoms in both directions
	// if 1 proc in dimension, no send/recv, set recv buf to send buf
	// if 2 procs in dimension, single send/recv
	// if more than 2 procs in dimension, send/recv to both neighbors

	clock_gettime(CLOCK_REALTIME,&time1);

	if (procgrid[dim] == 1) {
	nrecv = nsend;
	buf = buf_send;

	} else {
	MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
	&nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status);
	nrecv = nrecv1;
	if (procgrid[dim] > 2) {
	MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
	&nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status);
	nrecv += nrecv2;
	}
	if (nrecv+1 > maxrecv) grow_recv(nrecv+1);

	MPI_Irecv(buf_recv,nrecv1,MPI_DOUBLE,procneigh[dim][1],0,
	world,&request);
	MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][0],0,world);
	MPI_Wait(&request,&status);

	if (procgrid[dim] > 2) {
	MPI_Irecv(&buf_recv[nrecv1],nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
	world,&request);
	MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][1],0,world);
	MPI_Wait(&request,&status);

	if((nrecv1==0)\|\|(nrecv2==0)) buf_recv[nrecv]=0;
	}

	buf = buf_recv;
	}
	//printf("nsend: %i nrecv: %i\n",nsend,nrecv);
	// check incoming atoms to see if they are in my box
	// if so, add to my list
	clock_gettime(CLOCK_REALTIME,&time2);
	cuda->shared_data.cuda_timings.comm_exchange_mpi+=
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

	if(nrecv)
	{
	avec->maxsend=&maxsend;
	avec->unpack_exchange(buf);
	}
	}

	if(atom->firstgroupname) cuda->downloadAll();

	if(atom->firstgroupname) atom->first_reorder();

	if(atom->firstgroupname) cuda->uploadAll();
	}

	/* ----------------------------------------------------------------------
	borders: list nearby atoms to send to neighboring procs at every timestep
	one list is created for every swap that will be made
	as list is made, actually do swaps
	this does equivalent of a communicate (so don't need to explicitly
	call communicate routine on reneighboring timestep)
	this routine is called before every reneighboring
	for triclinic, atoms must be in lamda coords (0-1) before borders is called
	------------------------------------------------------------------------- */


	void CommCuda::borders()
	{
	AtomVec *avec = atom->avec;
	if(not cuda->oncpu && avec->cudable)
	{
	if(cuda->shared_data.overlap_comm&&cuda->finished_setup)
	borders_cuda_overlap_forward_comm();
	else
	borders_cuda();

	return;
	}

	Comm::borders();

	cuda->setSystemParams();
	if(cuda->finished_setup) {cuda->checkResize(); cuda->uploadAll();}
	cuda->shared_data.atom.nghost=atom->nghost;
	cu_sendlist->upload();
	}

	void CommCuda::borders_cuda()
	{
	int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax;
	int nsend,nrecv,nfirst,nlast,ngroup;
	double lo,hi;
	int *type;
	double **x;
	double buf,mlo,*mhi;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	timespec time1,time2,time3;

	// clear old ghosts

	atom->nghost = 0;

	// do swaps over all 3 dimensions

	iswap = 0;
	smax = rmax = 0;

	cuda->shared_data.comm.nsend=0;
	for (dim = 0; dim < 3; dim++) {
	nlast = 0;
	maxneed = 2*need[dim];
	for (ineed = 0; ineed < maxneed; ineed++) {

	// find atoms within slab boundaries lo/hi using <= and >=
	// check atoms between nfirst and nlast
	// for first swaps in a dim, check owned and ghost
	// for later swaps in a dim, only check newly arrived ghosts
	// store sent atom indices in list for use in future timesteps

	x = atom->x;
	if (style == SINGLE) {
	lo = slablo[iswap];
	hi = slabhi[iswap];
	} else {
	type = atom->type;
	mlo = multilo[iswap];
	mhi = multihi[iswap];
	}
	if (ineed % 2 == 0) {
	nfirst = nlast;
	nlast = atom->nlocal + atom->nghost;
	}

	nsend = 0;

	// find send atoms according to SINGLE vs MULTI
	// all atoms eligible versus atoms in bordergroup
	// only need to limit loop to bordergroup for first sends (ineed < 2)
	// on these sends, break loop in two: owned (in group) and ghost
	do
	{
	if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast <int> (nsend*1.05));
	nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap);
	}while(nsend>=maxsendlist[iswap]);
	// pack up list of border atoms

	if (nsend*size_border > maxsend)
	grow_send(nsend*size_border,0);

	if (ghost_velocity)
	n = avec->pack_border_vel(nsend,&iswap,buf_send,
	pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_border(nsend,&iswap,buf_send,
	pbc_flag[iswap],pbc[iswap]);

	// swap atoms with other proc
	// put incoming ghosts at end of my atom arrays
	// if swapping with self, simply copy, no messages

	clock_gettime(CLOCK_REALTIME,&time1);
	if (sendproc[iswap] != me) {
	MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
	&nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
	if (nrecv*size_border > maxrecv)
	grow_recv(nrecv*size_border);
	MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
	recvproc[iswap],0,world,&request);
	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	buf = buf_recv;
	} else {
	nrecv = nsend;
	buf = buf_send;
	}

	clock_gettime(CLOCK_REALTIME,&time2);
	cuda->shared_data.cuda_timings.comm_border_mpi+=
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

	// unpack buffer

	if (ghost_velocity)
	avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
	else
	avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf);

	// set all pointers & counters

	smax = MAX(smax,nsend);
	rmax = MAX(rmax,nrecv);
	sendnum[iswap] = nsend;
	recvnum[iswap] = nrecv;
	size_forward_recv[iswap] = nrecv*size_forward;
	size_reverse_send[iswap] = nrecv*size_reverse;
	size_reverse_recv[iswap] = nsend*size_reverse;
	firstrecv[iswap] = atom->nlocal + atom->nghost;
	atom->nghost += nrecv;
	iswap++;
	}
	}

	// insure send/recv buffers are long enough for all forward & reverse comm

	int max = MAX(maxforwardsmax,maxreversermax);
	if (max > maxsend) grow_send(max,0);
	max = MAX(maxforwardrmax,maxreversesmax);
	if (max > maxrecv) grow_recv(max);

	// reset global->local map
	if(map_style)
	{
	cuda->cu_tag->download();
	atom->map_set();
	}

	cuda->setSystemParams();
	cuda->shared_data.atom.nghost+=n;
	}

	void CommCuda::borders_cuda_overlap_forward_comm()
	{
	int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax;
	int nsend,nrecv,nfirst,nlast,ngroup;
	double lo,hi;
	int *type;
	double **x;
	double buf,mlo,*mhi;
	MPI_Request request;
	MPI_Status status;
	AtomVec *avec = atom->avec;
	timespec time1,time2,time3;

	// clear old ghosts

	atom->nghost = 0;

	// do swaps over all 3 dimensions

	iswap = 0;
	smax = rmax = 0;

	cuda->shared_data.comm.nsend=0;
	for (dim = 0; dim < 3; dim++) {
	nlast = 0;
	maxneed = 2*need[dim];
	for (ineed = 0; ineed < maxneed; ineed++) {

	// find atoms within slab boundaries lo/hi using <= and >=
	// check atoms between nfirst and nlast
	// for first swaps in a dim, check owned and ghost
	// for later swaps in a dim, only check newly arrived ghosts
	// store sent atom indices in list for use in future timesteps

	x = atom->x;
	if (style == SINGLE) {
	lo = slablo[iswap];
	hi = slabhi[iswap];
	} else {
	type = atom->type;
	mlo = multilo[iswap];
	mhi = multihi[iswap];
	}
	if (ineed % 2 == 0) {
	nfirst = nlast;
	nlast = atom->nlocal + atom->nghost;
	}

	nsend = 0;

	// find send atoms according to SINGLE vs MULTI
	// all atoms eligible versus atoms in bordergroup
	// only need to limit loop to bordergroup for first sends (ineed < 2)
	// on these sends, break loop in two: owned (in group) and ghost
	do
	{
	if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast <int> (nsend*1.05));
	nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap);
	}while(nsend>=maxsendlist[iswap]);
	cuda->shared_data.comm.nsend_swap[iswap]=nsend;
	// pack up list of border atoms

	if (nsend*size_border > maxsend)
	grow_send(nsend*size_border,0);

	if (ghost_velocity)
	n = avec->pack_border_vel(nsend,&iswap,buf_send,
	pbc_flag[iswap],pbc[iswap]);
	else
	n = avec->pack_border(nsend,&iswap,buf_send,
	pbc_flag[iswap],pbc[iswap]);

	// swap atoms with other proc
	// put incoming ghosts at end of my atom arrays
	// if swapping with self, simply copy, no messages

	clock_gettime(CLOCK_REALTIME,&time1);
	if (sendproc[iswap] != me) {
	MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
	&nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
	if (nrecv*size_border > maxrecv)
	grow_recv(nrecv*size_border);
	MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
	recvproc[iswap],0,world,&request);
	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	buf = buf_recv;
	} else {
	nrecv = nsend;
	buf = buf_send;
	}

	clock_gettime(CLOCK_REALTIME,&time2);
	cuda->shared_data.cuda_timings.comm_border_mpi+=
	time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

	// unpack buffer

	if (ghost_velocity)
	avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
	else
	avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf);

	// set all pointers & counters

	smax = MAX(smax,nsend);
	rmax = MAX(rmax,nrecv);
	sendnum[iswap] = nsend;
	recvnum[iswap] = nrecv;
	size_forward_recv[iswap] = nrecv*size_forward;
	size_reverse_send[iswap] = nrecv*size_reverse;
	size_reverse_recv[iswap] = nsend*size_reverse;
	firstrecv[iswap] = atom->nlocal + atom->nghost;
	atom->nghost += nrecv;
	iswap++;
	}
	}

	// insure send/recv buffers are long enough for all forward & reverse comm

	int max = MAX(maxforwardsmax,maxreversermax);
	if (max > maxsend) grow_send(max,0);
	max = MAX(maxforwardrmax,maxreversesmax);
	if (max > maxrecv) grow_recv(max);

	// reset global->local map
	if(map_style)
	{
	cuda->cu_tag->download();
	atom->map_set();
	}

	cuda->setSystemParams();
	cuda->shared_data.atom.nghost+=n;
	}




	void CommCuda::forward_comm_fix(Fix *fix)
	{
	int iswap,n;
	double *buf;
	MPI_Request request;
	MPI_Status status;

	for (iswap = 0; iswap < nswap; iswap++) {
	// pack buffer
	if(fix->cudable_comm&&cuda->finished_setup)
	{
	int swap=iswap;
	if(sendproc[iswap] == me) {swap=-iswap-1; buf=(double*)&(firstrecv[iswap]);}
	else buf=buf_send;

	n = fix->pack_comm(sendnum[iswap],&swap,
	buf,pbc_flag[iswap],pbc[iswap]);
	if(sendproc[iswap] == me)
	{
	continue;
	}
	}
	else
	n = fix->pack_comm(sendnum[iswap],sendlist[iswap],
	buf_send,pbc_flag[iswap],pbc[iswap]);

	// exchange with another proc
	// if self, set recv buffer to send buffer

	if (sendproc[iswap] != me) {
	MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
	world,&request);
	MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
	MPI_Wait(&request,&status);
	buf = buf_recv;
	} else buf = buf_send;

	// unpack buffer

	fix->unpack_comm(recvnum[iswap],firstrecv[iswap],buf);
	}
	}


	void CommCuda::grow_send(int n, int flag)
	{
	int oldmaxsend = (maxsend+BUFEXTRA)*sizeof(double);
	maxsend = static_cast<int> (BUFFACTOR * n);
	if (flag){
	if(cuda->pinned)
	{
	double* tmp = new double[oldmaxsend];
	memcpy((void) tmp,(void) buf_send,oldmaxsend*sizeof(double));
	if(buf_send) CudaWrapper_FreePinnedHostData((void*) (buf_send));
	buf_send = (double) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)sizeof(double),false);
	memcpy(buf_send,tmp,oldmaxsend*sizeof(double));
	delete [] tmp;
	}
	else
	{
	buf_send = (double *)
	memory->srealloc(buf_send,(maxsend+BUFEXTRA)*sizeof(double),
	"comm:buf_send");printf("srealloc\n");
	}
	}
	else {
	if(cuda->pinned)
	{
	if(buf_send) CudaWrapper_FreePinnedHostData((void*) buf_send);
	buf_send = (double) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)sizeof(double),false);
	}
	else
	{
	memory->sfree(buf_send);
	buf_send = (double ) memory->smalloc((maxsend+BUFEXTRA)sizeof(double),
	"comm:buf_send");
	}
	for(int i=0;i<maxswap;i++)
	{
	if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],oldmaxsend);
	cuda->shared_data.comm.buf_send_dev[i]=CudaWrapper_AllocCudaData((maxsend+BUFEXTRA)*sizeof(double));
	}
	}
	}
	/* ----------------------------------------------------------------------
	free/malloc the size of the recv buffer as needed with BUFFACTOR
	------------------------------------------------------------------------- */


	void CommCuda::grow_recv(int n)
	{
	int oldmaxrecv = maxrecv*sizeof(double);
	maxrecv = static_cast<int> (BUFFACTOR * n);
	if(cuda->pinned)
	{
	if(buf_recv) CudaWrapper_FreePinnedHostData((void*)buf_recv);
	buf_recv = (double) CudaWrapper_AllocPinnedHostData(maxrecvsizeof(double), false,true);
	}
	else
	{
	memory->sfree(buf_recv);
	buf_recv = (double ) memory->smalloc(maxrecvsizeof(double),
	"comm:buf_recv");
	}
	for(int i=0;i<maxswap;i++)
	{
	if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],oldmaxrecv);
	cuda->shared_data.comm.buf_recv_dev[i]=CudaWrapper_AllocCudaData((maxrecv)*sizeof(double));
	}
	}

	/* ----------------------------------------------------------------------
	realloc the size of the iswap sendlist as needed with BUFFACTOR
	------------------------------------------------------------------------- */

	void CommCuda::grow_list(int iswap, int n)
	{

	MYDBG(printf(" # CUDA CommCuda::grow_list\n");)
	if(cuda->finished_setup&&cu_sendlist) cu_sendlist->download();
	if(!cu_sendlist\|\|nBUFFACTOR>cu_sendlist->get_dim()[1]\|\|nBUFFACTOR>maxsendlist[iswap])
	{
	for(int i=0;i<maxswap;i++)
	{
	maxsendlist[i] = static_cast<int> (BUFFACTOR * n);
	sendlist[i] = (int *)
	memory->srealloc(sendlist[i],maxsendlist[i]*sizeof(int),
	"comm:sendlist[iswap]");
	}
	delete cu_sendlist;
	cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,maxswap,maxsendlist[iswap]);
	cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
	cuda->shared_data.comm.maxlistlength=maxsendlist[iswap];
	cu_sendlist->upload();
	}
	}

	/* ----------------------------------------------------------------------
	realloc the buffers needed for swaps
	------------------------------------------------------------------------- */

	void CommCuda::grow_swap(int n)
	{
	int oldmaxswap=maxswap;
	Comm::grow_swap(n);
	if(n>cu_sendlist->get_dim()[0])
	{
	MYDBG(printf(" # CUDA CommCuda::grow_swap\n");)

	delete cu_sendlist;
	cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,n,BUFMIN);
	cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
	cuda->shared_data.comm.maxlistlength=BUFMIN;
	cuda->shared_data.comm.maxswap=n;
	cuda->shared_data.comm.nsend_swap=new int[n];
	cuda->shared_data.comm.send_size=new int[n];
	cuda->shared_data.comm.recv_size=new int[n];
	}
	for(int i=0;i<oldmaxswap;i++)
	{
	if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double));
	if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double));
	cuda->shared_data.comm.buf_recv_dev[i]=NULL;
	cuda->shared_data.comm.buf_send_dev[i]=NULL;
	}
	cuda->shared_data.comm.buf_send= new double*[n];
	cuda->shared_data.comm.buf_recv= new double*[n];
	cuda->shared_data.comm.buf_send_dev= new void*[n];
	cuda->shared_data.comm.buf_recv_dev= new void*[n];
	for(int i=0;i<n;i++)
	{
	cuda->shared_data.comm.buf_recv[i]=NULL;
	cuda->shared_data.comm.buf_send[i]=NULL;
	cuda->shared_data.comm.buf_recv_dev[i]=NULL;
	cuda->shared_data.comm.buf_send_dev[i]=NULL;
	}
	grow_send(maxsend,0);
	grow_recv(maxrecv);

	maxswap=n;
	}

	/* ----------------------------------------------------------------------
	allocation of swap info
	------------------------------------------------------------------------- */

	void CommCuda::allocate_swap(int n)
	{
	Comm::allocate_swap(n);

	delete cu_pbc;
	delete cu_slablo;
	delete cu_slabhi;

	cuda->shared_data.comm.maxswap=n;
	if(cu_sendlist)
	{
	cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,n,6);
	cu_slablo = new cCudaData<double, X_FLOAT,x>(slablo,n);
	cu_slabhi = new cCudaData<double, X_FLOAT,x>(slabhi,n);

	cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
	cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
	cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
	}
	cuda->shared_data.comm.nsend_swap=new int[n];
	cuda->shared_data.comm.send_size=new int[n];
	cuda->shared_data.comm.recv_size=new int[n];
	cuda->shared_data.comm.buf_send= new double*[n];
	cuda->shared_data.comm.buf_recv= new double*[n];
	cuda->shared_data.comm.buf_send_dev= new void*[n];
	cuda->shared_data.comm.buf_recv_dev= new void*[n];
	for(int i=0;i<n;i++) cuda->shared_data.comm.buf_send_dev[i]=NULL;
	for(int i=0;i<n;i++) cuda->shared_data.comm.buf_recv_dev[i]=NULL;
	}


	/* ----------------------------------------------------------------------
	allocation of multi-type swap info
	------------------------------------------------------------------------- */

	void CommCuda::allocate_multi(int n)
	{
	Comm::allocate_multi(n);

	delete cu_multilo;
	delete cu_multihi;
	cu_multilo = new cCudaData<double, X_FLOAT,xy>(slablo,n,atom->ntypes+1);
	cu_multihi = new cCudaData<double, X_FLOAT,xy>(slabhi,n,atom->ntypes+1);

	cuda->shared_data.comm.multilo.dev_data=cu_multilo->dev_data();
	cuda->shared_data.comm.multihi.dev_data=cu_multihi->dev_data();
	}

	/* ----------------------------------------------------------------------
	free memory for swaps
	------------------------------------------------------------------------- */

	void CommCuda::free_swap()
	{

	Comm::free_swap();

	delete cuda->shared_data.comm.nsend_swap; cuda->shared_data.comm.nsend_swap=NULL;
	delete cu_pbc; cu_pbc = NULL;
	delete cu_slablo; cu_slablo = NULL;
	delete cu_slabhi; cu_slabhi = NULL;
	for(int i=0;i<maxswap;i++)
	{
	if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double));
	if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double));
	}


	}

	/* ----------------------------------------------------------------------
	free memory for multi-type swaps
	------------------------------------------------------------------------- */

	void CommCuda::free_multi()
	{
	Comm::free_multi();
	delete cu_multilo; cu_multilo = NULL;
	delete cu_multihi; cu_multihi = NULL;
	}

comm_cuda.cppNo OneTemporaryActions

File Metadata

comm_cuda.cppView Options

Event Timeline

comm_cuda.cpp
No OneTemporary
Actions

comm_cuda.cpp
View Options