diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 3276d0cdb..553434134 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -1,1103 +1,1102 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

#include "comm_kokkos.h"
#include "kokkos.h"
#include "atom.h"
#include "atom_kokkos.h"
#include "atom_vec.h"
#include "atom_vec_kokkos.h"
#include "domain.h"
#include "atom_masks.h"
#include "error.h"
#include "memory.h"
#include "force.h"
#include "pair.h"
#include "fix.h"
#include "compute.h"
#include "dump.h"
#include "output.h"
#include "modify.h"

using namespace LAMMPS_NS;

#define BUFFACTOR 1.5
#define BUFMIN 10000
#define BUFEXTRA 1000

enum{SINGLE,MULTI};

/* ----------------------------------------------------------------------
   setup MPI and allocate buffer space
------------------------------------------------------------------------- */

CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
{
  if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
  memory->sfree(sendlist);
  sendlist = NULL;
  k_sendlist = DAT::tdual_int_2d();
  k_total_send = DAT::tdual_int_scalar("comm::k_total_send");

  // error check for disallow of OpenMP threads?

  // initialize comm buffers & exchange memory

  memory->destroy(buf_send);
  buf_send = NULL;
  memory->destroy(buf_recv);
  buf_recv = NULL;

  k_exchange_sendlist = DAT::tdual_int_1d("comm:k_exchange_sendlist",100);
  k_exchange_copylist = DAT::tdual_int_1d("comm:k_exchange_copylist",100);
  k_count = DAT::tdual_int_scalar("comm:k_count");
  k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);

  memory->destroy(maxsendlist);
  maxsendlist = NULL;
  memory->create(maxsendlist,maxswap,"comm:maxsendlist");
  for (int i = 0; i < maxswap; i++) {
    maxsendlist[i] = BUFMIN;
  }
  memory->create_kokkos(k_sendlist,sendlist,maxswap,BUFMIN,"comm:sendlist");

  max_buf_pair = 0;
  k_buf_send_pair = DAT::tdual_xfloat_1d("comm:k_buf_send_pair",1);
  k_buf_recv_pair = DAT::tdual_xfloat_1d("comm:k_recv_send_pair",1);
}

/* ---------------------------------------------------------------------- */

CommKokkos::~CommKokkos()
{
  memory->destroy_kokkos(k_sendlist,sendlist);
  sendlist = NULL;
  memory->destroy_kokkos(k_buf_send,buf_send);
  buf_send = NULL;
  memory->destroy_kokkos(k_buf_recv,buf_recv);
  buf_recv = NULL;
}

/* ---------------------------------------------------------------------- */

void CommKokkos::init()
{
  maxsend = BUFMIN;
  maxrecv = BUFMIN;

  grow_send_kokkos(maxsend+bufextra,0,Host);
  grow_recv_kokkos(maxrecv,Host);

  atomKK = (AtomKokkos *) atom;
  exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
  forward_comm_classic = lmp->kokkos->forward_comm_classic;
  reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
  exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
  forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
  reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host;

  CommBrick::init();

  int check_forward = 0;
  int check_reverse = 0;
  if (force->pair && (force->pair->execution_space == Host))
    check_forward += force->pair->comm_forward;
  if (force->pair && (force->pair->execution_space == Host))
    check_reverse += force->pair->comm_reverse;

  for (int i = 0; i < modify->nfix; i++) {
    check_forward += modify->fix[i]->comm_forward;
    check_reverse += modify->fix[i]->comm_reverse;
  }

  for (int i = 0; i < modify->ncompute; i++) {
    check_forward += modify->compute[i]->comm_forward;
    check_reverse += modify->compute[i]->comm_reverse;
  }

  for (int i = 0; i < output->ndump; i++) {
    check_forward += output->dump[i]->comm_forward;
    check_reverse += output->dump[i]->comm_reverse;
  }

  if (force->newton == 0) check_reverse = 0;
  if (force->pair) check_reverse += force->pair->comm_reverse_off;

-  //if (check_forward)
-  //  forward_comm_classic = true;
+  if (ghost_velocity)
+    forward_comm_classic = true;

-  //if (check_reverse || !comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
  if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
    reverse_comm_classic = true;
}

/* ----------------------------------------------------------------------
   forward communication of atom coords every timestep
   other per-atom attributes may also be sent via pack/unpack routines
------------------------------------------------------------------------- */

void CommKokkos::forward_comm(int dummy)
{
  if (!forward_comm_classic) {
    if (forward_comm_on_host) forward_comm_device<LMPHostType>(dummy);
    else forward_comm_device<LMPDeviceType>(dummy);
    return;
  }

  k_sendlist.sync<LMPHostType>();

  if (comm_x_only) {
    atomKK->sync(Host,X_MASK);
    atomKK->modified(Host,X_MASK);
  } else if (ghost_velocity) {
    atomKK->sync(Host,X_MASK | V_MASK);
    atomKK->modified(Host,X_MASK | V_MASK);
  } else {
    atomKK->sync(Host,ALL_MASK);
    atomKK->modified(Host,ALL_MASK);
  }

  CommBrick::forward_comm(dummy);
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void CommKokkos::forward_comm_device(int dummy)
{
  int n;
  MPI_Request request;
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
  double *buf;

  // exchange data with another proc
  // if other proc is self, just copy
  // if comm_x_only set, exchange or copy directly to x, don't unpack

  k_sendlist.sync<DeviceType>();
  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);

  for (int iswap = 0; iswap < nswap; iswap++) {
    if (sendproc[iswap] != me) {
      if (comm_x_only) {
        if (size_forward_recv[iswap]) {
          buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
            firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
          MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        }
        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
                                   iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
        if (n) {
          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
                   n,MPI_DOUBLE,sendproc[iswap],0,world);
        }
        if (size_forward_recv[iswap]) {
          MPI_Wait(&request,MPI_STATUS_IGNORE);
          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
        }
      } else if (ghost_velocity) {
        error->all(FLERR,"Ghost velocity forward comm not yet "
                   "implemented with Kokkos");
        if (size_forward_recv[iswap])
          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
                    size_forward_recv[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
                                buf_send,pbc_flag[iswap],pbc[iswap]);
        if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
        avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
      } else {
        if (size_forward_recv[iswap])
          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
                    size_forward_recv[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        n
= avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap, k_buf_send,pbc_flag[iswap],pbc[iswap]); if (n) MPI_Send(k_buf_send.view().ptr_on_device(),n, MPI_DOUBLE,sendproc[iswap],0,world); if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv); } } else { if (!ghost_velocity) { if (sendnum[iswap]) n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap, firstrecv[iswap],pbc_flag[iswap],pbc[iswap]); } else if (ghost_velocity) { error->all(FLERR,"Ghost velocity forward comm not yet " "implemented with Kokkos"); n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap], buf_send,pbc_flag[iswap],pbc[iswap]); avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send); } } } } /* ---------------------------------------------------------------------- reverse communication of forces on atoms every timestep other per-atom attributes may also be sent via pack/unpack routines ------------------------------------------------------------------------- */ void CommKokkos::reverse_comm() { if (!reverse_comm_classic) { if (reverse_comm_on_host) reverse_comm_device(); else reverse_comm_device(); return; } k_sendlist.sync(); if (comm_f_only) atomKK->sync(Host,F_MASK); else atomKK->sync(Host,ALL_MASK); CommBrick::reverse_comm(); if (comm_f_only) atomKK->modified(Host,F_MASK); else atomKK->modified(Host,ALL_MASK); //atomKK->sync(Device,ALL_MASK); // is this needed? } template void CommKokkos::reverse_comm_device() { int n; MPI_Request request; AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; double *buf; // exchange data with another proc // if other proc is self, just copy // if comm_f_only set, exchange or copy directly from f, don't pack k_sendlist.sync(); atomKK->sync(ExecutionSpaceFromDevice::space,F_MASK); for (int iswap = nswap-1; iswap >= 0; iswap--) { if (sendproc[iswap] != me) { if (comm_f_only) { if (size_reverse_recv[iswap]) MPI_Irecv(k_buf_recv.view().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); if (size_reverse_send[iswap]) { buf = atomKK->k_f.view().ptr_on_device() + firstrecv[iswap]*atomKK->k_f.view().dimension_1(); MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, recvproc[iswap],0,world); } if (size_reverse_recv[iswap]) { MPI_Wait(&request,MPI_STATUS_IGNORE); atomKK->modified(ExecutionSpaceFromDevice:: space,F_MASK); } } else { if (size_reverse_recv[iswap]) MPI_Irecv(k_buf_recv.view().ptr_on_device(), size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send); if (n) MPI_Send(k_buf_send.view().ptr_on_device(),n, MPI_DOUBLE,recvproc[iswap],0,world); if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); } avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap, k_buf_recv); } else { if (sendnum[iswap]) n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap, firstrecv[iswap]); } } } /* ---------------------------------------------------------------------- */ void CommKokkos::forward_comm_fix(Fix *fix, int size) { k_sendlist.sync(); CommBrick::forward_comm_fix(fix,size); } void CommKokkos::reverse_comm_fix(Fix *fix, int size) { k_sendlist.sync(); CommBrick::reverse_comm_fix(fix, size); } void CommKokkos::forward_comm_compute(Compute *compute) { k_sendlist.sync(); CommBrick::forward_comm_compute(compute); } void CommKokkos::reverse_comm_compute(Compute *compute) { k_sendlist.sync(); CommBrick::reverse_comm_compute(compute); } void 
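
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the message pattern used
// by forward_comm_device() and reverse_comm_device() above. The receive is
// posted first so that packing overlaps with the incoming message; when
// comm_x_only / comm_f_only is set, recv_ptr points directly into the x or f
// array and no separate unpack is needed. All names below are stand-ins.

#include <mpi.h>

void swap_sketch(double *recv_ptr, int nrecv, int recvproc,
                 double *send_ptr, int nsend, int sendproc, MPI_Comm world)
{
  MPI_Request request;
  if (nrecv) MPI_Irecv(recv_ptr,nrecv,MPI_DOUBLE,recvproc,0,world,&request);
  // ... pack send_ptr here (pack_comm_kokkos / pack_reverse_kokkos) ...
  if (nsend) MPI_Send(send_ptr,nsend,MPI_DOUBLE,sendproc,0,world);
  if (nrecv) MPI_Wait(&request,MPI_STATUS_IGNORE);
  // ... unpack recv_ptr here unless it already aliases x or f ...
}
// ---------------------------------------------------------------------------
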
CommKokkos::forward_comm_pair(Pair *pair) { if (pair->execution_space == Host) { k_sendlist.sync(); CommBrick::forward_comm_pair(pair); } else if (pair->execution_space == Device) { k_sendlist.sync(); forward_comm_pair_device(pair); } } template void CommKokkos::forward_comm_pair_device(Pair *pair) { int iswap,n; MPI_Request request; int nsize = pair->comm_forward; for (iswap = 0; iswap < nswap; iswap++) { int n = MAX(max_buf_pair,nsize*sendnum[iswap]); n = MAX(n,nsize*recvnum[iswap]); if (n > max_buf_pair) grow_buf_pair(n); } for (iswap = 0; iswap < nswap; iswap++) { // pack buffer n = pair->pack_forward_comm_kokkos(sendnum[iswap],k_sendlist, iswap,k_buf_send_pair,pbc_flag[iswap],pbc[iswap]); // exchange with another proc // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { if (recvnum[iswap]) MPI_Irecv(k_buf_recv_pair.view().ptr_on_device(),nsize*recvnum[iswap],MPI_DOUBLE, recvproc[iswap],0,world,&request); if (sendnum[iswap]) MPI_Send(k_buf_send_pair.view().ptr_on_device(),n,MPI_DOUBLE,sendproc[iswap],0,world); if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); } else k_buf_recv_pair = k_buf_send_pair; // unpack buffer pair->unpack_forward_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv_pair); } } void CommKokkos::grow_buf_pair(int n) { max_buf_pair = n * BUFFACTOR; k_buf_send_pair.resize(max_buf_pair); k_buf_recv_pair.resize(max_buf_pair); } void CommKokkos::reverse_comm_pair(Pair *pair) { k_sendlist.sync(); CommBrick::reverse_comm_pair(pair); } void CommKokkos::forward_comm_dump(Dump *dump) { k_sendlist.sync(); CommBrick::forward_comm_dump(dump); } void CommKokkos::reverse_comm_dump(Dump *dump) { k_sendlist.sync(); CommBrick::reverse_comm_dump(dump); } /* ---------------------------------------------------------------------- exchange: move atoms to correct processors atoms exchanged with all 6 stencil neighbors send out atoms that have left my box, receive ones entering my box atoms will be lost if not inside some proc's box can happen if atom moves outside of non-periodic bounary or if atom moves more than one proc away this routine called before every reneighboring for triclinic, atoms must be in lamda coords (0-1) before exchange is called ------------------------------------------------------------------------- */ void CommKokkos::exchange() { if(atom->nextra_grow + atom->nextra_border) { if(!exchange_comm_classic) { static int print = 1; if(print && comm->me==0) { error->warning(FLERR,"Fixes cannot yet send data in Kokkos communication, " "switching to classic communication"); } print = 0; exchange_comm_classic = true; } } if (!exchange_comm_classic) { if (exchange_comm_on_host) exchange_device(); else exchange_device(); return; } atomKK->sync(Host,ALL_MASK); atomKK->modified(Host,ALL_MASK); CommBrick::exchange(); } /* ---------------------------------------------------------------------- */ template struct BuildExchangeListFunctor { typedef DeviceType device_type; typedef ArrayTypes AT; X_FLOAT _lo,_hi; typename AT::t_x_array _x; int _nlocal,_dim; typename AT::t_int_scalar _nsend; typename AT::t_int_1d _sendlist; typename AT::t_int_1d _sendflag; BuildExchangeListFunctor( const typename AT::tdual_x_array x, const typename AT::tdual_int_1d sendlist, typename AT::tdual_int_scalar nsend, typename AT::tdual_int_1d sendflag,int nlocal, int dim, X_FLOAT lo, X_FLOAT hi): _x(x.template view()), _sendlist(sendlist.template view()), _nsend(nsend.template view()), _sendflag(sendflag.template view()), _nlocal(nlocal),_dim(dim), _lo(lo),_hi(hi){ } 
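
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the pre-sizing loop in
// forward_comm_pair_device() above. The pair buffers are grown once, to the
// largest per-swap send or receive size, before any packing happens; the
// caller then applies the BUFFACTOR-style over-allocation in grow_buf_pair().
// The function name is hypothetical.

#include <algorithm>

int required_pair_buf(int nswap, int nsize, const int *sendnum, const int *recvnum)
{
  int n = 0;
  for (int iswap = 0; iswap < nswap; iswap++) {
    n = std::max(n, nsize*sendnum[iswap]);
    n = std::max(n, nsize*recvnum[iswap]);
  }
  return n;   // grow k_buf_send_pair / k_buf_recv_pair if n > max_buf_pair
}
// ---------------------------------------------------------------------------
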
KOKKOS_INLINE_FUNCTION void operator() (int i) const { if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) { const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1); if(mysend<_sendlist.dimension_0()) { _sendlist(mysend) = i; _sendflag(i) = 1; } } else _sendflag(i) = 0; } }; /* ---------------------------------------------------------------------- */ template void CommKokkos::exchange_device() { int i,nsend,nrecv,nrecv1,nrecv2,nlocal; double lo,hi; double **x; double *sublo,*subhi; MPI_Request request; AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; // clear global->local map for owned and ghost atoms // b/c atoms migrate to new procs in exchange() and // new ghosts are created in borders() // map_set() is done at end of borders() // clear ghost count and any ghost bonus data internal to AtomVec if (map_style) atom->map_clear(); atom->nghost = 0; atom->avec->clear_bonus(); // subbox bounds for orthogonal or triclinic if (triclinic == 0) { sublo = domain->sublo; subhi = domain->subhi; } else { sublo = domain->sublo_lamda; subhi = domain->subhi_lamda; } atomKK->sync(ExecutionSpaceFromDevice::space,ALL_MASK); // loop over dimensions for (int dim = 0; dim < 3; dim++) { // fill buffer with atoms leaving my box, using < and >= // when atom is deleted, fill it in with last atom x = atom->x; lo = sublo[dim]; hi = subhi[dim]; nlocal = atom->nlocal; i = nsend = 0; if (true) { if (k_sendflag.h_view.dimension_0()(); k_count.h_view() = k_exchange_sendlist.h_view.dimension_0(); while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { k_count.h_view() = 0; k_count.modify(); k_count.sync(); BuildExchangeListFunctor f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag, nlocal,dim,lo,hi); Kokkos::parallel_for(nlocal,f); k_exchange_sendlist.modify(); k_sendflag.modify(); k_count.modify(); k_count.sync(); if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) { k_exchange_sendlist.resize(k_count.h_view()*1.1); k_exchange_copylist.resize(k_count.h_view()*1.1); k_count.h_view()=k_exchange_sendlist.h_view.dimension_0(); } } k_exchange_copylist.sync(); k_exchange_sendlist.sync(); k_sendflag.sync(); int sendpos = nlocal-1; nlocal -= k_count.h_view(); for(int i = 0; i < k_count.h_view(); i++) { if (k_exchange_sendlist.h_view(i)(); k_exchange_copylist.sync(); nsend = k_count.h_view(); if (nsend > maxsend) grow_send_kokkos(nsend,1); nsend = avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send, k_exchange_sendlist,k_exchange_copylist, ExecutionSpaceFromDevice:: space,dim,lo,hi); } else { while (i < nlocal) { if (x[i][dim] < lo || x[i][dim] >= hi) { if (nsend > maxsend) grow_send_kokkos(nsend,1); nsend += avec->pack_exchange(i,&buf_send[nsend]); avec->copy(nlocal-1,i,1); nlocal--; } else i++; } } atom->nlocal = nlocal; // send/recv atoms in both directions // if 1 proc in dimension, no send/recv, set recv buf to send buf // if 2 procs in dimension, single send/recv // if more than 2 procs in dimension, send/recv to both neighbors if (procgrid[dim] == 1) { nrecv = nsend; if (nrecv) { atom->nlocal=avec-> unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi, ExecutionSpaceFromDevice::space); } } else { MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0, &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE); nrecv = nrecv1; if (procgrid[dim] > 2) { MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0, &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE); nrecv += nrecv2; } if (nrecv > maxrecv) grow_recv_kokkos(nrecv); MPI_Irecv(k_buf_recv.view().ptr_on_device(),nrecv1, 
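
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the list-building idiom
// of BuildExchangeListFunctor above. Each thread that finds a departing atom
// reserves a slot with an atomic counter; if the counter overruns the list,
// exchange_device() resizes the list and reruns the functor, as in its while
// loop. Names and view types below are simplified stand-ins.

#include <Kokkos_Core.hpp>

void build_exchange_list_sketch(Kokkos::View<const double*> coord,
                                double lo, double hi,
                                Kokkos::View<int*> list,
                                Kokkos::View<int> count)
{
  const int capacity = (int) list.extent(0);
  Kokkos::parallel_for("build_exchange_list", coord.extent(0),
                       KOKKOS_LAMBDA(const int i) {
    if (coord(i) < lo || coord(i) >= hi) {
      const int slot = Kokkos::atomic_fetch_add(&count(), 1);
      if (slot < capacity) list(slot) = i;  // overflow is detected via count
    }
  });
}
// ---------------------------------------------------------------------------
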
MPI_DOUBLE,procneigh[dim][1],0, world,&request); MPI_Send(k_buf_send.view().ptr_on_device(),nsend, MPI_DOUBLE,procneigh[dim][0],0,world); MPI_Wait(&request,MPI_STATUS_IGNORE); if (procgrid[dim] > 2) { MPI_Irecv(k_buf_recv.view().ptr_on_device()+nrecv1, nrecv2,MPI_DOUBLE,procneigh[dim][0],0, world,&request); MPI_Send(k_buf_send.view().ptr_on_device(),nsend, MPI_DOUBLE,procneigh[dim][1],0,world); MPI_Wait(&request,MPI_STATUS_IGNORE); } if (nrecv) { atom->nlocal = avec-> unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi, ExecutionSpaceFromDevice::space); } } // check incoming atoms to see if they are in my box // if so, add to my list } atomKK->modified(ExecutionSpaceFromDevice::space,ALL_MASK); if (atom->firstgroupname) { /* this is not yet implemented with Kokkos */ atomKK->sync(Host,ALL_MASK); atom->first_reorder(); atomKK->modified(Host,ALL_MASK); } } /* ---------------------------------------------------------------------- borders: list nearby atoms to send to neighboring procs at every timestep one list is created for every swap that will be made as list is made, actually do swaps this does equivalent of a communicate, so don't need to explicitly call communicate routine on reneighboring timestep this routine is called before every reneighboring for triclinic, atoms must be in lamda coords (0-1) before borders is called ------------------------------------------------------------------------- */ void CommKokkos::borders() { if (!exchange_comm_classic) { static int print = 1; if (style != SINGLE || bordergroup || ghost_velocity) { if (print && comm->me==0) { error->warning(FLERR,"Required border comm not yet implemented in Kokkos communication, " "switching to classic communication"); } print = 0; exchange_comm_classic = true; } } if (!exchange_comm_classic) { if (exchange_comm_on_host) borders_device(); else borders_device(); return; } atomKK->sync(Host,ALL_MASK); k_sendlist.sync(); CommBrick::borders(); k_sendlist.modify(); atomKK->modified(Host,ALL_MASK); } /* ---------------------------------------------------------------------- */ template struct BuildBorderListFunctor { typedef DeviceType device_type; typedef ArrayTypes AT; X_FLOAT lo,hi; typename AT::t_x_array x; int iswap,maxsendlist; int nfirst,nlast,dim; typename AT::t_int_2d sendlist; typename AT::t_int_scalar nsend; BuildBorderListFunctor(typename AT::tdual_x_array _x, typename AT::tdual_int_2d _sendlist, typename AT::tdual_int_scalar _nsend,int _nfirst, int _nlast, int _dim, X_FLOAT _lo, X_FLOAT _hi, int _iswap, int _maxsendlist): x(_x.template view()), sendlist(_sendlist.template view()), nsend(_nsend.template view()), nfirst(_nfirst),nlast(_nlast),dim(_dim), lo(_lo),hi(_hi),iswap(_iswap),maxsendlist(_maxsendlist){} KOKKOS_INLINE_FUNCTION void operator() (typename Kokkos::TeamPolicy::member_type dev) const { const int chunk = ((nlast - nfirst + dev.league_size() - 1 ) / dev.league_size()); const int teamstart = chunk*dev.league_rank() + nfirst; const int teamend = (teamstart + chunk) < nlast?(teamstart + chunk):nlast; int mysend = 0; for (int i=teamstart + dev.team_rank(); i= lo && x(i,dim) <= hi) mysend++; } const int my_store_pos = dev.team_scan(mysend,&nsend()); if (my_store_pos+mysend < maxsendlist) { mysend = my_store_pos; for(int i=teamstart + dev.team_rank(); i= lo && x(i,dim) <= hi) { sendlist(iswap,mysend++) = i; } } } } size_t shmem_size(const int team_size) const { (void) team_size; return 1000u;} }; /* ---------------------------------------------------------------------- */ template void 
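
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the size handshake used
// in exchange_device() above. Before any atom data moves, each proc trades
// send counts with its two neighbors in the current dimension so it knows how
// large the receive buffer must be. The function name is hypothetical.

#include <mpi.h>

int exchange_counts_sketch(int nsend, int proc_lo, int proc_hi,
                           int nprocs_dim, MPI_Comm world)
{
  int nrecv1 = 0, nrecv2 = 0;
  MPI_Sendrecv(&nsend,1,MPI_INT,proc_lo,0,
               &nrecv1,1,MPI_INT,proc_hi,0,world,MPI_STATUS_IGNORE);
  if (nprocs_dim > 2)
    MPI_Sendrecv(&nsend,1,MPI_INT,proc_hi,0,
                 &nrecv2,1,MPI_INT,proc_lo,0,world,MPI_STATUS_IGNORE);
  return nrecv1 + nrecv2;   // grow the recv buffer to at least this many doubles
}
// ---------------------------------------------------------------------------
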
CommKokkos::borders_device() { int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax; int nsend,nrecv,sendflag,nfirst,nlast,ngroup; double lo,hi; int *type; double **x; double *buf,*mlo,*mhi; MPI_Request request; AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; ExecutionSpace exec_space = ExecutionSpaceFromDevice::space; k_sendlist.sync(); atomKK->sync(exec_space,ALL_MASK); // do swaps over all 3 dimensions iswap = 0; smax = rmax = 0; for (dim = 0; dim < 3; dim++) { nlast = 0; twoneed = 2*maxneed[dim]; for (ineed = 0; ineed < twoneed; ineed++) { // find atoms within slab boundaries lo/hi using <= and >= // check atoms between nfirst and nlast // for first swaps in a dim, check owned and ghost // for later swaps in a dim, only check newly arrived ghosts // store sent atom indices in list for use in future timesteps x = atom->x; if (style == SINGLE) { lo = slablo[iswap]; hi = slabhi[iswap]; } else { type = atom->type; mlo = multilo[iswap]; mhi = multihi[iswap]; } if (ineed % 2 == 0) { nfirst = nlast; nlast = atom->nlocal + atom->nghost; } nsend = 0; // sendflag = 0 if I do not send on this swap // sendneed test indicates receiver no longer requires data // e.g. due to non-PBC or non-uniform sub-domains if (ineed/2 >= sendneed[dim][ineed % 2]) sendflag = 0; else sendflag = 1; // find send atoms according to SINGLE vs MULTI // all atoms eligible versus atoms in bordergroup // only need to limit loop to bordergroup for first sends (ineed < 2) // on these sends, break loop in two: owned (in group) and ghost if (sendflag) { if (!bordergroup || ineed >= 2) { if (style == SINGLE) { k_total_send.h_view() = 0; k_total_send.template modify(); k_total_send.template sync(); BuildBorderListFunctor f(atomKK->k_x,k_sendlist, k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); k_total_send.template modify(); k_total_send.template sync(); k_sendlist.modify(); if(k_total_send.h_view() >= maxsendlist[iswap]) { grow_list(iswap,k_total_send.h_view()); k_total_send.h_view() = 0; k_total_send.template modify(); k_total_send.template sync(); BuildBorderListFunctor f(atomKK->k_x,k_sendlist, k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); Kokkos::TeamPolicy config((nlast-nfirst+127)/128,128); Kokkos::parallel_for(config,f); k_total_send.template modify(); k_total_send.template sync(); k_sendlist.modify(); } nsend = k_total_send.h_view(); } else { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); for (i = nfirst; i < nlast; i++) { itype = type[i]; if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); sendlist[iswap][nsend++] = i; } } } } else { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); if (style == SINGLE) { ngroup = atom->nfirst; for (i = 0; i < ngroup; i++) if (x[i][dim] >= lo && x[i][dim] <= hi) { if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); sendlist[iswap][nsend++] = i; } for (i = atom->nlocal; i < nlast; i++) if (x[i][dim] >= lo && x[i][dim] <= hi) { if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); sendlist[iswap][nsend++] = i; } } else { ngroup = atom->nfirst; for (i = 0; i < ngroup; i++) { itype = type[i]; if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); sendlist[iswap][nsend++] = i; } } for (i = atom->nlocal; i < nlast; i++) { itype = type[i]; if (x[i][dim] >= mlo[itype] && x[i][dim] <= 
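
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the overflow-and-retry
// pattern wrapped around BuildBorderListFunctor in borders_device() above. The
// functor always counts the atoms in the slab but only stores indices that
// fit; if the count reaches the current capacity, the list is grown (grow_list
// applies the BUFFACTOR of 1.5) and the functor is run a second time, which
// then fits. The helper below is hypothetical.

template<class RunPass>
int fill_with_retry(RunPass run_pass, int &capacity)
{
  int count = run_pass(capacity);        // pass 1: count, store what fits
  if (count >= capacity) {
    capacity = (int) (1.5 * count);      // grow_list()-style reallocation
    count = run_pass(capacity);          // pass 2: now everything fits
  }
  return count;
}
// ---------------------------------------------------------------------------
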
mhi[itype]) { if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); sendlist[iswap][nsend++] = i; } } } } } // pack up list of border atoms if (nsend*size_border > maxsend) grow_send_kokkos(nsend*size_border,0); if (ghost_velocity) { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); n = avec->pack_border_vel(nsend,sendlist[iswap],buf_send, pbc_flag[iswap],pbc[iswap]); } else n = avec-> pack_border_kokkos(nsend,k_sendlist,k_buf_send,iswap, pbc_flag[iswap],pbc[iswap],exec_space); // swap atoms with other proc // no MPI calls except SendRecv if nsend/nrecv = 0 // put incoming ghosts at end of my atom arrays // if swapping with self, simply copy, no messages if (sendproc[iswap] != me) { MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, &nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE); if (nrecv*size_border > maxrecv) grow_recv_kokkos(nrecv*size_border); if (nrecv) MPI_Irecv(k_buf_recv.view().ptr_on_device(), nrecv*size_border,MPI_DOUBLE, recvproc[iswap],0,world,&request); if (n) MPI_Send(k_buf_send.view().ptr_on_device(),n, MPI_DOUBLE,sendproc[iswap],0,world); if (nrecv) MPI_Wait(&request,MPI_STATUS_IGNORE); buf = buf_recv; } else { nrecv = nsend; buf = buf_send; } // unpack buffer if (ghost_velocity) { error->all(FLERR,"Required border comm not yet " "implemented with Kokkos"); avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf); } else if (sendproc[iswap] != me) avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, k_buf_recv,exec_space); else avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, k_buf_send,exec_space); // set all pointers & counters smax = MAX(smax,nsend); rmax = MAX(rmax,nrecv); sendnum[iswap] = nsend; recvnum[iswap] = nrecv; size_forward_recv[iswap] = nrecv*size_forward; size_reverse_send[iswap] = nrecv*size_reverse; size_reverse_recv[iswap] = nsend*size_reverse; firstrecv[iswap] = atom->nlocal + atom->nghost; atom->nghost += nrecv; iswap++; } } // insure send/recv buffers are long enough for all forward & reverse comm int max = MAX(maxforward*smax,maxreverse*rmax); if (max > maxsend) grow_send_kokkos(max,0); max = MAX(maxforward*rmax,maxreverse*smax); if (max > maxrecv) grow_recv_kokkos(max); // reset global->local map atomKK->modified(exec_space,ALL_MASK); if (map_style) { atomKK->sync(Host,TAG_MASK); atom->map_set(); } } /* ---------------------------------------------------------------------- realloc the size of the send buffer as needed with BUFFACTOR and bufextra if flag = 1, realloc if flag = 0, don't need to realloc with copy, just free/malloc ------------------------------------------------------------------------- */ void CommKokkos::grow_send(int n, int flag) { grow_send_kokkos(n,flag,Host); } /* ---------------------------------------------------------------------- free/malloc the size of the recv buffer as needed with BUFFACTOR ------------------------------------------------------------------------- */ void CommKokkos::grow_recv(int n) { grow_recv_kokkos(n,Host); } /* ---------------------------------------------------------------------- realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA if flag = 1, realloc if flag = 0, don't need to realloc with copy, just free/malloc ------------------------------------------------------------------------- */ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space) { maxsend = static_cast (BUFFACTOR * n); int maxsend_border = (maxsend+BUFEXTRA+5)/atom->avec->size_border + 2; if (flag) { if(space == Device) 
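
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the sizing arithmetic in
// grow_send_kokkos()/grow_recv_kokkos() above. A flat count of doubles is
// converted into whole rows of width size_border (doubles per border atom),
// with BUFEXTRA slack plus a small safety margin, because the Kokkos buffers
// are 2-D (one row per atom). The function name is hypothetical.

int rows_for_buffer(int n, int size_border)
{
  const double BUFFACTOR_SKETCH = 1.5;
  const int BUFEXTRA_SKETCH = 1000;
  const int maxsend = (int) (BUFFACTOR_SKETCH * n);
  return (maxsend + BUFEXTRA_SKETCH + 5) / size_border + 2;
}
// ---------------------------------------------------------------------------
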
k_buf_send.modify(); else k_buf_send.modify(); k_buf_send.resize(maxsend_border,atom->avec->size_border); buf_send = k_buf_send.view().ptr_on_device(); } else { k_buf_send = DAT:: tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border); buf_send = k_buf_send.view().ptr_on_device(); } } /* ---------------------------------------------------------------------- free/malloc the size of the recv buffer as needed with BUFFACTOR ------------------------------------------------------------------------- */ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space) { maxrecv = static_cast (BUFFACTOR * n); int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2; k_buf_recv = DAT:: tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border); buf_recv = k_buf_recv.view().ptr_on_device(); } /* ---------------------------------------------------------------------- realloc the size of the iswap sendlist as needed with BUFFACTOR ------------------------------------------------------------------------- */ void CommKokkos::grow_list(int iswap, int n) { int size = static_cast (BUFFACTOR * n); if (exchange_comm_classic) { // force realloc on Host k_sendlist.sync(); k_sendlist.modify(); } memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); for(int i=0;i()(i,0); } } /* ---------------------------------------------------------------------- realloc the buffers needed for swaps ------------------------------------------------------------------------- */ void CommKokkos::grow_swap(int n) { free_swap(); allocate_swap(n); if (style == MULTI) { free_multi(); allocate_multi(n); } maxswap = n; int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN); if (exchange_comm_classic) { // force realloc on Host k_sendlist.sync(); k_sendlist.modify(); } memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); memory->grow(maxsendlist,n,"comm:maxsendlist"); for (int i=0;i
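
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): why grow_list() and
// grow_swap() above sync and modify the sendlist on the Host before
// reallocating when classic (host-side) exchange is in use. The host copy is
// first brought up to date and flagged as the most recently written side, so
// the resize keeps that data and the device copy is refreshed on the next
// sync. Names and the modern sync_host()/modify_host() spellings below are
// simplified stand-ins for the older modify<LMPHostType>()/sync<LMPHostType>()
// calls used in this file.

#include <Kokkos_DualView.hpp>

void grow_sendlist_sketch(Kokkos::DualView<int**> &list, int rows, int cols)
{
  list.sync_host();          // pull the latest data to the host copy
  list.modify_host();        // mark the host copy as most recently written
  list.resize(rows, cols);   // resize preserves the flagged side's contents
}
// ---------------------------------------------------------------------------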