diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp index 2b3325659..ba4bc9171 100644 --- a/src/KOKKOS/npair_ssa_kokkos.cpp +++ b/src/KOKKOS/npair_ssa_kokkos.cpp @@ -1,713 +1,704 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: James Larentzos and Timothy I. Mattox (Engility Corporation) ------------------------------------------------------------------------- */ #include "npair_ssa_kokkos.h" #include "neigh_list.h" #include "atom_kokkos.h" #include "atom_masks.h" #include "domain_kokkos.h" #include "neighbor_kokkos.h" #include "nbin_ssa_kokkos.h" #include "nstencil_ssa.h" #include "error.h" #include "comm.h" namespace LAMMPS_NS { /* ---------------------------------------------------------------------- */ template NPairSSAKokkos::NPairSSAKokkos(LAMMPS *lmp) : NPair(lmp), ssa_phaseCt(27), ssa_gphaseCt(7) { const int gphaseLenEstimate = 1; //FIXME make this 4 eventually k_ssa_gphaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen",ssa_gphaseCt); ssa_gphaseLen = k_ssa_gphaseLen.view(); k_ssa_gitemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc",ssa_gphaseCt,gphaseLenEstimate); ssa_gitemLoc = k_ssa_gitemLoc.view(); k_ssa_gitemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen",ssa_gphaseCt,gphaseLenEstimate); ssa_gitemLen = k_ssa_gitemLen.view(); } /* ---------------------------------------------------------------------- copy needed info from Neighbor class to this build class ------------------------------------------------------------------------- */ template void NPairSSAKokkos::copy_neighbor_info() { NPair::copy_neighbor_info(); NeighborKokkos* neighborKK = (NeighborKokkos*) neighbor; // general params k_cutneighsq = neighborKK->k_cutneighsq; // exclusion info k_ex1_type = neighborKK->k_ex1_type; k_ex2_type = neighborKK->k_ex2_type; k_ex_type = neighborKK->k_ex_type; k_ex1_group = neighborKK->k_ex1_group; k_ex2_group = neighborKK->k_ex2_group; k_ex1_bit = neighborKK->k_ex1_bit; k_ex2_bit = neighborKK->k_ex2_bit; k_ex_mol_group = neighborKK->k_ex_mol_group; k_ex_mol_bit = neighborKK->k_ex_mol_bit; } /* ---------------------------------------------------------------------- copy per-atom and per-bin vectors from NBinSSAKokkos class to this build class ------------------------------------------------------------------------- */ template void NPairSSAKokkos::copy_bin_info() { NPair::copy_bin_info(); NBinSSAKokkos* nbKK = dynamic_cast*>(nb); if (!nbKK) error->one(FLERR, "NBin wasn't a NBinSSAKokkos object"); atoms_per_bin = nbKK->atoms_per_bin; k_bincount = nbKK->k_bincount; k_bins = nbKK->k_bins; ghosts_per_gbin = nbKK->ghosts_per_gbin; k_gbincount = nbKK->k_gbincount; k_gbins = nbKK->k_gbins; lbinxlo = nbKK->h_lbinxlo(); lbinxhi = nbKK->h_lbinxhi(); lbinylo = nbKK->h_lbinylo(); lbinyhi = nbKK->h_lbinyhi(); lbinzlo = nbKK->h_lbinzlo(); lbinzhi = nbKK->h_lbinzhi(); } /* ---------------------------------------------------------------------- copy needed info from NStencil class to this build class ------------------------------------------------------------------------- */ template void NPairSSAKokkos::copy_stencil_info() { NPair::copy_stencil_info(); nstencil = ns->nstencil; int maxstencil = ns->get_maxstencil(); k_stencil = DAT::tdual_int_1d("NPairSSAKokkos:stencil",maxstencil); for (int k = 0; k < maxstencil; k++) { k_stencil.h_view(k) = ns->stencil[k]; } k_stencil.modify(); k_stencil.sync(); k_stencilxyz = DAT::tdual_int_1d_3("NPairSSAKokkos:stencilxyz",maxstencil); for (int k = 0; k < maxstencil; k++) { k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0]; k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1]; k_stencilxyz.h_view(k,2) = ns->stencilxyz[k][2]; } k_stencilxyz.modify(); k_stencilxyz.sync(); NStencilSSA *ns_ssa = dynamic_cast(ns); if (!ns_ssa) error->one(FLERR, "NStencil wasn't a NStencilSSA object"); k_nstencil_ssa = DAT::tdual_int_1d("NPairSSAKokkos:nstencil_ssa",5); for (int k = 0; k < 5; ++k) { k_nstencil_ssa.h_view(k) = ns_ssa->nstencil_ssa[k]; } k_nstencil_ssa.modify(); k_nstencil_ssa.sync(); sx1 = ns_ssa->sx + 1; sy1 = ns_ssa->sy + 1; sz1 = ns_ssa->sz + 1; // Setup the phases of the workplan for locals ssa_phaseCt = sz1*sy1*sx1; if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) { k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt); ssa_phaseLen = k_ssa_phaseLen.view(); k_ssa_phaseOff = DAT::tdual_int_1d_3("NPairSSAKokkos:ssa_phaseOff",ssa_phaseCt); ssa_phaseOff = k_ssa_phaseOff.view(); } int workPhase = 0; for (int zoff = sz1 - 1; zoff >= 0; --zoff) { for (int yoff = sy1 - 1; yoff >= 0; --yoff) { for (int xoff = sx1 - 1; xoff >= 0; --xoff) { ssa_phaseOff(workPhase, 0) = xoff; ssa_phaseOff(workPhase, 1) = yoff; ssa_phaseOff(workPhase, 2) = zoff; workPhase++; } } } } /* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION int NPairSSAKokkosExecute::find_special(const int &i, const int &j) const { const int n1 = nspecial(i,0); const int n2 = nspecial(i,1); const int n3 = nspecial(i,2); for (int k = 0; k < n3; k++) { if (special(i,k) == tag(j)) { if (k < n1) { if (special_flag[1] == 0) return -1; else if (special_flag[1] == 1) return 0; else return 1; } else if (k < n2) { if (special_flag[2] == 0) return -1; else if (special_flag[2] == 1) return 0; else return 2; } else { if (special_flag[3] == 0) return -1; else if (special_flag[3] == 1) return 0; else return 3; } } } return 0; }; /* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION int NPairSSAKokkosExecute::exclusion(const int &i,const int &j, const int &itype,const int &jtype) const { int m; if (nex_type && ex_type(itype,jtype)) return 1; if (nex_group) { for (m = 0; m < nex_group; m++) { if (mask(i) & ex1_bit(m) && mask(j) & ex2_bit(m)) return 1; if (mask(i) & ex2_bit(m) && mask(j) & ex1_bit(m)) return 1; } } if (nex_mol) { for (m = 0; m < nex_mol; m++) if (mask(i) & ex_mol_bit(m) && mask(j) & ex_mol_bit(m) && molecule(i) == molecule(j)) return 1; } return 0; } /* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- binned neighbor list construction with full Newton's 3rd law for use by Shardlow Spliting Algorithm each owned atom i checks its own bin and other bins in Newton stencil every pair stored exactly once by some processor ------------------------------------------------------------------------- */ template void NPairSSAKokkos::build(NeighList *list_) { NeighListKokkos* list = (NeighListKokkos*) list_; const int nlocal = includegroup?atom->nfirst:atom->nlocal; int nl_size; int xbinCt = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1; int ybinCt = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1; int zbinCt = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1; int phaseLenEstimate = xbinCt*ybinCt*zbinCt; if ((ssa_phaseCt > (int) k_ssa_itemLoc.dimension_0()) || (phaseLenEstimate > (int) k_ssa_itemLoc.dimension_1())) { k_ssa_itemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLoc",ssa_phaseCt,phaseLenEstimate); ssa_itemLoc = k_ssa_itemLoc.view(); k_ssa_itemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLen",ssa_phaseCt,phaseLenEstimate); ssa_itemLen = k_ssa_itemLen.view(); } { // Preflight the neighbor list workplan const typename ArrayTypes::t_int_1d_const c_bincount = k_bincount.view(); const typename ArrayTypes::t_int_2d_const c_bins = k_bins.view(); const typename ArrayTypes::t_int_1d_const_um c_stencil = k_stencil.view(); const typename ArrayTypes::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view(); int inum = 0; // loop over bins with local atoms, counting half of the neighbors for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { int zoff = ssa_phaseOff(workPhase, 2); int yoff = ssa_phaseOff(workPhase, 1); int xoff = ssa_phaseOff(workPhase, 0); int workItem = 0; for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) { for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) { for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) { int inum_start = inum; // if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small"); for (int subphase = 0; subphase < 4; subphase++) { int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0); int s_xbin = xbin + ((subphase & 0x1) ? sx1 - 1 : 0); if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue; if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue; const int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin; const int ibinCt = c_bincount(ibin); if (ibinCt > 0) { int base_n = 0; bool include_same = false; // count all local atoms in the current stencil "subphase" as potential neighbors for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) { const int jbin = ibin+c_stencil(k); if (jbin != ibin) base_n += c_bincount(jbin); else include_same = true; } // Calculate how many ibin particles would have had some neighbors if (base_n > 0) inum += ibinCt; else if (include_same) inum += ibinCt - 1; } } ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length #ifdef DEBUG_SSA_BUILD_LOCALS if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n" ,comm->me ,workPhase ,workItem ,inum ,inum_start ); #endif workItem++; } } } #ifdef DEBUG_SSA_BUILD_LOCALS fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n" ,comm->me ,workPhase ,inum - ssa_itemLoc(workPhase, 0) ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt ,workItem ,(inum - ssa_itemLoc(workPhase, 0)) / (double) workItem ); #endif // record where workPhase ends ssa_phaseLen(workPhase) = workItem; } #ifdef DEBUG_SSA_BUILD_LOCALS fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n" ,comm->me ,workPhase ,inum ,nlocal*4 ,inum / (double) workPhase ); #endif nl_size = inum; // record how much space is needed for the local work plan } // count how many ghosts might have neighbors, and increase the work plan storage for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) { - nl_size += k_gbincount.h_view(workPhase + 1); + int len = k_gbincount.h_view(workPhase + 1); + ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist + ssa_gitemLen(workPhase,0) = len; + nl_size += len; } list->grow(nl_size); // Make special larger SSA neighbor list NPairSSAKokkosExecute data(*list, k_cutneighsq.view(), k_bincount.view(), k_bins.view(), k_gbincount.view(), k_gbins.view(), lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi, nstencil, sx1, sy1, sz1, k_stencil.view(), k_stencilxyz.view(), k_nstencil_ssa.view(), ssa_phaseCt, k_ssa_phaseLen.view(), k_ssa_phaseOff.view(), k_ssa_itemLoc.view(), k_ssa_itemLen.view(), ssa_gphaseCt, k_ssa_gphaseLen.view(), k_ssa_gitemLoc.view(), k_ssa_gitemLen.view(), nlocal, atomKK->k_x.view(), atomKK->k_type.view(), atomKK->k_mask.view(), atomKK->k_molecule.view(), atomKK->k_tag.view(), atomKK->k_special.view(), atomKK->k_nspecial.view(), atomKK->molecular, nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo, bininvx,bininvy,bininvz, exclude, nex_type, k_ex1_type.view(), k_ex2_type.view(), k_ex_type.view(), nex_group, k_ex1_group.view(), k_ex2_group.view(), k_ex1_bit.view(), k_ex2_bit.view(), nex_mol, k_ex_mol_group.view(), k_ex_mol_bit.view(), bboxhi,bboxlo, domain->xperiodic,domain->yperiodic,domain->zperiodic, domain->xprd_half,domain->yprd_half,domain->zprd_half); k_cutneighsq.sync(); k_ex1_type.sync(); k_ex2_type.sync(); k_ex_type.sync(); k_ex1_group.sync(); k_ex2_group.sync(); k_ex1_bit.sync(); k_ex2_bit.sync(); k_ex_mol_group.sync(); k_ex_mol_bit.sync(); k_bincount.sync(); k_bins.sync(); k_gbincount.sync(); k_gbins.sync(); atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK); data.special_flag[0] = special_flag[0]; data.special_flag[1] = special_flag[1]; data.special_flag[2] = special_flag[2]; data.special_flag[3] = special_flag[3]; bool firstTry = true; data.h_resize()=1; while(data.h_resize()) { data.h_new_maxneighs() = list->maxneighs; data.h_resize() = 0; Kokkos::deep_copy(data.resize, data.h_resize); Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs); -#ifdef NOTYET - NPairSSAKokkosBuildFunctor f(data,atoms_per_bin*5*sizeof(X_FLOAT)); - Kokkos::parallel_for(nall, f); -#endif // loop over bins with local atoms, storing half of the neighbors -#ifdef USE_LAMBDA_BUILD Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) { data.build_locals_onePhase(firstTry, comm->me, workPhase); }); -#else - NPairSSAKokkosBuildFunctor f(data, firstTry, comm->me); - Kokkos::parallel_for(ssa_phaseCt, f); -#endif data.neigh_list.inum = ssa_itemLoc(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1) + ssa_itemLen(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1); - data.build_ghosts(); + + // loop over AIR ghost atoms, storing their local neighbors + Kokkos::parallel_for(ssa_gphaseCt, LAMMPS_LAMBDA (const int workPhase) { + data.build_ghosts_onePhase(workPhase); + }); + data.neigh_list.gnum = ssa_gitemLoc(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) + + ssa_gitemLen(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum; firstTry = false; DeviceType::fence(); deep_copy(data.h_resize, data.resize); if(data.h_resize()) { deep_copy(data.h_new_maxneighs, data.new_maxneighs); list->maxneighs = data.h_new_maxneighs() * 1.2; list->d_neighbors = typename ArrayTypes::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs); data.neigh_list.d_neighbors = list->d_neighbors; data.neigh_list.maxneighs = list->maxneighs; } } k_ssa_phaseLen.modify(); k_ssa_itemLoc.modify(); k_ssa_itemLen.modify(); k_ssa_gphaseLen.modify(); k_ssa_gitemLoc.modify(); k_ssa_gitemLen.modify(); list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something #ifdef DEBUG_SSA_BUILD_LOCALS fprintf(stdout, "Fina%03d %6d inum %6d gnum, total used %6d, allocated %6d\n" ,comm->me ,list->inum ,list->gnum ,list->inum + list->gnum ,nl_size ); #endif list->k_ilist.template modify(); } template void NPairSSAKokkosExecute::build_locals_onePhase(const bool firstTry, int me, int workPhase) const { const typename ArrayTypes::t_int_1d_const_um stencil = d_stencil; int which = 0; int zoff = d_ssa_phaseOff(workPhase, 2); int yoff = d_ssa_phaseOff(workPhase, 1); int xoff = d_ssa_phaseOff(workPhase, 0); int workItem = 0; int skippedItems = 0; for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) { for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) { for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) { if (d_ssa_itemLen(workPhase, workItem + skippedItems) == 0) { if (firstTry) ++skippedItems; else ++workItem; // phase is done,should break out of three loops here if we could... continue; } int inum_start = d_ssa_itemLoc(workPhase, workItem + skippedItems); int inum = inum_start; for (int subphase = 0; subphase < 4; subphase++) { int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0); int s_xbin = xbin + ((subphase & 0x1) ? sx1 - 1 : 0); if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue; if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue; int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin; for (int il = 0; il < c_bincount(ibin); ++il) { const int i = c_bins(ibin, il); int n = 0; const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); const int itype = type(i); // loop over all local atoms in the current stencil "subphase" for (int k = d_nstencil_ssa(subphase); k < d_nstencil_ssa(subphase+1); k++) { const int jbin = ibin+stencil(k); int jl; if (jbin != ibin) jl = 0; else jl = il + 1; // same bin as i, so start just past i in the bin for (; jl < c_bincount(jbin); ++jl) { const int j = c_bins(jbin, jl); const int jtype = type(j); if(exclude && exclusion(i,j,itype,jtype)) continue; const X_FLOAT delx = xtmp - x(j, 0); const X_FLOAT dely = ytmp - x(j, 1); const X_FLOAT delz = ztmp - x(j, 2); const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if(rsq <= cutneighsq(itype,jtype)) { if (molecular) { if (!moltemplate) which = find_special(i,j); /* else if (imol >= 0) */ /* which = find_special(onemols[imol]->special[iatom], */ /* onemols[imol]->nspecial[iatom], */ /* tag[j]-tagprev); */ /* else which = 0; */ if (which == 0){ if(n 0) { if(n 0) { neigh_list.d_numneigh(inum) = n; neigh_list.d_ilist(inum++) = i; if(n > neigh_list.maxneighs) { resize() = 1; if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); } } } } int len = inum - inum_start; #ifdef DEBUG_SSA_BUILD_LOCALS if (len != d_ssa_itemLen(workPhase, workItem + skippedItems)) { fprintf(stdout, "Leng%03d workphase (%2d,%3d,%3d): len = %4d, but ssa_itemLen = %4d%s\n" ,me ,workPhase ,workItem ,workItem + skippedItems ,len ,d_ssa_itemLen(workPhase, workItem + skippedItems) ,(len > d_ssa_itemLen(workPhase, workItem + skippedItems)) ? " OVERFLOW" : "" ); } #endif if (inum > inum_start) { d_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist d_ssa_itemLen(workPhase,workItem) = inum - inum_start; // record actual workItem length workItem++; } else if (firstTry) ++skippedItems; } } } #ifdef DEBUG_SSA_BUILD_LOCALS fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %3d, inums/workItems = %g\n" ,me ,workPhase ,inum - d_ssa_itemLoc(workPhase, 0) ,workItem ,skippedItems ,(inum - d_ssa_itemLoc(workPhase, 0)) / (double) workItem ); #endif // record where workPhase actually ends if (firstTry) { d_ssa_phaseLen(workPhase) = workItem; while (workItem < (int) d_ssa_itemLen.dimension_1()) { d_ssa_itemLen(workPhase,workItem++) = 0; } } } template -void NPairSSAKokkosExecute::build_ghosts() +void NPairSSAKokkosExecute::build_ghosts_onePhase(int workPhase) const { - int n = 0; + const typename ArrayTypes::t_int_1d_const_um stencil = d_stencil; int which = 0; - int inum = neigh_list.inum; - int gnum = 0; - // loop over AIR ghost atoms, storing their local neighbors // since these are ghosts, must check if stencil bin is out of bounds - for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) { int airnum = workPhase + 1; //FIXME for now, there is only 1 workItem for each ghost AIR int workItem; for (workItem = 0; workItem < 1; ++workItem) { - d_ssa_gitemLoc(workPhase, workItem) = inum + gnum; // record where workItem starts in ilist + int gNdx = d_ssa_gitemLoc(workPhase, workItem); // record where workItem starts in ilist for (int il = 0; il < c_gbincount(airnum); ++il) { const int i = c_gbins(airnum, il); - n = 0; + int n = 0; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum + gnum); + const AtomNeighbors neighbors_i = neigh_list.get_neighbors(gNdx); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); const int itype = type(i); - const typename ArrayTypes::t_int_1d_const_um stencil - = d_stencil; - int loc[3]; const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(loc[0])); // loop over AIR ghost atoms in all bins in "full" stencil // Note: the non-AIR ghost atoms have already been filtered out for (int k = 0; k < nstencil; k++) { int xbin2 = loc[0] + d_stencilxyz(k,0); int ybin2 = loc[1] + d_stencilxyz(k,1); int zbin2 = loc[2] + d_stencilxyz(k,2); // Skip it if this bin is outside the extent of local bins if (xbin2 < lbinxlo || xbin2 >= lbinxhi || ybin2 < lbinylo || ybin2 >= lbinyhi || zbin2 < lbinzlo || zbin2 >= lbinzhi) continue; const int jbin = ibin+stencil(k); for (int jl = 0; jl < c_bincount(jbin); ++jl) { const int j = c_bins(jbin, jl); const int jtype = type(j); if(exclude && exclusion(i,j,itype,jtype)) continue; const X_FLOAT delx = xtmp - x(j, 0); const X_FLOAT dely = ytmp - x(j, 1); const X_FLOAT delz = ztmp - x(j, 2); const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if(rsq <= cutneighsq(itype,jtype)) { if (molecular) { if (!moltemplate) which = find_special(j,i); /* else if (jmol >= 0) */ /* which = find_special(onemols[jmol]->special[jatom], */ /* onemols[jmol]->nspecial[jatom], */ /* tag[i]-jtagprev); */ /* else which = 0; */ if (which == 0){ if(n 0) { if(n 0) { - neigh_list.d_numneigh(inum + gnum) = n; - neigh_list.d_ilist(inum + (gnum++)) = i; + neigh_list.d_numneigh(gNdx) = n; + neigh_list.d_ilist(gNdx++) = i; if(n > neigh_list.maxneighs) { resize() = 1; if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n); } } } // record where workItem ends in ilist - d_ssa_gitemLen(workPhase,workItem) = inum + gnum - d_ssa_gitemLoc(workPhase,workItem); + d_ssa_gitemLen(workPhase,workItem) = gNdx - d_ssa_gitemLoc(workPhase,workItem); // if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++; } d_ssa_gphaseLen(workPhase) = workItem; - } - neigh_list.gnum = gnum; } } namespace LAMMPS_NS { template class NPairSSAKokkos; #ifdef KOKKOS_HAVE_CUDA template class NPairSSAKokkos; #endif } diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h index 62c4135cc..98046feba 100644 --- a/src/KOKKOS/npair_ssa_kokkos.h +++ b/src/KOKKOS/npair_ssa_kokkos.h @@ -1,379 +1,362 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #ifdef NPAIR_CLASS typedef NPairSSAKokkos NPairSSAKokkosHost; NPairStyle(half/bin/newton/ssa/kk/host, NPairSSAKokkosHost, NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST | NP_KOKKOS_HOST) typedef NPairSSAKokkos NPairSSAKokkosDevice; NPairStyle(half/bin/newton/ssa/kk/device, NPairSSAKokkosDevice, NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST | NP_KOKKOS_DEVICE) #else #ifndef LMP_NPAIR_SSA_KOKKOS_H #define LMP_NPAIR_SSA_KOKKOS_H #include "npair.h" #include "neigh_list_kokkos.h" namespace LAMMPS_NS { template class NPairSSAKokkos : public NPair { public: typedef ArrayTypes AT; // SSA Work plan data structures int ssa_phaseCt; DAT::tdual_int_1d k_ssa_phaseLen; DAT::tdual_int_1d_3 k_ssa_phaseOff; DAT::tdual_int_2d k_ssa_itemLoc; DAT::tdual_int_2d k_ssa_itemLen; typename AT::t_int_1d ssa_phaseLen; typename AT::t_int_1d_3 ssa_phaseOff; typename AT::t_int_2d ssa_itemLoc; typename AT::t_int_2d ssa_itemLen; const int ssa_gphaseCt; DAT::tdual_int_1d k_ssa_gphaseLen; DAT::tdual_int_2d k_ssa_gitemLoc; DAT::tdual_int_2d k_ssa_gitemLen; typename AT::t_int_1d ssa_gphaseLen; typename AT::t_int_2d ssa_gitemLoc; typename AT::t_int_2d ssa_gitemLen; NPairSSAKokkos(class LAMMPS *); ~NPairSSAKokkos() {} void copy_neighbor_info(); void copy_bin_info(); void copy_stencil_info(); void build(class NeighList *); private: // data from Neighbor class DAT::tdual_xfloat_2d k_cutneighsq; // exclusion data from Neighbor class DAT::tdual_int_1d k_ex1_type,k_ex2_type; DAT::tdual_int_2d k_ex_type; DAT::tdual_int_1d k_ex1_group,k_ex2_group; DAT::tdual_int_1d k_ex1_bit,k_ex2_bit; DAT::tdual_int_1d k_ex_mol_group; DAT::tdual_int_1d k_ex_mol_bit; // data from NBinSSA class int atoms_per_bin; DAT::tdual_int_1d k_bincount; DAT::tdual_int_2d k_bins; int ghosts_per_gbin; DAT::tdual_int_1d k_gbincount; DAT::tdual_int_2d k_gbins; int lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi; // data from NStencilSSA class int nstencil; DAT::tdual_int_1d k_stencil; // # of J neighs for each I DAT::tdual_int_1d_3 k_stencilxyz; DAT::tdual_int_1d k_nstencil_ssa; int sx1, sy1, sz1; }; template class NPairSSAKokkosExecute { typedef ArrayTypes AT; public: NeighListKokkos neigh_list; // data from Neighbor class const typename AT::t_xfloat_2d_randomread cutneighsq; // exclusion data from Neighbor class const int exclude; const int nex_type; const typename AT::t_int_1d_const ex1_type,ex2_type; const typename AT::t_int_2d_const ex_type; const int nex_group; const typename AT::t_int_1d_const ex1_group,ex2_group; const typename AT::t_int_1d_const ex1_bit,ex2_bit; const int nex_mol; const typename AT::t_int_1d_const ex_mol_group; const typename AT::t_int_1d_const ex_mol_bit; // data from NBinSSA class const typename AT::t_int_1d bincount; const typename AT::t_int_1d_const c_bincount; typename AT::t_int_2d bins; typename AT::t_int_2d_const c_bins; const typename AT::t_int_1d gbincount; const typename AT::t_int_1d_const c_gbincount; typename AT::t_int_2d gbins; typename AT::t_int_2d_const c_gbins; const int lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi; // data from NStencil class const int nstencil; const int sx1, sy1, sz1; typename AT::t_int_1d d_stencil; // # of J neighs for each I typename AT::t_int_1d_3 d_stencilxyz; typename AT::t_int_1d d_nstencil_ssa; // data from Atom class const typename AT::t_x_array_randomread x; const typename AT::t_int_1d_const type,mask; const typename AT::t_tagint_1d_const molecule; const typename AT::t_tagint_1d_const tag; const typename AT::t_tagint_2d_const special; const typename AT::t_int_2d_const nspecial; const int molecular; int moltemplate; int special_flag[4]; const int nbinx,nbiny,nbinz; const int mbinx,mbiny,mbinz; const int mbinxlo,mbinylo,mbinzlo; const X_FLOAT bininvx,bininvy,bininvz; X_FLOAT bboxhi[3],bboxlo[3]; const int nlocal; typename AT::t_int_scalar resize; typename AT::t_int_scalar new_maxneighs; typename ArrayTypes::t_int_scalar h_resize; typename ArrayTypes::t_int_scalar h_new_maxneighs; const int xperiodic, yperiodic, zperiodic; const int xprd_half, yprd_half, zprd_half; // SSA Work plan data structures int ssa_phaseCt; typename AT::t_int_1d d_ssa_phaseLen; typename AT::t_int_1d_3_const d_ssa_phaseOff; typename AT::t_int_2d d_ssa_itemLoc; typename AT::t_int_2d d_ssa_itemLen; int ssa_gphaseCt; typename AT::t_int_1d d_ssa_gphaseLen; typename AT::t_int_2d d_ssa_gitemLoc; typename AT::t_int_2d d_ssa_gitemLen; NPairSSAKokkosExecute( const NeighListKokkos &_neigh_list, const typename AT::t_xfloat_2d_randomread &_cutneighsq, const typename AT::t_int_1d &_bincount, const typename AT::t_int_2d &_bins, const typename AT::t_int_1d &_gbincount, const typename AT::t_int_2d &_gbins, const int _lbinxlo, const int _lbinxhi, const int _lbinylo, const int _lbinyhi, const int _lbinzlo, const int _lbinzhi, const int _nstencil, const int _sx1, const int _sy1, const int _sz1, const typename AT::t_int_1d &_d_stencil, const typename AT::t_int_1d_3 &_d_stencilxyz, const typename AT::t_int_1d &_d_nstencil_ssa, const int _ssa_phaseCt, const typename AT::t_int_1d &_d_ssa_phaseLen, const typename AT::t_int_1d_3 &_d_ssa_phaseOff, const typename AT::t_int_2d &_d_ssa_itemLoc, const typename AT::t_int_2d &_d_ssa_itemLen, const int _ssa_gphaseCt, const typename AT::t_int_1d &_d_ssa_gphaseLen, const typename AT::t_int_2d &_d_ssa_gitemLoc, const typename AT::t_int_2d &_d_ssa_gitemLen, const int _nlocal, const typename AT::t_x_array_randomread &_x, const typename AT::t_int_1d_const &_type, const typename AT::t_int_1d_const &_mask, const typename AT::t_tagint_1d_const &_molecule, const typename AT::t_tagint_1d_const &_tag, const typename AT::t_tagint_2d_const &_special, const typename AT::t_int_2d_const &_nspecial, const int &_molecular, const int & _nbinx,const int & _nbiny,const int & _nbinz, const int & _mbinx,const int & _mbiny,const int & _mbinz, const int & _mbinxlo,const int & _mbinylo,const int & _mbinzlo, const X_FLOAT &_bininvx,const X_FLOAT &_bininvy,const X_FLOAT &_bininvz, const int & _exclude,const int & _nex_type, const typename AT::t_int_1d_const & _ex1_type, const typename AT::t_int_1d_const & _ex2_type, const typename AT::t_int_2d_const & _ex_type, const int & _nex_group, const typename AT::t_int_1d_const & _ex1_group, const typename AT::t_int_1d_const & _ex2_group, const typename AT::t_int_1d_const & _ex1_bit, const typename AT::t_int_1d_const & _ex2_bit, const int & _nex_mol, const typename AT::t_int_1d_const & _ex_mol_group, const typename AT::t_int_1d_const & _ex_mol_bit, const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo, const int & _xperiodic, const int & _yperiodic, const int & _zperiodic, const int & _xprd_half, const int & _yprd_half, const int & _zprd_half): neigh_list(_neigh_list), cutneighsq(_cutneighsq), bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins), gbincount(_gbincount),c_gbincount(_gbincount),gbins(_gbins),c_gbins(_gbins), lbinxlo(_lbinxlo),lbinxhi(_lbinxhi), lbinylo(_lbinylo),lbinyhi(_lbinyhi), lbinzlo(_lbinzlo),lbinzhi(_lbinzhi), nstencil(_nstencil),sx1(_sx1),sy1(_sy1),sz1(_sz1), d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),d_nstencil_ssa(_d_nstencil_ssa), ssa_phaseCt(_ssa_phaseCt), d_ssa_phaseLen(_d_ssa_phaseLen), d_ssa_phaseOff(_d_ssa_phaseOff), d_ssa_itemLoc(_d_ssa_itemLoc), d_ssa_itemLen(_d_ssa_itemLen), ssa_gphaseCt(_ssa_gphaseCt), d_ssa_gphaseLen(_d_ssa_gphaseLen), d_ssa_gitemLoc(_d_ssa_gitemLoc), d_ssa_gitemLen(_d_ssa_gitemLen), nlocal(_nlocal), x(_x),type(_type),mask(_mask),molecule(_molecule), tag(_tag),special(_special),nspecial(_nspecial),molecular(_molecular), nbinx(_nbinx),nbiny(_nbiny),nbinz(_nbinz), mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz), mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo), bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz), exclude(_exclude),nex_type(_nex_type), ex1_type(_ex1_type),ex2_type(_ex2_type),ex_type(_ex_type), nex_group(_nex_group), ex1_group(_ex1_group),ex2_group(_ex2_group), ex1_bit(_ex1_bit),ex2_bit(_ex2_bit),nex_mol(_nex_mol), ex_mol_group(_ex_mol_group),ex_mol_bit(_ex_mol_bit), xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic), xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half) { if (molecular == 2) moltemplate = 1; else moltemplate = 0; bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2]; bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2]; - resize = typename AT::t_int_scalar("NeighborKokkosFunctor::resize"); + resize = typename AT::t_int_scalar("NPairSSAKokkosExecute::resize"); #ifndef KOKKOS_USE_CUDA_UVM h_resize = Kokkos::create_mirror_view(resize); #else h_resize = resize; #endif h_resize() = 1; new_maxneighs = typename AT:: - t_int_scalar("NeighborKokkosFunctor::new_maxneighs"); + t_int_scalar("NPairSSAKokkosExecute::new_maxneighs"); #ifndef KOKKOS_USE_CUDA_UVM h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs); #else h_new_maxneighs = new_maxneighs; #endif h_new_maxneighs() = neigh_list.maxneighs; }; ~NPairSSAKokkosExecute() {neigh_list.copymode = 1;}; KOKKOS_FUNCTION void build_locals_onePhase(const bool firstTry, int me, int workPhase) const; - void build_ghosts(); + KOKKOS_FUNCTION + void build_ghosts_onePhase(int workPhase) const; KOKKOS_INLINE_FUNCTION int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const { int ix,iy,iz; if (x >= bboxhi[0]) ix = static_cast ((x-bboxhi[0])*bininvx) + nbinx; else if (x >= bboxlo[0]) { ix = static_cast ((x-bboxlo[0])*bininvx); ix = MIN(ix,nbinx-1); } else ix = static_cast ((x-bboxlo[0])*bininvx) - 1; if (y >= bboxhi[1]) iy = static_cast ((y-bboxhi[1])*bininvy) + nbiny; else if (y >= bboxlo[1]) { iy = static_cast ((y-bboxlo[1])*bininvy); iy = MIN(iy,nbiny-1); } else iy = static_cast ((y-bboxlo[1])*bininvy) - 1; if (z >= bboxhi[2]) iz = static_cast ((z-bboxhi[2])*bininvz) + nbinz; else if (z >= bboxlo[2]) { iz = static_cast ((z-bboxlo[2])*bininvz); iz = MIN(iz,nbinz-1); } else iz = static_cast ((z-bboxlo[2])*bininvz) - 1; i[0] = ix - mbinxlo; i[1] = iy - mbinylo; i[2] = iz - mbinzlo; return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo); } KOKKOS_INLINE_FUNCTION int exclusion(const int &i,const int &j, const int &itype,const int &jtype) const; KOKKOS_INLINE_FUNCTION int find_special(const int &i, const int &j) const; KOKKOS_INLINE_FUNCTION int minimum_image_check(double dx, double dy, double dz) const { if (xperiodic && fabs(dx) > xprd_half) return 1; if (yperiodic && fabs(dy) > yprd_half) return 1; if (zperiodic && fabs(dz) > zprd_half) return 1; return 0; } }; -template -struct NPairSSAKokkosBuildFunctor { - typedef DeviceType device_type; - - const NPairSSAKokkosExecute c; - const bool firstTry; - const int me; - - NPairSSAKokkosBuildFunctor(const NPairSSAKokkosExecute &_c, - const bool _firstTry, const int _me):c(_c), - firstTry(_firstTry), me(_me) {}; - - KOKKOS_INLINE_FUNCTION - void operator() (const int & i) const { - c.build_locals_onePhase(firstTry, me, i); - } -}; - } #endif #endif /* ERROR/WARNING messages: */