Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90896547
neigh_half_bin_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Nov 5, 18:33
Size
77 KB
Mime Type
text/x-c++
Expires
Thu, Nov 7, 18:33 (1 d, 22 h)
Engine
blob
Format
Raw Data
Handle
22154902
Attached To
rLAMMPS lammps
neigh_half_bin_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
//#define OUTER_CHUNK 1
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "comm.h"
#include "group.h"
#include "fix_intel.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
#ifdef LMP_USE_AVXCD
#include "intel_simd.h"
#endif
#ifdef OUTER_CHUNK
#include "intel_simd.h"
#endif
using namespace LAMMPS_NS;
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#define ofind_special(which, special, nspecial, i, tag) \
{ \
which = 0; \
const int n1 = nspecial[i * 3]; \
const int n2 = nspecial[i * 3 + 1]; \
const int n3 = nspecial[i * 3 + 2]; \
const tagint *sptr = special + i * maxspecial; \
for (int s = 0; s < n3; s++) { \
if (sptr[s] == tag) { \
if (s < n1) { \
which = 1; \
} else if (s < n2) { \
which = 2; \
} else { \
which = 3; \
} \
} \
} \
}
#define ominimum_image_check(answer, dx, dy, dz) \
{ \
answer = 0; \
if (xperiodic && fabs(dx) > xprd_half) answer = 1; \
if (yperiodic && fabs(dy) > yprd_half) answer = 1; \
if (zperiodic && fabs(dz) > zprd_half) answer = 1; \
}
#define dminimum_image_check(answer, dx, dy, dz) \
{ \
answer = 0; \
if (domain->xperiodic && fabs(dx) > domain->xprd_half) answer = 1; \
if (domain->yperiodic && fabs(dy) > domain->yprd_half) answer = 1; \
if (domain->zperiodic && fabs(dz) > domain->zprd_half) answer = 1; \
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
template <class flt_t, class acc_t>
void Neighbor::bin_atoms(void * xin, int * _noalias const atombin,
int * _noalias const binpacked) {
const ATOM_T * _noalias const x = (const ATOM_T * _noalias const)xin;
int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
const double sboxlo0 = bboxlo[0] + mbinxlo/bininvx;
const double sboxlo1 = bboxlo[1] + mbinylo/bininvy;
const double sboxlo2 = bboxlo[2] + mbinzlo/bininvz;
int i, ibin;
for (i = 0; i < mbins; i++) binhead[i] = -1;
int *mask = atom->mask;
if (includegroup) {
int bitmask = group->bitmask[includegroup];
for (i = nall-1; i >= nlocal; i--) {
if (mask[i] & bitmask) {
ibin = coord2bin(atom->x[i]);
bins[i] = binhead[ibin];
binhead[ibin] = i;
}
}
for (i = atom->nfirst-1; i >= 0; i--) {
ibin = coord2bin(atom->x[i]);
atombin[i] = ibin;
bins[i] = binhead[ibin];
binhead[ibin] = i;
}
} else {
for (i = nall-1; i >= nlocal; i--) {
ibin = coord2bin(atom->x[i]);
bins[i] = binhead[ibin];
binhead[ibin] = i;
}
for (i = nlocal-1; i >= 0; i--) {
ibin = coord2bin(atom->x[i]);
atombin[i]=ibin;
bins[i] = binhead[ibin];
binhead[ibin] = i;
}
}
int newhead = 0;
for (i = 0; i < mbins; i++) {
int j = binhead[i];
binhead[i] = newhead;
for ( ; j >= 0; j = bins[j])
binpacked[newhead++] = j;
}
binhead[mbins] = newhead;
}
/* ----------------------------------------------------------------------
binned neighbor list construction with partial Newton's 3rd law
each owned atom i checks own bin and other bins in stencil
pair stored once if i,j are both owned and i < j
pair stored by me if j is ghost (also stored by proc owning j)
------------------------------------------------------------------------- */
void Neighbor::half_bin_no_newton_intel(NeighList *list)
{
const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
list->inum = nlocal;
// Get fix for intel stuff
FixIntel *fix = static_cast<FixIntel *>(fix_intel);
const int off_end = fix->offload_end_neighbor();
int host_start = off_end;;
#ifdef _LMP_INTEL_OFFLOAD
if (fix->full_host_list()) host_start = 0;
if (exclude)
error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
#endif
if (list->nstencil > INTEL_MAX_STENCIL_CHECK)
error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
int need_ic = 0;
if (atom->molecular)
dminimum_image_check(need_ic, cutneighmax, cutneighmax, cutneighmax);
if (need_ic) {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
hbnni<float,double,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbnni<float,double,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal,fix);
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
hbnni<double,double,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbnni<double,double,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
} else {
hbnni<float,float,1>(1, list, fix->get_single_buffers(),
0, off_end, fix);
hbnni<float,float,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
} else {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
hbnni<float,double,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbnni<float,double,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal,fix);
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
hbnni<double,double,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbnni<double,double,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
} else {
hbnni<float,float,0>(1, list, fix->get_single_buffers(),
0, off_end, fix);
hbnni<float,float,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
}
template <class flt_t, class acc_t, int need_ic>
void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
const int astart, const int aend, void *fix_in) {
IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
FixIntel *fix = (FixIntel *)fix_in;
const int nall = atom->nlocal + atom->nghost;
int pad = 1;
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
ATOM_T biga;
biga.x = INTEL_BIGP;
biga.y = INTEL_BIGP;
biga.z = INTEL_BIGP;
biga.w = 1;
buffers->get_x()[nall] = biga;
const int nthreads = comm->nthreads;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T));
buffers->thr_pack(ifrom, ito, 0);
}
fix->stop_watch(TIME_PACK);
fix->start_watch(TIME_HOST_NEIGHBOR);
bin_atoms<flt_t,acc_t>(buffers->get_x(), buffers->get_atombin(),
buffers->get_binpacked());
if (INTEL_MIC_NBOR_PAD > 1)
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
} else {
fix->start_watch(TIME_HOST_NEIGHBOR);
if (INTEL_NBOR_PAD > 1)
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
}
const int pad_width = pad;
if (aend-astart == 0) {
fix->stop_watch(TIME_HOST_NEIGHBOR);
return;
}
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
const int molecular = atom->molecular;
int *ns = NULL;
tagint *s = NULL;
int tag_size = 0, special_size;
if (buffers->need_tag()) tag_size = nall;
if (molecular) {
s = atom->special[0];
ns = atom->nspecial[0];
special_size = aend;
} else {
s = &buffers->_special_holder;
ns = &buffers->_nspecial_holder;
special_size = 0;
}
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const tagint * _noalias const tag = atom->tag;
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
tagint * const molecule = atom->molecule;
#endif
int tnum;
int *overflow;
double *timer_compute;
if (offload) {
timer_compute = fix->off_watch_neighbor();
tnum = buffers->get_off_threads();
overflow = fix->get_off_overflow_flag();
fix->stop_watch(TIME_HOST_NEIGHBOR);
fix->start_watch(TIME_OFFLOAD_LATENCY);
} else {
tnum = comm->nthreads;
overflow = fix->get_overflow_flag();
}
const int nthreads = tnum;
const int maxnbors = buffers->get_max_nbors();
int * _noalias const atombin = buffers->get_atombin();
const int * _noalias const binpacked = buffers->get_binpacked();
const int xperiodic = domain->xperiodic;
const int yperiodic = domain->yperiodic;
const int zperiodic = domain->zperiodic;
const flt_t xprd_half = domain->xprd_half;
const flt_t yprd_half = domain->yprd_half;
const flt_t zprd_half = domain->zprd_half;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const flt_t dx = (INTEL_BIGP - bboxhi[0]);
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
error->one(FLERR,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
}
#ifdef _LMP_INTEL_OFFLOAD
const int * _noalias const binhead = this->binhead;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
in(x:length(nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(atombin:length(aend) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload,nall) \
in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(tag)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow[LMP_LOCAL_MIN] = astart;
overflow[LMP_LOCAL_MAX] = aend - 1;
overflow[LMP_GHOST_MIN] = nall;
overflow[LMP_GHOST_MAX] = -1;
#endif
int nstencilp = 0;
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
for (int k = 0; k < nstencil; k++) {
binstart[nstencilp] = stencil[k];
int end = stencil[k] + 1;
for (int kk = k + 1; kk < nstencil; kk++) {
if (stencil[kk-1]+1 == stencil[kk]) {
end++;
k++;
} else break;
}
binend[nstencilp] = end;
nstencilp++;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(numneigh, overflow, nstencilp, binstart, binend)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int lmin = nall, lmax = -1, gmin = nall, gmax = -1;
#endif
const int num = aend - astart;
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
ifrom += astart;
ito += astart;
int which;
const int list_size = (ito + tid + 1) * maxnbors;
int ct = (ifrom + tid) * maxnbors;
int *neighptr = firstneigh + ct;
for (int i = ifrom; i < ito; i++) {
int j, k, n, n2, itype, jtype, ibin;
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
n = 0;
n2 = maxnbors;
xtmp = x[i].x;
ytmp = x[i].y;
ztmp = x[i].z;
itype = x[i].w;
const int ioffset = ntypes*itype;
// loop over all atoms in other bins in stencil including self
// only store pair if i < j
// stores own/own pairs only once
// stores own/ghost pairs on both procs
ibin = atombin[i];
for (k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
if (j <= i) continue;
jtype = x[j].w;
#ifndef _LMP_INTEL_OFFLOAD
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
#endif
delx = xtmp - x[j].x;
dely = ytmp - x[j].y;
delz = ztmp - x[j].z;
rsq = delx * delx + dely * dely + delz * delz;
if (rsq <= cutneighsq[ioffset + jtype]) {
if (j < nlocal) {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n++] = -j - 1;
else
neighptr[n++] = j;
} else
neighptr[n++] = j;
#ifdef _LMP_INTEL_OFFLOAD
if (j < lmin) lmin = j;
if (j > lmax) lmax = j;
#endif
} else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[n2++] = -j - 1;
else
neighptr[n2++] = j;
} else
neighptr[n2++] = j;
#ifdef _LMP_INTEL_OFFLOAD
if (j < gmin) gmin = j;
if (j > gmax) gmax = j;
#endif
}
}
}
}
ilist[i] = i;
cnumneigh[i] = ct;
if (n > maxnbors) *overflow = 1;
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
const int edge = (n % pad_width);
if (edge) {
const int pad_end = n + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min=1, max=15, avg=8
#endif
for ( ; n < pad_end; n++)
neighptr[n] = nall;
}
numneigh[i] = n;
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
ct += n;
neighptr += n;
if (ct + n + maxnbors > list_size) {
*overflow = 1;
ct = (ifrom + tid) * maxnbors;
}
}
if (*overflow == 1)
for (int i = ifrom; i < ito; i++)
numneigh[i] = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
}
#pragma omp barrier
}
int ghost_offset = 0, nall_offset = nall;
if (separate_buffers) {
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
if (nghost < 0) nghost = 0;
if (offload) {
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
} else {
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
nall_offset = nlocal + nghost;
}
}
#endif
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
jlist[jj] = -j - 1;
} else
ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) {
if (j == nall)
jlist[jj] = nall_offset;
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++)
if (jlist[jj] >= nlocal) break;
while (jj < jnum) {
if (jlist[jj] == nall) jlist[jj] = nall_offset;
else jlist[jj] -= ghost_offset;
jj++;
}
}
}
#endif
} // end omp
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload) {
fix->stop_watch(TIME_OFFLOAD_LATENCY);
#ifdef _LMP_INTEL_OFFLOAD
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
#endif
} else {
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
fix->stop_watch(TIME_HOST_NEIGHBOR);
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
fix->start_watch(TIME_PACK);
fix->set_neighbor_host_sizes();
buffers->pack_sep_from_single(fix->host_min_local(),
fix->host_used_local(),
fix->host_min_ghost(),
fix->host_used_ghost());
fix->stop_watch(TIME_PACK);
}
#endif
}
}
/* ----------------------------------------------------------------------
binned neighbor list construction with full Newton's 3rd law
each owned atom i checks its own bin and other bins in Newton stencil
every pair stored exactly once by some processor
------------------------------------------------------------------------- */
void Neighbor::half_bin_newton_intel(NeighList *list)
{
const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
list->inum = nlocal;
// Get fix for intel stuff
FixIntel *fix = static_cast<FixIntel *>(fix_intel);
const int off_end = fix->offload_end_neighbor();
int host_start = fix->host_start_neighbor();;
int offload_noghost = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (fix->full_host_list()) host_start = 0;
offload_noghost = fix->offload_noghost();
if (exclude)
error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
#endif
if (list->nstencil / 2 > INTEL_MAX_STENCIL_CHECK)
error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
int need_ic = 0;
if (atom->molecular)
dminimum_image_check(need_ic, cutneighmax, cutneighmax, cutneighmax);
if (need_ic) {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbni<float,double,1,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbni<float,double,1,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbni<float,double,0,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbni<float,double,0,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix);
}
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbni<double,double,1,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbni<double,double,1,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbni<double,double,0,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbni<double,double,0,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
}
} else {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbni<float,float,1,1>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
hbni<float,float,1,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbni<float,float,0,1>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
hbni<float,float,0,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
} else {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbni<float,double,1,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbni<float,double,1,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbni<float,double,0,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbni<float,double,0,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix);
}
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbni<double,double,1,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbni<double,double,1,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbni<double,double,0,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbni<double,double,0,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
}
} else {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbni<float,float,1,0>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
hbni<float,float,1,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbni<float,float,0,0>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
hbni<float,float,0,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
}
}
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
const int astart, const int aend, void *fix_in,
const int offload_end) {
IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
FixIntel *fix = (FixIntel *)fix_in;
const int nall = atom->nlocal + atom->nghost;
int pad = 1;
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
ATOM_T biga;
biga.x = INTEL_BIGP;
biga.y = INTEL_BIGP;
biga.z = INTEL_BIGP;
biga.w = 1;
buffers->get_x()[nall]=biga;
const int nthreads = comm->nthreads;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T));
buffers->thr_pack(ifrom, ito, 0);
}
fix->stop_watch(TIME_PACK);
fix->start_watch(TIME_HOST_NEIGHBOR);
bin_atoms<flt_t,acc_t>(buffers->get_x(), buffers->get_atombin(),
buffers->get_binpacked());
if (INTEL_MIC_NBOR_PAD > 1)
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
} else {
fix->start_watch(TIME_HOST_NEIGHBOR);
if (INTEL_NBOR_PAD > 1)
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
}
const int pad_width = pad;
if (aend-astart == 0) {
fix->stop_watch(TIME_HOST_NEIGHBOR);
return;
}
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int nall_t = nall;
if (offload_noghost && offload) nall_t = atom->nlocal;
const int e_nall = nall_t;
const int molecular = atom->molecular;
int *ns = NULL;
tagint *s = NULL;
int tag_size = 0, special_size;
if (buffers->need_tag()) tag_size = e_nall;
if (molecular) {
s = atom->special[0];
ns = atom->nspecial[0];
special_size = aend;
} else {
s = &buffers->_special_holder;
ns = &buffers->_nspecial_holder;
special_size = 0;
}
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const tagint * _noalias const tag = atom->tag;
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
tagint * const molecule = atom->molecule;
#endif
int tnum;
int *overflow;
double *timer_compute;
if (offload) {
timer_compute = fix->off_watch_neighbor();
tnum = buffers->get_off_threads();
overflow = fix->get_off_overflow_flag();
fix->stop_watch(TIME_HOST_NEIGHBOR);
fix->start_watch(TIME_OFFLOAD_LATENCY);
} else {
tnum = comm->nthreads;
overflow = fix->get_overflow_flag();
}
const int nthreads = tnum;
const int maxnbors = buffers->get_max_nbors();
int * _noalias const atombin = buffers->get_atombin();
const int * _noalias const binpacked = buffers->get_binpacked();
const int xperiodic = domain->xperiodic;
const int yperiodic = domain->yperiodic;
const int zperiodic = domain->zperiodic;
const flt_t xprd_half = domain->xprd_half;
const flt_t yprd_half = domain->yprd_half;
const flt_t zprd_half = domain->zprd_half;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const flt_t dx = (INTEL_BIGP - bboxhi[0]);
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
error->one(FLERR,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
}
#ifdef _LMP_INTEL_OFFLOAD
const int * _noalias const binhead = this->binhead;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(atombin:length(aend) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(tag)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow[LMP_LOCAL_MIN] = astart;
overflow[LMP_LOCAL_MAX] = aend - 1;
overflow[LMP_GHOST_MIN] = e_nall;
overflow[LMP_GHOST_MAX] = -1;
#endif
int nstencilp = 0;
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
for (int k = 0; k < nstencil; k++) {
binstart[nstencilp] = stencil[k];
int end = stencil[k] + 1;
for (int kk = k + 1; kk < nstencil; kk++) {
if (stencil[kk-1]+1 == stencil[kk]) {
end++;
k++;
} else break;
}
binend[nstencilp] = end;
nstencilp++;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(numneigh, overflow, nstencilp, binstart, binend)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
#endif
const int num = aend - astart;
int tid, ifrom, ito;
#ifdef OUTER_CHUNK
const int swidth = ip_simd::SIMD_type<flt_t>::width();
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
ifrom += astart;
ito += astart;
int e_ito = ito;
if (ito == num) {
int imod = ito % swidth;
if (imod) e_ito += swidth - imod;
}
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
#else
const int swidth = 1;
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
ifrom += astart;
ito += astart;
const int list_size = (ito + tid * 2 + 2) * maxnbors;
#endif
int which;
int pack_offset = maxnbors * swidth;
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
const int obound = pack_offset + maxnbors * 2;
int max_chunk = 0;
int lane = 0;
for (int i = ifrom; i < ito; i++) {
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const int itype = x[i].w;
const int ioffset = ntypes * itype;
// loop over rest of atoms in i's bin, ghosts are at end of linked list
// if j is owned atom, store it, since j is beyond i in linked list
// if j is ghost, only store if j coords are "above/to the right" of i
int raw_count = pack_offset;
for (int j = bins[i]; j >= 0; j = bins[j]) {
if (j >= nlocal) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost && offload) continue;
#endif
if (x[j].z < ztmp) continue;
if (x[j].z == ztmp) {
if (x[j].y < ytmp) continue;
if (x[j].y == ytmp && x[j].x < xtmp) continue;
}
}
#ifdef _LMP_INTEL_OFFLOAD
else if (offload_noghost && i < offload_end) continue;
#endif
#ifndef _LMP_INTEL_OFFLOAD
if (exclude) {
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
}
#endif
neighptr[raw_count++] = j;
}
// loop over all atoms in other bins in stencil, store every pair
const int ibin = atombin[i];
if (exclude) {
for (int k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#ifndef _LMP_INTEL_OFFLOAD
#ifdef INTEL_VMASK
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
if (j < nlocal) {
if (i < offload_end) continue;
} else if (offload) continue;
}
#endif
#ifndef _LMP_INTEL_OFFLOAD
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
#endif
neighptr[raw_count++] = j;
}
}
} else {
for (int k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#ifndef _LMP_INTEL_OFFLOAD
#ifdef INTEL_VMASK
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
if (j < nlocal) {
if (i < offload_end) continue;
} else if (offload) continue;
}
#endif
neighptr[raw_count++] = j;
}
}
}
if (raw_count > obound) *overflow = 1;
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#if __INTEL_COMPILER+0 > 1499
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
#else
#pragma vector aligned
#pragma simd
#endif
#endif
for (int u = pack_offset; u < raw_count; u++) {
int j = neighptr[u];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq > cutneighsq[ioffset + jtype])
neighptr[u] = e_nall;
else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[u] = -j - 1;
}
#ifdef _LMP_INTEL_OFFLOAD
if (j < nlocal) {
if (j < vlmin) vlmin = j;
if (j > vlmax) vlmax = j;
} else {
if (j < vgmin) vgmin = j;
if (j > vgmax) vgmax = j;
}
#endif
}
}
#ifdef _LMP_INTEL_OFFLOAD
lmin = MIN(lmin,vlmin);
gmin = MIN(gmin,vgmin);
lmax = MAX(lmax,vlmax);
gmax = MAX(gmax,vgmax);
#endif
int n = lane, n2 = pack_offset;
for (int u = pack_offset; u < raw_count; u++) {
const int j = neighptr[u];
int pj = j;
if (pj < e_nall) {
if (need_ic)
if (pj < 0) pj = -pj - 1;
if (pj < nlocal) {
neighptr[n] = j;
n += swidth;
} else
neighptr[n2++] = j;
}
}
int ns = (n - lane) / swidth;
for (int u = pack_offset; u < n2; u++) {
neighptr[n] = neighptr[u];
n += swidth;
}
ilist[i] = i;
cnumneigh[i] = ct + lane;
ns += n2 - pack_offset;
#ifndef OUTER_CHUNK
int edge = (ns % pad_width);
if (edge) {
const int pad_end = ns + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min=1, max=15, avg=8
#endif
for ( ; ns < pad_end; ns++)
neighptr[ns] = e_nall;
}
#endif
numneigh[i] = ns;
#ifdef OUTER_CHUNK
if (ns > max_chunk) max_chunk = ns;
lane++;
if (lane == swidth) {
ct += max_chunk * swidth;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
int edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
max_chunk = 0;
pack_offset = maxnbors * swidth;
lane = 0;
if (ct + obound > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid * 2) * maxnbors;
}
}
}
#else
ct += ns;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
if (ct + obound > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid * 2) * maxnbors;
}
}
#endif
}
if (*overflow == 1)
for (int i = ifrom; i < ito; i++)
numneigh[i] = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
}
#pragma omp barrier
}
int ghost_offset = 0, nall_offset = e_nall;
if (separate_buffers) {
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
if (nghost < 0) nghost = 0;
if (offload) {
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
} else {
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
nall_offset = nlocal + nghost;
}
}
#endif
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
#ifndef OUTER_CHUNK
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = 0; jj < jnum; jj++) {
#else
const int trip = jnum * swidth;
for (int jj = 0; jj < trip; jj+= swidth) {
#endif
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
jlist[jj] = -j - 1;
} else
ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++)
if (jlist[jj] >= nlocal) break;
while (jj < jnum) {
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
else jlist[jj] -= ghost_offset;
jj++;
}
}
}
#endif
} // end omp
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload) {
fix->stop_watch(TIME_OFFLOAD_LATENCY);
#ifdef _LMP_INTEL_OFFLOAD
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
#endif
} else {
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
fix->stop_watch(TIME_HOST_NEIGHBOR);
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
fix->start_watch(TIME_PACK);
fix->set_neighbor_host_sizes();
buffers->pack_sep_from_single(fix->host_min_local(),
fix->host_used_local(),
fix->host_min_ghost(),
fix->host_used_ghost());
fix->stop_watch(TIME_PACK);
}
#endif
}
}
/* ----------------------------------------------------------------------
binned neighbor list construction with Newton's 3rd law for triclinic
each owned atom i checks its own bin and other bins in triclinic stencil
every pair stored exactly once by some processor
------------------------------------------------------------------------- */
void Neighbor::half_bin_newton_tri_intel(NeighList *list)
{
const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
list->inum = nlocal;
// Get fix for intel stuff
FixIntel *fix = static_cast<FixIntel *>(fix_intel);
const int off_end = fix->offload_end_neighbor();
int host_start = fix->host_start_neighbor();
int offload_noghost = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (fix->full_host_list()) host_start = 0;
offload_noghost = fix->offload_noghost();
if (exclude)
error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
#endif
if (list->nstencil / 2 > INTEL_MAX_STENCIL_CHECK)
error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
int need_ic = 0;
if (atom->molecular)
dminimum_image_check(need_ic, cutneighmax, cutneighmax, cutneighmax);
if (need_ic) {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbnti<float,double,1,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbnti<float,double,1,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbnti<float,double,0,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbnti<float,double,0,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix);
}
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbnti<double,double,1,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbnti<double,double,1,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbnti<double,double,0,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbnti<double,double,0,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
}
} else {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbnti<float,float,1,1>(1, list, fix->get_single_buffers(),
0, off_end, fix);
hbnti<float,float,1,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbnti<float,float,0,1>(1, list, fix->get_single_buffers(),
0, off_end, fix);
hbnti<float,float,0,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
} else {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbnti<float,double,1,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbnti<float,double,1,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbnti<float,double,0,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
hbnti<float,double,0,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix);
}
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbnti<double,double,1,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbnti<double,double,1,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbnti<double,double,0,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
hbnti<double,double,0,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
}
} else {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
hbnti<float,float,1,0>(1, list, fix->get_single_buffers(),
0, off_end, fix);
hbnti<float,float,1,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
hbnti<float,float,0,0>(1, list, fix->get_single_buffers(),
0, off_end, fix);
hbnti<float,float,0,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
}
}
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
const int astart, const int aend, void *fix_in,
const int offload_end) {
IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
FixIntel *fix = (FixIntel *)fix_in;
const int nall = atom->nlocal + atom->nghost;
int pad = 1;
if (list->nstencil > INTEL_MAX_STENCIL)
error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
ATOM_T biga;
biga.x = INTEL_BIGP;
biga.y = INTEL_BIGP;
biga.z = INTEL_BIGP;
biga.w = 1;
buffers->get_x()[nall]=biga;
const int nthreads = comm->nthreads;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T));
buffers->thr_pack(ifrom, ito, 0);
}
fix->stop_watch(TIME_PACK);
fix->start_watch(TIME_HOST_NEIGHBOR);
bin_atoms<flt_t,acc_t>(buffers->get_x(), buffers->get_atombin(),
buffers->get_binpacked());
if (INTEL_MIC_NBOR_PAD > 1)
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
} else {
fix->start_watch(TIME_HOST_NEIGHBOR);
if (INTEL_NBOR_PAD > 1)
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
}
const int pad_width = pad;
if (aend-astart == 0) {
fix->stop_watch(TIME_HOST_NEIGHBOR);
return;
}
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int nall_t = nall;
if (offload_noghost && offload) nall_t = atom->nlocal;
const int e_nall = nall_t;
const int molecular = atom->molecular;
int *ns = NULL;
tagint *s = NULL;
int tag_size = 0, special_size;
if (buffers->need_tag()) tag_size = e_nall;
if (molecular) {
s = atom->special[0];
ns = atom->nspecial[0];
special_size = aend;
} else {
s = &buffers->_special_holder;
ns = &buffers->_nspecial_holder;
special_size = 0;
}
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const tagint * _noalias const tag = atom->tag;
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
tagint * const molecule = atom->molecule;
#endif
int tnum;
int *overflow;
double *timer_compute;
if (offload) {
timer_compute = fix->off_watch_neighbor();
tnum = buffers->get_off_threads();
overflow = fix->get_off_overflow_flag();
fix->stop_watch(TIME_HOST_NEIGHBOR);
fix->start_watch(TIME_OFFLOAD_LATENCY);
} else {
tnum = comm->nthreads;
overflow = fix->get_overflow_flag();
}
const int nthreads = tnum;
const int maxnbors = buffers->get_max_nbors();
int * _noalias const atombin = buffers->get_atombin();
const int * _noalias const binpacked = buffers->get_binpacked();
const int xperiodic = domain->xperiodic;
const int yperiodic = domain->yperiodic;
const int zperiodic = domain->zperiodic;
const flt_t xprd_half = domain->xprd_half;
const flt_t yprd_half = domain->yprd_half;
const flt_t zprd_half = domain->zprd_half;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const flt_t dx = (INTEL_BIGP - bboxhi[0]);
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
error->one(FLERR,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
}
#ifdef _LMP_INTEL_OFFLOAD
const int * _noalias const binhead = this->binhead;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(atombin:length(aend) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \
in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(tag)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow[LMP_LOCAL_MIN] = astart;
overflow[LMP_LOCAL_MAX] = aend - 1;
overflow[LMP_GHOST_MIN] = e_nall;
overflow[LMP_GHOST_MAX] = -1;
#endif
int nstencilp = 0;
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
for (int k = 0; k < nstencil; k++) {
binstart[nstencilp] = stencil[k];
int end = stencil[k] + 1;
for (int kk = k + 1; kk < nstencil; kk++) {
if (stencil[kk-1]+1 == stencil[kk]) {
end++;
k++;
} else break;
}
binend[nstencilp] = end;
nstencilp++;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(numneigh, overflow, nstencilp, binstart, binend)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
#endif
const int num = aend - astart;
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
ifrom += astart;
ito += astart;
int which;
const int list_size = (ito + tid * 2 + 2) * maxnbors;
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
const int obound = maxnbors * 3;
for (int i = ifrom; i < ito; i++) {
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const int itype = x[i].w;
const int ioffset = ntypes * itype;
// loop over all atoms in bins in stencil
// pairs for atoms j "below" i are excluded
// below = lower z or (equal z and lower y) or (equal zy and lower x)
// (equal zyx and j <= i)
// latter excludes self-self interaction but allows superposed atoms
const int ibin = atombin[i];
int raw_count = maxnbors;
for (int k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
for (int jj = bstart; jj < bend; jj++) {
const int j = binpacked[jj];
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
if (j < nlocal) {
if (i < offload_end) continue;
} else if (offload) continue;
}
#endif
if (x[j].z < ztmp) continue;
if (x[j].z == ztmp) {
if (x[j].y < ytmp) continue;
if (x[j].y == ytmp) {
if (x[j].x < xtmp) continue;
if (x[j].x == xtmp && j <= i) continue;
}
}
#ifndef _LMP_INTEL_OFFLOAD
if (exclude) {
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
}
#endif
neighptr[raw_count++] = j;
}
}
if (raw_count > obound)
*overflow = 1;
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#if __INTEL_COMPILER+0 > 1499
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
#else
#pragma vector aligned
#pragma simd
#endif
#endif
for (int u = maxnbors; u < raw_count; u++) {
int j = neighptr[u];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq > cutneighsq[ioffset + jtype])
neighptr[u] = e_nall;
else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[u] = -j - 1;
}
#ifdef _LMP_INTEL_OFFLOAD
if (j < nlocal) {
if (j < vlmin) vlmin = j;
if (j > vlmax) vlmax = j;
} else {
if (j < vgmin) vgmin = j;
if (j > vgmax) vgmax = j;
}
#endif
}
}
int n = 0, n2 = maxnbors;
for (int u = maxnbors; u < raw_count; u++) {
const int j = neighptr[u];
int pj = j;
if (pj < e_nall) {
if (need_ic)
if (pj < 0) pj = -pj - 1;
if (pj < nlocal)
neighptr[n++] = j;
else
neighptr[n2++] = j;
}
}
int ns = n;
for (int u = maxnbors; u < n2; u++)
neighptr[n++] = neighptr[u];
ilist[i] = i;
cnumneigh[i] = ct;
ns += n2 - maxnbors;
int edge = (ns % pad_width);
if (edge) {
const int pad_end = ns + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min=1, max=15, avg=8
#endif
for ( ; ns < pad_end; ns++)
neighptr[ns] = e_nall;
}
numneigh[i] = ns;
ct += ns;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
if (ct + obound > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid * 2) * maxnbors;
}
}
}
if (*overflow == 1)
for (int i = ifrom; i < ito; i++)
numneigh[i] = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
}
#pragma omp barrier
}
int ghost_offset = 0, nall_offset = e_nall;
if (separate_buffers) {
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
if (nghost < 0) nghost = 0;
if (offload) {
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
} else {
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
nall_offset = nlocal + nghost;
}
}
#endif
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
jlist[jj] = -j - 1;
} else
ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++)
if (jlist[jj] >= nlocal) break;
while (jj < jnum) {
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
else jlist[jj] -= ghost_offset;
jj++;
}
}
}
#endif
} // end omp
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload) {
fix->stop_watch(TIME_OFFLOAD_LATENCY);
#ifdef _LMP_INTEL_OFFLOAD
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
#endif
} else {
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
fix->stop_watch(TIME_HOST_NEIGHBOR);
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
fix->start_watch(TIME_PACK);
fix->set_neighbor_host_sizes();
buffers->pack_sep_from_single(fix->host_min_local(),
fix->host_used_local(),
fix->host_min_ghost(),
fix->host_used_ghost());
fix->stop_watch(TIME_PACK);
}
#endif
}
}
/* ----------------------------------------------------------------------
binned neighbor list construction for all neighbors
every neighbor pair appears in list of both atoms i and j
------------------------------------------------------------------------- */
void Neighbor::full_bin_intel(NeighList *list)
{
const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
list->inum = nlocal;
list->gnum = 0;
// Get fix for intel stuff
FixIntel *fix = static_cast<FixIntel *>(fix_intel);
const int off_end = fix->offload_end_neighbor();
int host_start = fix->host_start_neighbor();;
int offload_noghost = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (fix->full_host_list()) host_start = 0;
offload_noghost = fix->offload_noghost();
if (exclude)
error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
#endif
if (list->nstencil > INTEL_MAX_STENCIL_CHECK)
error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
int need_ic = 0;
if (atom->molecular)
dminimum_image_check(need_ic, cutneighmax, cutneighmax, cutneighmax);
if (need_ic) {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
fbi<float,double,1,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
fbi<float,double,1,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
fbi<float,double,0,1>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
fbi<float,double,0,1>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix);
}
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
fbi<double,double,1,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
fbi<double,double,1,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
fbi<double,double,0,1>(1, list, fix->get_double_buffers(),
0, off_end, fix);
fbi<double,double,0,1>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
}
} else {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
fbi<float,float,1,1>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
fbi<float,float,1,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
fbi<float,float,0,1>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
fbi<float,float,0,1>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
} else {
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
fbi<float,double,1,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
fbi<float,double,1,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
fbi<float,double,0,0>(1, list, fix->get_mixed_buffers(),
0, off_end, fix);
fbi<float,double,0,0>(0, list, fix->get_mixed_buffers(),
host_start, nlocal, fix);
}
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
fbi<double,double,1,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
fbi<double,double,1,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
fbi<double,double,0,0>(1, list, fix->get_double_buffers(),
0, off_end, fix);
fbi<double,double,0,0>(0, list, fix->get_double_buffers(),
host_start, nlocal, fix);
}
} else {
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
fbi<float,float,1,0>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
fbi<float,float,1,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix, off_end);
} else
#endif
{
fbi<float,float,0,0>(1, list, fix->get_single_buffers(), 0, off_end,
fix);
fbi<float,float,0,0>(0, list, fix->get_single_buffers(),
host_start, nlocal, fix);
}
}
}
}
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
const int astart, const int aend, void *fix_in,
const int offload_end) {
IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
FixIntel *fix = (FixIntel *)fix_in;
const int nall = atom->nlocal + atom->nghost;
int pad = 1;
const int pack_width = fix->nbor_pack_width();
if (offload) {
fix->start_watch(TIME_PACK);
buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend, pack_width);
ATOM_T biga;
biga.x = INTEL_BIGP;
biga.y = INTEL_BIGP;
biga.z = INTEL_BIGP;
biga.w = 1;
buffers->get_x()[nall]=biga;
const int nthreads = comm->nthreads;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T));
buffers->thr_pack(ifrom, ito, 0);
}
fix->stop_watch(TIME_PACK);
fix->start_watch(TIME_HOST_NEIGHBOR);
bin_atoms<flt_t,acc_t>(buffers->get_x(), buffers->get_atombin(),
buffers->get_binpacked());
} else {
fix->start_watch(TIME_HOST_NEIGHBOR);
}
const int pad_width = pad;
if (aend-astart == 0) {
fix->stop_watch(TIME_HOST_NEIGHBOR);
return;
}
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int nall_t = nall;
if (offload_noghost && offload) nall_t = atom->nlocal;
const int e_nall = nall_t;
const int molecular = atom->molecular;
int *ns = NULL;
tagint *s = NULL;
int tag_size = 0, special_size;
if (buffers->need_tag()) tag_size = e_nall;
if (molecular) {
s = atom->special[0];
ns = atom->nspecial[0];
special_size = aend;
} else {
s = &buffers->_special_holder;
ns = &buffers->_nspecial_holder;
special_size = 0;
}
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const tagint * _noalias const tag = atom->tag;
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
tagint * const molecule = atom->molecule;
#endif
int tnum;
int *overflow;
double *timer_compute;
if (offload) {
timer_compute = fix->off_watch_neighbor();
tnum = buffers->get_off_threads();
overflow = fix->get_off_overflow_flag();
fix->stop_watch(TIME_HOST_NEIGHBOR);
fix->start_watch(TIME_OFFLOAD_LATENCY);
} else {
tnum = comm->nthreads;
overflow = fix->get_overflow_flag();
}
const int nthreads = tnum;
const int maxnbors = buffers->get_max_nbors();
int * _noalias const atombin = buffers->get_atombin();
const int * _noalias const binpacked = buffers->get_binpacked();
const int xperiodic = domain->xperiodic;
const int yperiodic = domain->yperiodic;
const int zperiodic = domain->zperiodic;
const flt_t xprd_half = domain->xprd_half;
const flt_t yprd_half = domain->yprd_half;
const flt_t zprd_half = domain->zprd_half;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const flt_t dx = (INTEL_BIGP - bboxhi[0]);
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
error->one(FLERR,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
}
#ifdef _LMP_INTEL_OFFLOAD
const int * _noalias const binhead = this->binhead;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(atombin:length(aend) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(tag)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow[LMP_LOCAL_MIN] = astart;
overflow[LMP_LOCAL_MAX] = aend - 1;
overflow[LMP_GHOST_MIN] = e_nall;
overflow[LMP_GHOST_MAX] = -1;
#endif
int nstencilp = 0;
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
for (int k = 0; k < nstencil; k++) {
binstart[nstencilp] = stencil[k];
int end = stencil[k] + 1;
for (int kk = k + 1; kk < nstencil; kk++) {
if (stencil[kk-1]+1 == stencil[kk]) {
end++;
k++;
} else break;
}
binend[nstencilp] = end;
nstencilp++;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(numneigh, overflow, nstencilp, binstart, binend)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
#endif
const int num = aend - astart;
int tid, ifrom, ito;
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
ifrom += astart;
ito += astart;
int e_ito = ito;
if (ito == num) {
int imod = ito % pack_width;
if (imod) e_ito += pack_width - imod;
}
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
int which;
int pack_offset = maxnbors * pack_width;
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
const int obound = pack_offset + maxnbors * 2;
int max_chunk = 0;
int lane = 0;
for (int i = ifrom; i < ito; i++) {
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const int itype = x[i].w;
const tagint itag = tag[i];
const int ioffset = ntypes * itype;
const int ibin = atombin[i];
int raw_count = pack_offset;
// loop over all atoms in surrounding bins in stencil including self
// skip i = j
if (exclude) {
for (int k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#ifndef _LMP_INTEL_OFFLOAD
#ifdef INTEL_VMASK
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
int j = binpacked[jj];
if (i == j) j=e_nall;
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
if (j < nlocal) {
if (i < offload_end) continue;
} else if (offload) continue;
}
#endif
#ifndef _LMP_INTEL_OFFLOAD
const int jtype = x[j].w;
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
#endif
neighptr[raw_count++] = j;
}
}
} else {
for (int k = 0; k < nstencilp; k++) {
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#ifndef _LMP_INTEL_OFFLOAD
#ifdef INTEL_VMASK
#pragma simd
#endif
#endif
for (int jj = bstart; jj < bend; jj++) {
int j = binpacked[jj];
if (i == j) j=e_nall;
#ifdef _LMP_INTEL_OFFLOAD
if (offload_noghost) {
if (j < nlocal) {
if (i < offload_end) continue;
} else if (offload) continue;
}
#endif
neighptr[raw_count++] = j;
}
}
}
if (raw_count > obound) *overflow = 1;
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#if __INTEL_COMPILER+0 > 1499
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
#else
#pragma vector aligned
#pragma simd
#endif
#endif
for (int u = pack_offset; u < raw_count; u++) {
int j = neighptr[u];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
const int jtype = x[j].w;
const flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq > cutneighsq[ioffset + jtype])
neighptr[u] = e_nall;
else {
if (need_ic) {
int no_special;
ominimum_image_check(no_special, delx, dely, delz);
if (no_special)
neighptr[u] = -j - 1;
}
#ifdef _LMP_INTEL_OFFLOAD
if (j < nlocal) {
if (j < vlmin) vlmin = j;
if (j > vlmax) vlmax = j;
} else {
if (j < vgmin) vgmin = j;
if (j > vgmax) vgmax = j;
}
#endif
}
}
#ifdef _LMP_INTEL_OFFLOAD
lmin = MIN(lmin,vlmin);
gmin = MIN(gmin,vgmin);
lmax = MAX(lmax,vlmax);
gmax = MAX(gmax,vgmax);
#endif
int n = lane, n2 = pack_offset;
for (int u = pack_offset; u < raw_count; u++) {
const int j = neighptr[u];
int pj = j;
if (pj < e_nall) {
if (need_ic)
if (pj < 0) pj = -pj - 1;
const int jtag = tag[pj];
int flist = 0;
if (itag > jtag) {
if ((itag+jtag) % 2 == 0) flist = 1;
} else if (itag < jtag) {
if ((itag+jtag) % 2 == 1) flist = 1;
} else {
if (x[pj].z < ztmp) flist = 1;
else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
flist = 1;
}
if (flist) {
neighptr[n2++] = j;
} else {
neighptr[n] = j;
n += pack_width;
}
}
}
int ns = (n - lane) / pack_width;
atombin[i] = ns;
for (int u = pack_offset; u < n2; u++) {
neighptr[n] = neighptr[u];
n += pack_width;
}
ilist[i] = i;
cnumneigh[i] = ct + lane;
ns += n2 - pack_offset;
numneigh[i] = ns;
if (ns > max_chunk) max_chunk = ns;
lane++;
if (lane == pack_width) {
ct += max_chunk * pack_width;
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
const int edge = (ct % alignb);
if (edge) ct += alignb - edge;
neighptr = firstneigh + ct;
max_chunk = 0;
pack_offset = maxnbors * pack_width;
lane = 0;
if (ct + obound > list_size) {
if (i < ito - 1) {
*overflow = 1;
ct = (ifrom + tid * 2) * maxnbors;
}
}
}
}
if (*overflow == 1)
for (int i = ifrom; i < ito; i++)
numneigh[i] = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
}
#pragma omp barrier
}
int ghost_offset = 0, nall_offset = e_nall;
if (separate_buffers) {
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
if (nghost < 0) nghost = 0;
if (offload) {
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
} else {
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
nall_offset = nlocal + nghost;
}
}
#endif
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
const int trip = jnum * pack_width;
for (int jj = 0; jj < trip; jj+=pack_width) {
const int j = jlist[jj];
if (need_ic && j < 0) {
which = 0;
jlist[jj] = -j - 1;
} else
ofind_special(which, special, nspecial, i, tag[j]);
#ifdef _LMP_INTEL_OFFLOAD
if (j >= nlocal) {
if (j == e_nall)
jlist[jj] = nall_offset;
else if (which)
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
else jlist[jj]-=ghost_offset;
} else
#endif
if (which) jlist[jj] = j ^ (which << SBBITS);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++) {
if (jlist[jj] >= nlocal) {
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
else jlist[jj] -= ghost_offset;
}
}
}
}
#endif
} // end omp
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload) {
fix->stop_watch(TIME_OFFLOAD_LATENCY);
#ifdef _LMP_INTEL_OFFLOAD
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
#endif
} else {
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
fix->stop_watch(TIME_HOST_NEIGHBOR);
#ifdef _LMP_INTEL_OFFLOAD
if (separate_buffers) {
fix->start_watch(TIME_PACK);
fix->set_neighbor_host_sizes();
buffers->pack_sep_from_single(fix->host_min_local(),
fix->host_used_local(),
fix->host_min_ghost(),
fix->host_used_ghost());
fix->stop_watch(TIME_PACK);
}
#endif
}
}
Event Timeline
Log In to Comment