Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F69795494
neigh_half_bin_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Jul 3, 13:24
Size
48 KB
Mime Type
text/x-c++
Expires
Fri, Jul 5, 13:24 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
18740982
Attached To
rLAMMPS lammps
neigh_half_bin_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "comm.h"
#include "group.h"
#include "fix_intel.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
using
namespace
LAMMPS_NS
;
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
template
<
class
flt_t
>
inline
int
mcoord2bin
(
const
flt_t
x0
,
const
flt_t
x1
,
const
flt_t
x2
,
const
flt_t
bboxlo0
,
const
flt_t
bboxlo1
,
const
flt_t
bboxlo2
,
const
flt_t
bboxhi0
,
const
flt_t
bboxhi1
,
const
flt_t
bboxhi2
,
const
flt_t
bininvx
,
const
flt_t
bininvy
,
const
flt_t
bininvz
,
const
int
nbinx
,
const
int
nbiny
,
const
int
nbinz
,
const
int
mbinx
,
const
int
mbiny
,
const
int
mbinz
,
const
int
mbinxlo
,
const
int
mbinylo
,
const
int
mbinzlo
)
{
int
ix
,
iy
,
iz
;
if
(
x0
>=
bboxhi0
)
ix
=
static_cast
<
int
>
((
x0
-
bboxhi0
)
*
bininvx
)
+
nbinx
;
else
if
(
x0
>=
bboxlo0
)
{
ix
=
static_cast
<
int
>
((
x0
-
bboxlo0
)
*
bininvx
);
ix
=
MIN
(
ix
,
nbinx
-
1
);
}
else
ix
=
static_cast
<
int
>
((
x0
-
bboxlo0
)
*
bininvx
)
-
1
;
if
(
x1
>=
bboxhi1
)
iy
=
static_cast
<
int
>
((
x1
-
bboxhi1
)
*
bininvy
)
+
nbiny
;
else
if
(
x1
>=
bboxlo1
)
{
iy
=
static_cast
<
int
>
((
x1
-
bboxlo1
)
*
bininvy
);
iy
=
MIN
(
iy
,
nbiny
-
1
);
}
else
iy
=
static_cast
<
int
>
((
x1
-
bboxlo1
)
*
bininvy
)
-
1
;
if
(
x2
>=
bboxhi2
)
iz
=
static_cast
<
int
>
((
x2
-
bboxhi2
)
*
bininvz
)
+
nbinz
;
else
if
(
x2
>=
bboxlo2
)
{
iz
=
static_cast
<
int
>
((
x2
-
bboxlo2
)
*
bininvz
);
iz
=
MIN
(
iz
,
nbinz
-
1
);
}
else
iz
=
static_cast
<
int
>
((
x2
-
bboxlo2
)
*
bininvz
)
-
1
;
return
(
iz
-
mbinzlo
)
*
mbiny
*
mbinx
+
(
iy
-
mbinylo
)
*
mbinx
+
(
ix
-
mbinxlo
);
}
#define ofind_special(which, special, nspecial, i, tag, special_flag) \
{ \
which = 0; \
const int n1 = nspecial[i * 3]; \
const int n2 = nspecial[i * 3 + 1]; \
const int n3 = nspecial[i * 3 + 2]; \
const int *sptr = special + i * maxspecial; \
for (int s = 0; s < n3; s++) { \
if (sptr[s] == tag) { \
if (s < n1) { \
if (special_flag[1] == 0) which = -1; \
else if (special_flag[1] == 1) which = 0; \
else which = 1; \
} else if (s < n2) { \
if (special_flag[2] == 0) which = -1; \
else if (special_flag[2] == 1) which = 0; \
else which = 2; \
} else { \
if (special_flag[3] == 0) which = -1; \
else if (special_flag[3] == 1) which = 0; \
else which = 3; \
} \
} \
} \
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
template
<
class
flt_t
,
class
acc_t
>
void
Neighbor
::
bin_atoms
(
void
*
xin
)
{
const
ATOM_T
*
restrict
const
x
=
(
const
ATOM_T
*
restrict
const
)
xin
;
int
nlocal
=
atom
->
nlocal
;
const
int
nall
=
nlocal
+
atom
->
nghost
;
const
flt_t
bininvx
=
this
->
bininvx
;
const
flt_t
bininvy
=
this
->
bininvy
;
const
flt_t
bininvz
=
this
->
bininvz
;
const
flt_t
bboxlo0
=
this
->
bboxlo
[
0
];
const
flt_t
bboxlo1
=
this
->
bboxlo
[
1
];
const
flt_t
bboxlo2
=
this
->
bboxlo
[
2
];
const
flt_t
bboxhi0
=
this
->
bboxhi
[
0
];
const
flt_t
bboxhi1
=
this
->
bboxhi
[
1
];
const
flt_t
bboxhi2
=
this
->
bboxhi
[
2
];
int
i
,
ibin
;
for
(
i
=
0
;
i
<
mbins
;
i
++
)
binhead
[
i
]
=
-
1
;
int
*
mask
=
atom
->
mask
;
if
(
includegroup
)
{
int
bitmask
=
group
->
bitmask
[
includegroup
];
for
(
i
=
nall
-
1
;
i
>=
nlocal
;
i
--
)
{
if
(
mask
[
i
]
&
bitmask
)
{
ibin
=
mcoord2bin
(
x
[
i
].
x
,
x
[
i
].
y
,
x
[
i
].
z
,
bboxlo0
,
bboxlo1
,
bboxlo2
,
bboxhi0
,
bboxhi1
,
bboxhi2
,
bininvx
,
bininvy
,
bininvz
,
nbinx
,
nbiny
,
nbinz
,
mbinx
,
mbiny
,
mbinz
,
mbinxlo
,
mbinylo
,
mbinzlo
);
bins
[
i
]
=
binhead
[
ibin
];
binhead
[
ibin
]
=
i
;
}
}
for
(
i
=
atom
->
nfirst
-
1
;
i
>=
0
;
i
--
)
{
ibin
=
mcoord2bin
(
x
[
i
].
x
,
x
[
i
].
y
,
x
[
i
].
z
,
bboxlo0
,
bboxlo1
,
bboxlo2
,
bboxhi0
,
bboxhi1
,
bboxhi2
,
bininvx
,
bininvy
,
bininvz
,
nbinx
,
nbiny
,
nbinz
,
mbinx
,
mbiny
,
mbinz
,
mbinxlo
,
mbinylo
,
mbinzlo
);
bins
[
i
]
=
binhead
[
ibin
];
binhead
[
ibin
]
=
i
;
}
}
else
{
for
(
i
=
nall
-
1
;
i
>=
0
;
i
--
)
{
ibin
=
mcoord2bin
(
x
[
i
].
x
,
x
[
i
].
y
,
x
[
i
].
z
,
bboxlo0
,
bboxlo1
,
bboxlo2
,
bboxhi0
,
bboxhi1
,
bboxhi2
,
bininvx
,
bininvy
,
bininvz
,
nbinx
,
nbiny
,
nbinz
,
mbinx
,
mbiny
,
mbinz
,
mbinxlo
,
mbinylo
,
mbinzlo
);
bins
[
i
]
=
binhead
[
ibin
];
binhead
[
ibin
]
=
i
;
}
}
}
/* ----------------------------------------------------------------------
binned neighbor list construction with partial Newton's 3rd law
each owned atom i checks own bin and other bins in stencil
pair stored once if i,j are both owned and i < j
pair stored by me if j is ghost (also stored by proc owning j)
------------------------------------------------------------------------- */
void
Neighbor
::
half_bin_no_newton_intel
(
NeighList
*
list
)
{
const
int
nlocal
=
(
includegroup
)
?
atom
->
nfirst
:
atom
->
nlocal
;
list
->
inum
=
nlocal
;
// Get fix for intel stuff
FixIntel
*
fix
=
static_cast
<
FixIntel
*>
(
fix_intel
);
const
int
off_end
=
fix
->
offload_end_neighbor
();
int
host_start
=
off_end
;;
#ifdef _LMP_INTEL_OFFLOAD
if
(
fix
->
full_host_list
())
host_start
=
0
;
if
(
exclude
)
error
->
all
(
FLERR
,
"Exclusion lists not yet supported for Intel offload"
);
#endif
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_MIXED
)
{
hbnni
<
float
,
double
>
(
1
,
list
,
fix
->
get_mixed_buffers
(),
0
,
off_end
,
fix
);
hbnni
<
float
,
double
>
(
0
,
list
,
fix
->
get_mixed_buffers
(),
host_start
,
nlocal
,
fix
);
}
else
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_DOUBLE
)
{
hbnni
<
double
,
double
>
(
1
,
list
,
fix
->
get_double_buffers
(),
0
,
off_end
,
fix
);
hbnni
<
double
,
double
>
(
0
,
list
,
fix
->
get_double_buffers
(),
host_start
,
nlocal
,
fix
);
}
else
{
hbnni
<
float
,
float
>
(
1
,
list
,
fix
->
get_single_buffers
(),
0
,
off_end
,
fix
);
hbnni
<
float
,
float
>
(
0
,
list
,
fix
->
get_single_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
template
<
class
flt_t
,
class
acc_t
>
void
Neighbor
::
hbnni
(
const
int
offload
,
NeighList
*
list
,
void
*
buffers_in
,
const
int
astart
,
const
int
aend
,
void
*
fix_in
)
{
IntelBuffers
<
flt_t
,
acc_t
>
*
buffers
=
(
IntelBuffers
<
flt_t
,
acc_t
>
*
)
buffers_in
;
FixIntel
*
fix
=
(
FixIntel
*
)
fix_in
;
const
int
nall
=
atom
->
nlocal
+
atom
->
nghost
;
int
pad
=
1
;
if
(
offload
)
{
fix
->
start_watch
(
TIME_PACK
);
buffers
->
grow
(
nall
,
atom
->
nlocal
,
comm
->
nthreads
,
aend
);
buffers
->
grow_nbor
(
list
,
atom
->
nlocal
,
aend
);
ATOM_T
biga
;
biga
.
x
=
INTEL_BIGP
;
biga
.
y
=
INTEL_BIGP
;
biga
.
z
=
INTEL_BIGP
;
biga
.
w
=
1
;
buffers
->
get_x
()[
nall
]
=
biga
;
const
int
nthreads
=
comm
->
nthreads
;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int
ifrom
,
ito
,
tid
;
IP_PRE_omp_range_id_align
(
ifrom
,
ito
,
tid
,
nall
,
nthreads
,
sizeof
(
ATOM_T
));
buffers
->
thr_pack
(
ifrom
,
ito
,
0
);
}
fix
->
stop_watch
(
TIME_PACK
);
fix
->
start_watch
(
TIME_HOST_NEIGHBOR
);
bin_atoms
<
flt_t
,
acc_t
>
(
buffers
->
get_x
());
if
(
INTEL_MIC_NBOR_PAD
>
1
)
pad
=
INTEL_MIC_NBOR_PAD
*
sizeof
(
float
)
/
sizeof
(
flt_t
);
}
else
{
fix
->
start_watch
(
TIME_HOST_NEIGHBOR
);
if
(
INTEL_NBOR_PAD
>
1
)
pad
=
INTEL_NBOR_PAD
*
sizeof
(
float
)
/
sizeof
(
flt_t
);
}
const
int
pad_width
=
pad
;
if
(
aend
-
astart
==
0
)
{
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
return
;
}
const
ATOM_T
*
restrict
const
x
=
buffers
->
get_x
();
int
*
restrict
const
firstneigh
=
buffers
->
firstneigh
(
list
);
const
int
molecular
=
atom
->
molecular
;
int
*
ns
=
NULL
,
*
s
=
NULL
;
int
tag_size
,
special_size
;
if
(
molecular
)
{
s
=
atom
->
special
[
0
];
ns
=
atom
->
nspecial
[
0
];
tag_size
=
nall
;
special_size
=
aend
;
}
else
{
s
=
&
buffers
->
_special_holder
;
ns
=
&
buffers
->
_nspecial_holder
;
tag_size
=
0
;
special_size
=
0
;
}
const
int
*
restrict
const
special
=
s
;
const
int
*
restrict
const
nspecial
=
ns
;
const
int
maxspecial
=
atom
->
maxspecial
;
const
int
*
restrict
const
tag
=
atom
->
tag
;
int
*
restrict
const
ilist
=
list
->
ilist
;
int
*
restrict
numneigh
=
list
->
numneigh
;
int
*
restrict
const
cnumneigh
=
buffers
->
cnumneigh
(
list
);
const
int
nstencil
=
list
->
nstencil
;
const
int
*
restrict
const
stencil
=
list
->
stencil
;
const
flt_t
*
restrict
const
cutneighsq
=
buffers
->
get_cutneighsq
()[
0
];
const
int
ntypes
=
atom
->
ntypes
+
1
;
const
int
nlocal
=
atom
->
nlocal
;
#ifndef _LMP_INTEL_OFFLOAD
int
*
const
mask
=
atom
->
mask
;
int
*
const
molecule
=
atom
->
molecule
;
#endif
int
tnum
;
int
*
overflow
;
double
*
timer_compute
;
if
(
offload
)
{
timer_compute
=
fix
->
off_watch_neighbor
();
tnum
=
buffers
->
get_off_threads
();
overflow
=
fix
->
get_off_overflow_flag
();
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
fix
->
start_watch
(
TIME_OFFLOAD_LATENCY
);
}
else
{
tnum
=
comm
->
nthreads
;
overflow
=
fix
->
get_overflow_flag
();
}
const
int
nthreads
=
tnum
;
const
int
maxnbors
=
buffers
->
get_max_nbors
();
const
flt_t
bboxlo0
=
this
->
bboxlo
[
0
];
const
flt_t
bboxlo1
=
this
->
bboxlo
[
1
];
const
flt_t
bboxlo2
=
this
->
bboxlo
[
2
];
const
flt_t
bboxhi0
=
this
->
bboxhi
[
0
];
const
flt_t
bboxhi1
=
this
->
bboxhi
[
1
];
const
flt_t
bboxhi2
=
this
->
bboxhi
[
2
];
const
flt_t
bininvx
=
this
->
bininvx
;
const
flt_t
bininvy
=
this
->
bininvy
;
const
flt_t
bininvz
=
this
->
bininvz
;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const
flt_t
dx
=
(
INTEL_BIGP
-
bboxhi0
);
const
flt_t
dy
=
(
INTEL_BIGP
-
bboxhi1
);
const
flt_t
dz
=
(
INTEL_BIGP
-
bboxhi2
);
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
<
static_cast
<
flt_t
>
(
cutneighmaxsq
))
error
->
one
(
FLERR
,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."
);
}
#ifdef _LMP_INTEL_OFFLOAD
const
int
*
restrict
const
binhead
=
this
->
binhead
;
const
int
*
restrict
const
special_flag
=
this
->
special_flag
;
const
int
nbinx
=
this
->
nbinx
;
const
int
nbiny
=
this
->
nbiny
;
const
int
nbinz
=
this
->
nbinz
;
const
int
mbinxlo
=
this
->
mbinxlo
;
const
int
mbinylo
=
this
->
mbinylo
;
const
int
mbinzlo
=
this
->
mbinzlo
;
const
int
mbinx
=
this
->
mbinx
;
const
int
mbiny
=
this
->
mbiny
;
const
int
mbinz
=
this
->
mbinz
;
const
int
*
restrict
const
bins
=
this
->
bins
;
const
int
cop
=
fix
->
coprocessor_number
();
const
int
separate_buffers
=
fix
->
separate_buffers
();
#pragma offload target(mic:cop) if(offload) \
in(x:length(nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(special_flag:length(0) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz) \
in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload) \
in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(numneigh)
#endif
{
#ifdef __MIC__
*
timer_compute
=
MIC_Wtime
();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow
[
LMP_LOCAL_MIN
]
=
astart
;
overflow
[
LMP_LOCAL_MAX
]
=
aend
-
1
;
overflow
[
LMP_GHOST_MIN
]
=
nall
;
overflow
[
LMP_GHOST_MAX
]
=
-
1
;
#endif
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(numneigh,overflow)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int
lmin
=
nall
,
lmax
=
-
1
,
gmin
=
nall
,
gmax
=
-
1
;
#endif
const
int
num
=
aend
-
astart
;
int
tid
,
ifrom
,
ito
;
IP_PRE_omp_range_id
(
ifrom
,
ito
,
tid
,
num
,
nthreads
);
ifrom
+=
astart
;
ito
+=
astart
;
int
which
;
const
int
list_size
=
(
ito
+
tid
+
1
)
*
maxnbors
;
int
ct
=
(
ifrom
+
tid
)
*
maxnbors
;
int
*
neighptr
=
firstneigh
+
ct
;
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
int
j
,
k
,
n
,
n2
,
itype
,
jtype
,
ibin
;
double
xtmp
,
ytmp
,
ztmp
,
delx
,
dely
,
delz
,
rsq
;
n
=
0
;
n2
=
maxnbors
;
xtmp
=
x
[
i
].
x
;
ytmp
=
x
[
i
].
y
;
ztmp
=
x
[
i
].
z
;
itype
=
x
[
i
].
w
;
const
int
ioffset
=
ntypes
*
itype
;
// loop over all atoms in other bins in stencil including self
// only store pair if i < j
// stores own/own pairs only once
// stores own/ghost pairs on both procs
ibin
=
mcoord2bin
(
x
[
i
].
x
,
x
[
i
].
y
,
x
[
i
].
z
,
bboxlo0
,
bboxlo1
,
bboxlo2
,
bboxhi0
,
bboxhi1
,
bboxhi2
,
bininvx
,
bininvy
,
bininvz
,
nbinx
,
nbiny
,
nbinz
,
mbinx
,
mbiny
,
mbinz
,
mbinxlo
,
mbinylo
,
mbinzlo
);
for
(
k
=
0
;
k
<
nstencil
;
k
++
)
{
for
(
j
=
binhead
[
ibin
+
stencil
[
k
]];
j
>=
0
;
j
=
bins
[
j
])
{
if
(
j
<=
i
)
continue
;
jtype
=
x
[
j
].
w
;
#ifndef _LMP_INTEL_OFFLOAD
if
(
exclude
&&
exclusion
(
i
,
j
,
itype
,
jtype
,
mask
,
molecule
))
continue
;
#endif
delx
=
xtmp
-
x
[
j
].
x
;
dely
=
ytmp
-
x
[
j
].
y
;
delz
=
ztmp
-
x
[
j
].
z
;
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
if
(
rsq
<=
cutneighsq
[
ioffset
+
jtype
])
{
if
(
j
<
nlocal
)
{
neighptr
[
n
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
lmin
)
lmin
=
j
;
if
(
j
>
lmax
)
lmax
=
j
;
#endif
}
else
{
neighptr
[
n2
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
gmin
)
gmin
=
j
;
if
(
j
>
gmax
)
gmax
=
j
;
#endif
}
}
}
}
ilist
[
i
]
=
i
;
cnumneigh
[
i
]
=
ct
;
if
(
n
>
maxnbors
)
*
overflow
=
1
;
for
(
k
=
maxnbors
;
k
<
n2
;
k
++
)
neighptr
[
n
++
]
=
neighptr
[
k
];
while
(
(
n
%
pad_width
)
!=
0
)
neighptr
[
n
++
]
=
nall
;
numneigh
[
i
]
=
n
;
while
((
n
%
(
INTEL_DATA_ALIGN
/
sizeof
(
int
)))
!=
0
)
n
++
;
ct
+=
n
;
neighptr
+=
n
;
if
(
ct
+
n
+
maxnbors
>
list_size
)
{
*
overflow
=
1
;
ct
=
(
ifrom
+
tid
)
*
maxnbors
;
}
}
if
(
*
overflow
==
1
)
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
numneigh
[
i
]
=
0
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
separate_buffers
)
{
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if
(
lmin
<
overflow
[
LMP_LOCAL_MIN
])
overflow
[
LMP_LOCAL_MIN
]
=
lmin
;
if
(
lmax
>
overflow
[
LMP_LOCAL_MAX
])
overflow
[
LMP_LOCAL_MAX
]
=
lmax
;
if
(
gmin
<
overflow
[
LMP_GHOST_MIN
])
overflow
[
LMP_GHOST_MIN
]
=
gmin
;
if
(
gmax
>
overflow
[
LMP_GHOST_MAX
])
overflow
[
LMP_GHOST_MAX
]
=
gmax
;
}
#pragma omp barrier
}
int
ghost_offset
=
0
,
nall_offset
=
nall
;
if
(
separate_buffers
)
{
int
nghost
=
overflow
[
LMP_GHOST_MAX
]
+
1
-
overflow
[
LMP_GHOST_MIN
];
if
(
nghost
<
0
)
nghost
=
0
;
if
(
offload
)
{
ghost_offset
=
overflow
[
LMP_GHOST_MIN
]
-
overflow
[
LMP_LOCAL_MAX
]
-
1
;
nall_offset
=
overflow
[
LMP_LOCAL_MAX
]
+
1
+
nghost
;
}
else
{
ghost_offset
=
overflow
[
LMP_GHOST_MIN
]
-
nlocal
;
nall_offset
=
nlocal
+
nghost
;
}
}
#endif
if
(
molecular
)
{
for
(
int
i
=
ifrom
;
i
<
ito
;
++
i
)
{
int
*
restrict
jlist
=
firstneigh
+
cnumneigh
[
i
];
const
int
jnum
=
numneigh
[
i
];
for
(
int
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
const
int
j
=
jlist
[
jj
];
ofind_special
(
which
,
special
,
nspecial
,
i
,
tag
[
j
],
special_flag
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
>=
nlocal
)
{
if
(
j
==
nall
)
jlist
[
jj
]
=
nall_offset
;
else
if
(
which
>
0
)
jlist
[
jj
]
=
(
j
-
ghost_offset
)
^
(
which
<<
SBBITS
);
else
jlist
[
jj
]
-=
ghost_offset
;
}
else
#endif
if
(
which
>
0
)
jlist
[
jj
]
=
j
^
(
which
<<
SBBITS
);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else
if
(
separate_buffers
)
{
for
(
int
i
=
ifrom
;
i
<
ito
;
++
i
)
{
int
*
restrict
jlist
=
firstneigh
+
cnumneigh
[
i
];
const
int
jnum
=
numneigh
[
i
];
int
jj
=
0
;
for
(
jj
=
0
;
jj
<
jnum
;
jj
++
)
if
(
jlist
[
jj
]
>=
nlocal
)
break
;
while
(
jj
<
jnum
)
{
if
(
jlist
[
jj
]
==
nall
)
jlist
[
jj
]
=
nall_offset
;
else
jlist
[
jj
]
-=
ghost_offset
;
jj
++
;
}
}
}
#endif
}
// end omp
#ifdef __MIC__
*
timer_compute
=
MIC_Wtime
()
-
*
timer_compute
;
#endif
}
// end offload
if
(
offload
)
{
fix
->
stop_watch
(
TIME_OFFLOAD_LATENCY
);
#ifdef _LMP_INTEL_OFFLOAD
for
(
int
n
=
0
;
n
<
aend
;
n
++
)
{
ilist
[
n
]
=
n
;
numneigh
[
n
]
=
0
;
}
#endif
}
else
{
for
(
int
i
=
astart
;
i
<
aend
;
i
++
)
list
->
firstneigh
[
i
]
=
firstneigh
+
cnumneigh
[
i
];
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
separate_buffers
)
{
fix
->
start_watch
(
TIME_PACK
);
fix
->
set_neighbor_host_sizes
();
buffers
->
pack_sep_from_single
(
fix
->
host_min_local
(),
fix
->
host_used_local
(),
fix
->
host_min_ghost
(),
fix
->
host_used_ghost
());
fix
->
stop_watch
(
TIME_PACK
);
}
#endif
}
}
/* ----------------------------------------------------------------------
binned neighbor list construction with full Newton's 3rd law
each owned atom i checks its own bin and other bins in Newton stencil
every pair stored exactly once by some processor
------------------------------------------------------------------------- */
void
Neighbor
::
half_bin_newton_intel
(
NeighList
*
list
)
{
const
int
nlocal
=
(
includegroup
)
?
atom
->
nfirst
:
atom
->
nlocal
;
list
->
inum
=
nlocal
;
// Get fix for intel stuff
FixIntel
*
fix
=
static_cast
<
FixIntel
*>
(
fix_intel
);
const
int
off_end
=
fix
->
offload_end_neighbor
();
int
host_start
=
fix
->
host_start_neighbor
();;
int
offload_noghost
=
0
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
fix
->
full_host_list
())
host_start
=
0
;
offload_noghost
=
fix
->
offload_noghost
();
if
(
exclude
)
error
->
all
(
FLERR
,
"Exclusion lists not yet supported for Intel offload"
);
#endif
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_MIXED
)
{
if
(
offload_noghost
)
{
hbni
<
float
,
double
,
1
>
(
1
,
list
,
fix
->
get_mixed_buffers
(),
0
,
off_end
,
fix
);
hbni
<
float
,
double
,
1
>
(
0
,
list
,
fix
->
get_mixed_buffers
(),
host_start
,
nlocal
,
fix
,
off_end
);
}
else
{
hbni
<
float
,
double
,
0
>
(
1
,
list
,
fix
->
get_mixed_buffers
(),
0
,
off_end
,
fix
);
hbni
<
float
,
double
,
0
>
(
0
,
list
,
fix
->
get_mixed_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
else
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_DOUBLE
)
{
if
(
offload_noghost
)
{
hbni
<
double
,
double
,
1
>
(
1
,
list
,
fix
->
get_double_buffers
(),
0
,
off_end
,
fix
);
hbni
<
double
,
double
,
1
>
(
0
,
list
,
fix
->
get_double_buffers
(),
host_start
,
nlocal
,
fix
,
off_end
);
}
else
{
hbni
<
double
,
double
,
0
>
(
1
,
list
,
fix
->
get_double_buffers
(),
0
,
off_end
,
fix
);
hbni
<
double
,
double
,
0
>
(
0
,
list
,
fix
->
get_double_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
else
{
if
(
offload_noghost
)
{
hbni
<
float
,
float
,
1
>
(
1
,
list
,
fix
->
get_single_buffers
(),
0
,
off_end
,
fix
);
hbni
<
float
,
float
,
1
>
(
0
,
list
,
fix
->
get_single_buffers
(),
host_start
,
nlocal
,
fix
,
off_end
);
}
else
{
hbni
<
float
,
float
,
0
>
(
1
,
list
,
fix
->
get_single_buffers
(),
0
,
off_end
,
fix
);
hbni
<
float
,
float
,
0
>
(
0
,
list
,
fix
->
get_single_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
}
template
<
class
flt_t
,
class
acc_t
,
int
offload_noghost
>
void
Neighbor
::
hbni
(
const
int
offload
,
NeighList
*
list
,
void
*
buffers_in
,
const
int
astart
,
const
int
aend
,
void
*
fix_in
,
const
int
offload_end
)
{
IntelBuffers
<
flt_t
,
acc_t
>
*
buffers
=
(
IntelBuffers
<
flt_t
,
acc_t
>
*
)
buffers_in
;
FixIntel
*
fix
=
(
FixIntel
*
)
fix_in
;
const
int
nall
=
atom
->
nlocal
+
atom
->
nghost
;
int
pad
=
1
;
if
(
offload
)
{
fix
->
start_watch
(
TIME_PACK
);
buffers
->
grow
(
nall
,
atom
->
nlocal
,
comm
->
nthreads
,
aend
);
buffers
->
grow_nbor
(
list
,
atom
->
nlocal
,
aend
);
ATOM_T
biga
;
biga
.
x
=
INTEL_BIGP
;
biga
.
y
=
INTEL_BIGP
;
biga
.
z
=
INTEL_BIGP
;
biga
.
w
=
1
;
buffers
->
get_x
()[
nall
]
=
biga
;
const
int
nthreads
=
comm
->
nthreads
;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int
ifrom
,
ito
,
tid
;
IP_PRE_omp_range_id_align
(
ifrom
,
ito
,
tid
,
nall
,
nthreads
,
sizeof
(
ATOM_T
));
buffers
->
thr_pack
(
ifrom
,
ito
,
0
);
}
fix
->
stop_watch
(
TIME_PACK
);
fix
->
start_watch
(
TIME_HOST_NEIGHBOR
);
bin_atoms
<
flt_t
,
acc_t
>
(
buffers
->
get_x
());
if
(
INTEL_MIC_NBOR_PAD
>
1
)
pad
=
INTEL_MIC_NBOR_PAD
*
sizeof
(
float
)
/
sizeof
(
flt_t
);
}
else
{
fix
->
start_watch
(
TIME_HOST_NEIGHBOR
);
if
(
INTEL_NBOR_PAD
>
1
)
pad
=
INTEL_NBOR_PAD
*
sizeof
(
float
)
/
sizeof
(
flt_t
);
}
const
int
pad_width
=
pad
;
if
(
aend
-
astart
==
0
)
{
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
return
;
}
const
ATOM_T
*
restrict
const
x
=
buffers
->
get_x
();
int
*
restrict
const
firstneigh
=
buffers
->
firstneigh
(
list
);
int
nall_t
=
nall
;
if
(
offload_noghost
&&
offload
)
nall_t
=
atom
->
nlocal
;
const
int
e_nall
=
nall_t
;
const
int
molecular
=
atom
->
molecular
;
int
*
ns
=
NULL
,
*
s
=
NULL
;
int
tag_size
,
special_size
;
if
(
molecular
)
{
s
=
atom
->
special
[
0
];
ns
=
atom
->
nspecial
[
0
];
tag_size
=
e_nall
;
special_size
=
aend
;
}
else
{
s
=
&
buffers
->
_special_holder
;
ns
=
&
buffers
->
_nspecial_holder
;
tag_size
=
0
;
special_size
=
0
;
}
const
int
*
restrict
const
special
=
s
;
const
int
*
restrict
const
nspecial
=
ns
;
const
int
maxspecial
=
atom
->
maxspecial
;
const
int
*
restrict
const
tag
=
atom
->
tag
;
int
*
restrict
const
ilist
=
list
->
ilist
;
int
*
restrict
numneigh
=
list
->
numneigh
;
int
*
restrict
const
cnumneigh
=
buffers
->
cnumneigh
(
list
);
const
int
nstencil
=
list
->
nstencil
;
const
int
*
restrict
const
stencil
=
list
->
stencil
;
const
flt_t
*
restrict
const
cutneighsq
=
buffers
->
get_cutneighsq
()[
0
];
const
int
ntypes
=
atom
->
ntypes
+
1
;
const
int
nlocal
=
atom
->
nlocal
;
#ifndef _LMP_INTEL_OFFLOAD
int
*
const
mask
=
atom
->
mask
;
int
*
const
molecule
=
atom
->
molecule
;
#endif
int
tnum
;
int
*
overflow
;
double
*
timer_compute
;
if
(
offload
)
{
timer_compute
=
fix
->
off_watch_neighbor
();
tnum
=
buffers
->
get_off_threads
();
overflow
=
fix
->
get_off_overflow_flag
();
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
fix
->
start_watch
(
TIME_OFFLOAD_LATENCY
);
}
else
{
tnum
=
comm
->
nthreads
;
overflow
=
fix
->
get_overflow_flag
();
}
const
int
nthreads
=
tnum
;
const
int
maxnbors
=
buffers
->
get_max_nbors
();
const
flt_t
bboxlo0
=
this
->
bboxlo
[
0
];
const
flt_t
bboxlo1
=
this
->
bboxlo
[
1
];
const
flt_t
bboxlo2
=
this
->
bboxlo
[
2
];
const
flt_t
bboxhi0
=
this
->
bboxhi
[
0
];
const
flt_t
bboxhi1
=
this
->
bboxhi
[
1
];
const
flt_t
bboxhi2
=
this
->
bboxhi
[
2
];
const
flt_t
bininvx
=
this
->
bininvx
;
const
flt_t
bininvy
=
this
->
bininvy
;
const
flt_t
bininvz
=
this
->
bininvz
;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const
flt_t
dx
=
(
INTEL_BIGP
-
bboxhi0
);
const
flt_t
dy
=
(
INTEL_BIGP
-
bboxhi1
);
const
flt_t
dz
=
(
INTEL_BIGP
-
bboxhi2
);
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
<
static_cast
<
flt_t
>
(
cutneighmaxsq
))
error
->
one
(
FLERR
,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."
);
}
#ifdef _LMP_INTEL_OFFLOAD
const
int
*
restrict
const
binhead
=
this
->
binhead
;
const
int
*
restrict
const
special_flag
=
this
->
special_flag
;
const
int
nbinx
=
this
->
nbinx
;
const
int
nbiny
=
this
->
nbiny
;
const
int
nbinz
=
this
->
nbinz
;
const
int
mbinxlo
=
this
->
mbinxlo
;
const
int
mbinylo
=
this
->
mbinylo
;
const
int
mbinzlo
=
this
->
mbinzlo
;
const
int
mbinx
=
this
->
mbinx
;
const
int
mbiny
=
this
->
mbiny
;
const
int
mbinz
=
this
->
mbinz
;
const
int
*
restrict
const
bins
=
this
->
bins
;
const
int
cop
=
fix
->
coprocessor_number
();
const
int
separate_buffers
=
fix
->
separate_buffers
();
#pragma offload target(mic:cop) if(offload) \
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(special_flag:length(0) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,e_nall,offload)\
in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload_end) \
in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(numneigh)
#endif
{
#ifdef __MIC__
*
timer_compute
=
MIC_Wtime
();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow
[
LMP_LOCAL_MIN
]
=
astart
;
overflow
[
LMP_LOCAL_MAX
]
=
aend
-
1
;
overflow
[
LMP_GHOST_MIN
]
=
e_nall
;
overflow
[
LMP_GHOST_MAX
]
=
-
1
;
#endif
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(numneigh, overflow)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int
lmin
=
e_nall
,
lmax
=
-
1
,
gmin
=
e_nall
,
gmax
=
-
1
;
#endif
const
int
num
=
aend
-
astart
;
int
tid
,
ifrom
,
ito
;
IP_PRE_omp_range_id
(
ifrom
,
ito
,
tid
,
num
,
nthreads
);
ifrom
+=
astart
;
ito
+=
astart
;
int
which
;
const
int
list_size
=
(
ito
+
tid
+
1
)
*
maxnbors
;
int
ct
=
(
ifrom
+
tid
)
*
maxnbors
;
int
*
neighptr
=
firstneigh
+
ct
;
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
int
j
,
k
,
n
,
n2
,
itype
,
jtype
,
ibin
;
double
xtmp
,
ytmp
,
ztmp
,
delx
,
dely
,
delz
,
rsq
;
n
=
0
;
n2
=
maxnbors
;
xtmp
=
x
[
i
].
x
;
ytmp
=
x
[
i
].
y
;
ztmp
=
x
[
i
].
z
;
itype
=
x
[
i
].
w
;
const
int
ioffset
=
ntypes
*
itype
;
// loop over rest of atoms in i's bin, ghosts are at end of linked list
// if j is owned atom, store it, since j is beyond i in linked list
// if j is ghost, only store if j coords are "above/to the right" of i
for
(
j
=
bins
[
i
];
j
>=
0
;
j
=
bins
[
j
])
{
if
(
j
>=
nlocal
)
{
if
(
offload_noghost
&&
offload
)
continue
;
if
(
x
[
j
].
z
<
ztmp
)
continue
;
if
(
x
[
j
].
z
==
ztmp
)
{
if
(
x
[
j
].
y
<
ytmp
)
continue
;
if
(
x
[
j
].
y
==
ytmp
&&
x
[
j
].
x
<
xtmp
)
continue
;
}
}
else
if
(
offload_noghost
&&
i
<
offload_end
)
continue
;
jtype
=
x
[
j
].
w
;
#ifndef _LMP_INTEL_OFFLOAD
if
(
exclude
&&
exclusion
(
i
,
j
,
itype
,
jtype
,
mask
,
molecule
))
continue
;
#endif
delx
=
xtmp
-
x
[
j
].
x
;
dely
=
ytmp
-
x
[
j
].
y
;
delz
=
ztmp
-
x
[
j
].
z
;
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
if
(
rsq
<=
cutneighsq
[
ioffset
+
jtype
])
{
if
(
j
<
nlocal
)
{
neighptr
[
n
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
lmin
)
lmin
=
j
;
if
(
j
>
lmax
)
lmax
=
j
;
#endif
}
else
{
neighptr
[
n2
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
gmin
)
gmin
=
j
;
if
(
j
>
gmax
)
gmax
=
j
;
#endif
}
}
}
// loop over all atoms in other bins in stencil, store every pair
ibin
=
mcoord2bin
(
x
[
i
].
x
,
x
[
i
].
y
,
x
[
i
].
z
,
bboxlo0
,
bboxlo1
,
bboxlo2
,
bboxhi0
,
bboxhi1
,
bboxhi2
,
bininvx
,
bininvy
,
bininvz
,
nbinx
,
nbiny
,
nbinz
,
mbinx
,
mbiny
,
mbinz
,
mbinxlo
,
mbinylo
,
mbinzlo
);
for
(
k
=
0
;
k
<
nstencil
;
k
++
)
{
for
(
j
=
binhead
[
ibin
+
stencil
[
k
]];
j
>=
0
;
j
=
bins
[
j
])
{
if
(
offload_noghost
)
{
if
(
j
<
nlocal
)
{
if
(
i
<
offload_end
)
continue
;
}
else
if
(
offload
)
continue
;
}
jtype
=
x
[
j
].
w
;
#ifndef _LMP_INTEL_OFFLOAD
if
(
exclude
&&
exclusion
(
i
,
j
,
itype
,
jtype
,
mask
,
molecule
))
continue
;
#endif
delx
=
xtmp
-
x
[
j
].
x
;
dely
=
ytmp
-
x
[
j
].
y
;
delz
=
ztmp
-
x
[
j
].
z
;
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
if
(
rsq
<=
cutneighsq
[
ioffset
+
jtype
])
{
if
(
j
<
nlocal
)
{
neighptr
[
n
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
lmin
)
lmin
=
j
;
if
(
j
>
lmax
)
lmax
=
j
;
#endif
}
else
{
neighptr
[
n2
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
gmin
)
gmin
=
j
;
if
(
j
>
gmax
)
gmax
=
j
;
#endif
}
}
}
}
ilist
[
i
]
=
i
;
cnumneigh
[
i
]
=
ct
;
if
(
n
>
maxnbors
)
*
overflow
=
1
;
for
(
k
=
maxnbors
;
k
<
n2
;
k
++
)
neighptr
[
n
++
]
=
neighptr
[
k
];
while
(
(
n
%
pad_width
)
!=
0
)
neighptr
[
n
++
]
=
e_nall
;
numneigh
[
i
]
=
n
;
while
((
n
%
(
INTEL_DATA_ALIGN
/
sizeof
(
int
)))
!=
0
)
n
++
;
ct
+=
n
;
neighptr
+=
n
;
if
(
ct
+
n
+
maxnbors
>
list_size
)
{
*
overflow
=
1
;
ct
=
(
ifrom
+
tid
)
*
maxnbors
;
}
}
if
(
*
overflow
==
1
)
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
numneigh
[
i
]
=
0
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
separate_buffers
)
{
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if
(
lmin
<
overflow
[
LMP_LOCAL_MIN
])
overflow
[
LMP_LOCAL_MIN
]
=
lmin
;
if
(
lmax
>
overflow
[
LMP_LOCAL_MAX
])
overflow
[
LMP_LOCAL_MAX
]
=
lmax
;
if
(
gmin
<
overflow
[
LMP_GHOST_MIN
])
overflow
[
LMP_GHOST_MIN
]
=
gmin
;
if
(
gmax
>
overflow
[
LMP_GHOST_MAX
])
overflow
[
LMP_GHOST_MAX
]
=
gmax
;
}
#pragma omp barrier
}
int
ghost_offset
=
0
,
nall_offset
=
e_nall
;
if
(
separate_buffers
)
{
int
nghost
=
overflow
[
LMP_GHOST_MAX
]
+
1
-
overflow
[
LMP_GHOST_MIN
];
if
(
nghost
<
0
)
nghost
=
0
;
if
(
offload
)
{
ghost_offset
=
overflow
[
LMP_GHOST_MIN
]
-
overflow
[
LMP_LOCAL_MAX
]
-
1
;
nall_offset
=
overflow
[
LMP_LOCAL_MAX
]
+
1
+
nghost
;
}
else
{
ghost_offset
=
overflow
[
LMP_GHOST_MIN
]
-
nlocal
;
nall_offset
=
nlocal
+
nghost
;
}
}
#endif
if
(
molecular
)
{
for
(
int
i
=
ifrom
;
i
<
ito
;
++
i
)
{
int
*
restrict
jlist
=
firstneigh
+
cnumneigh
[
i
];
const
int
jnum
=
numneigh
[
i
];
for
(
int
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
const
int
j
=
jlist
[
jj
];
ofind_special
(
which
,
special
,
nspecial
,
i
,
tag
[
j
],
special_flag
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
>=
nlocal
)
{
if
(
j
==
e_nall
)
jlist
[
jj
]
=
nall_offset
;
else
if
(
which
>
0
)
jlist
[
jj
]
=
(
j
-
ghost_offset
)
^
(
which
<<
SBBITS
);
else
jlist
[
jj
]
-=
ghost_offset
;
}
else
#endif
if
(
which
>
0
)
jlist
[
jj
]
=
j
^
(
which
<<
SBBITS
);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else
if
(
separate_buffers
)
{
for
(
int
i
=
ifrom
;
i
<
ito
;
++
i
)
{
int
*
restrict
jlist
=
firstneigh
+
cnumneigh
[
i
];
const
int
jnum
=
numneigh
[
i
];
int
jj
=
0
;
for
(
jj
=
0
;
jj
<
jnum
;
jj
++
)
if
(
jlist
[
jj
]
>=
nlocal
)
break
;
while
(
jj
<
jnum
)
{
if
(
jlist
[
jj
]
==
e_nall
)
jlist
[
jj
]
=
nall_offset
;
else
jlist
[
jj
]
-=
ghost_offset
;
jj
++
;
}
}
}
#endif
}
// end omp
#ifdef __MIC__
*
timer_compute
=
MIC_Wtime
()
-
*
timer_compute
;
#endif
}
// end offload
if
(
offload
)
{
fix
->
stop_watch
(
TIME_OFFLOAD_LATENCY
);
#ifdef _LMP_INTEL_OFFLOAD
for
(
int
n
=
0
;
n
<
aend
;
n
++
)
{
ilist
[
n
]
=
n
;
numneigh
[
n
]
=
0
;
}
#endif
}
else
{
for
(
int
i
=
astart
;
i
<
aend
;
i
++
)
list
->
firstneigh
[
i
]
=
firstneigh
+
cnumneigh
[
i
];
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
separate_buffers
)
{
fix
->
start_watch
(
TIME_PACK
);
fix
->
set_neighbor_host_sizes
();
buffers
->
pack_sep_from_single
(
fix
->
host_min_local
(),
fix
->
host_used_local
(),
fix
->
host_min_ghost
(),
fix
->
host_used_ghost
());
fix
->
stop_watch
(
TIME_PACK
);
}
#endif
}
}
/* ----------------------------------------------------------------------
binned neighbor list construction with Newton's 3rd law for triclinic
each owned atom i checks its own bin and other bins in triclinic stencil
every pair stored exactly once by some processor
------------------------------------------------------------------------- */
void
Neighbor
::
half_bin_newton_tri_intel
(
NeighList
*
list
)
{
const
int
nlocal
=
(
includegroup
)
?
atom
->
nfirst
:
atom
->
nlocal
;
list
->
inum
=
nlocal
;
// Get fix for intel stuff
FixIntel
*
fix
=
static_cast
<
FixIntel
*>
(
fix_intel
);
const
int
off_end
=
fix
->
offload_end_neighbor
();
int
host_start
=
fix
->
host_start_neighbor
();
int
offload_noghost
=
0
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
fix
->
full_host_list
())
host_start
=
0
;
offload_noghost
=
fix
->
offload_noghost
();
if
(
exclude
)
error
->
all
(
FLERR
,
"Exclusion lists not yet supported for Intel offload"
);
#endif
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_MIXED
)
{
if
(
offload_noghost
)
{
hbnti
<
float
,
double
,
1
>
(
1
,
list
,
fix
->
get_mixed_buffers
(),
0
,
off_end
,
fix
);
hbnti
<
float
,
double
,
1
>
(
0
,
list
,
fix
->
get_mixed_buffers
(),
host_start
,
nlocal
,
fix
,
off_end
);
}
else
{
hbnti
<
float
,
double
,
0
>
(
1
,
list
,
fix
->
get_mixed_buffers
(),
0
,
off_end
,
fix
);
hbnti
<
float
,
double
,
0
>
(
0
,
list
,
fix
->
get_mixed_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
else
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_DOUBLE
)
{
if
(
offload_noghost
)
{
hbnti
<
double
,
double
,
1
>
(
1
,
list
,
fix
->
get_double_buffers
(),
0
,
off_end
,
fix
);
hbnti
<
double
,
double
,
1
>
(
0
,
list
,
fix
->
get_double_buffers
(),
host_start
,
nlocal
,
fix
,
off_end
);
}
else
{
hbnti
<
double
,
double
,
0
>
(
1
,
list
,
fix
->
get_double_buffers
(),
0
,
off_end
,
fix
);
hbnti
<
double
,
double
,
0
>
(
0
,
list
,
fix
->
get_double_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
else
{
if
(
offload_noghost
)
{
hbnti
<
float
,
float
,
1
>
(
1
,
list
,
fix
->
get_single_buffers
(),
0
,
off_end
,
fix
);
hbnti
<
float
,
float
,
1
>
(
0
,
list
,
fix
->
get_single_buffers
(),
host_start
,
nlocal
,
fix
,
off_end
);
}
else
{
hbnti
<
float
,
float
,
0
>
(
1
,
list
,
fix
->
get_single_buffers
(),
0
,
off_end
,
fix
);
hbnti
<
float
,
float
,
0
>
(
0
,
list
,
fix
->
get_single_buffers
(),
host_start
,
nlocal
,
fix
);
}
}
}
template
<
class
flt_t
,
class
acc_t
,
int
offload_noghost
>
void
Neighbor
::
hbnti
(
const
int
offload
,
NeighList
*
list
,
void
*
buffers_in
,
const
int
astart
,
const
int
aend
,
void
*
fix_in
,
const
int
offload_end
)
{
IntelBuffers
<
flt_t
,
acc_t
>
*
buffers
=
(
IntelBuffers
<
flt_t
,
acc_t
>
*
)
buffers_in
;
FixIntel
*
fix
=
(
FixIntel
*
)
fix_in
;
const
int
nall
=
atom
->
nlocal
+
atom
->
nghost
;
int
pad
=
1
;
if
(
offload
)
{
fix
->
start_watch
(
TIME_PACK
);
buffers
->
grow
(
nall
,
atom
->
nlocal
,
comm
->
nthreads
,
aend
);
buffers
->
grow_nbor
(
list
,
atom
->
nlocal
,
aend
);
ATOM_T
biga
;
biga
.
x
=
INTEL_BIGP
;
biga
.
y
=
INTEL_BIGP
;
biga
.
z
=
INTEL_BIGP
;
biga
.
w
=
1
;
buffers
->
get_x
()[
nall
]
=
biga
;
const
int
nthreads
=
comm
->
nthreads
;
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(buffers)
#endif
{
int
ifrom
,
ito
,
tid
;
IP_PRE_omp_range_id_align
(
ifrom
,
ito
,
tid
,
nall
,
nthreads
,
sizeof
(
ATOM_T
));
buffers
->
thr_pack
(
ifrom
,
ito
,
0
);
}
fix
->
stop_watch
(
TIME_PACK
);
fix
->
start_watch
(
TIME_HOST_NEIGHBOR
);
bin_atoms
<
flt_t
,
acc_t
>
(
buffers
->
get_x
());
if
(
INTEL_MIC_NBOR_PAD
>
1
)
pad
=
INTEL_MIC_NBOR_PAD
*
sizeof
(
float
)
/
sizeof
(
flt_t
);
}
else
{
fix
->
start_watch
(
TIME_HOST_NEIGHBOR
);
if
(
INTEL_NBOR_PAD
>
1
)
pad
=
INTEL_NBOR_PAD
*
sizeof
(
float
)
/
sizeof
(
flt_t
);
}
const
int
pad_width
=
pad
;
if
(
aend
-
astart
==
0
)
{
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
return
;
}
const
ATOM_T
*
restrict
const
x
=
buffers
->
get_x
();
int
*
restrict
const
firstneigh
=
buffers
->
firstneigh
(
list
);
int
nall_t
=
nall
;
if
(
offload_noghost
&&
offload
)
nall_t
=
atom
->
nlocal
;
const
int
e_nall
=
nall_t
;
const
int
molecular
=
atom
->
molecular
;
int
*
ns
=
NULL
,
*
s
=
NULL
;
int
tag_size
,
special_size
;
if
(
molecular
)
{
s
=
atom
->
special
[
0
];
ns
=
atom
->
nspecial
[
0
];
tag_size
=
e_nall
;
special_size
=
aend
;
}
else
{
s
=
&
buffers
->
_special_holder
;
ns
=
&
buffers
->
_nspecial_holder
;
tag_size
=
0
;
special_size
=
0
;
}
const
int
*
restrict
const
special
=
s
;
const
int
*
restrict
const
nspecial
=
ns
;
const
int
maxspecial
=
atom
->
maxspecial
;
const
int
*
restrict
const
tag
=
atom
->
tag
;
int
*
restrict
const
ilist
=
list
->
ilist
;
int
*
restrict
numneigh
=
list
->
numneigh
;
int
*
restrict
const
cnumneigh
=
buffers
->
cnumneigh
(
list
);
const
int
nstencil
=
list
->
nstencil
;
const
int
*
restrict
const
stencil
=
list
->
stencil
;
const
flt_t
*
restrict
const
cutneighsq
=
buffers
->
get_cutneighsq
()[
0
];
const
int
ntypes
=
atom
->
ntypes
+
1
;
const
int
nlocal
=
atom
->
nlocal
;
#ifndef _LMP_INTEL_OFFLOAD
int
*
const
mask
=
atom
->
mask
;
int
*
const
molecule
=
atom
->
molecule
;
#endif
int
tnum
;
int
*
overflow
;
double
*
timer_compute
;
if
(
offload
)
{
timer_compute
=
fix
->
off_watch_neighbor
();
tnum
=
buffers
->
get_off_threads
();
overflow
=
fix
->
get_off_overflow_flag
();
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
fix
->
start_watch
(
TIME_OFFLOAD_LATENCY
);
}
else
{
tnum
=
comm
->
nthreads
;
overflow
=
fix
->
get_overflow_flag
();
}
const
int
nthreads
=
tnum
;
const
int
maxnbors
=
buffers
->
get_max_nbors
();
const
flt_t
bboxlo0
=
this
->
bboxlo
[
0
];
const
flt_t
bboxlo1
=
this
->
bboxlo
[
1
];
const
flt_t
bboxlo2
=
this
->
bboxlo
[
2
];
const
flt_t
bboxhi0
=
this
->
bboxhi
[
0
];
const
flt_t
bboxhi1
=
this
->
bboxhi
[
1
];
const
flt_t
bboxhi2
=
this
->
bboxhi
[
2
];
const
flt_t
bininvx
=
this
->
bininvx
;
const
flt_t
bininvy
=
this
->
bininvy
;
const
flt_t
bininvz
=
this
->
bininvz
;
// Make sure dummy coordinates to eliminate loop remainder not within cutoff
{
const
flt_t
dx
=
(
INTEL_BIGP
-
bboxhi0
);
const
flt_t
dy
=
(
INTEL_BIGP
-
bboxhi1
);
const
flt_t
dz
=
(
INTEL_BIGP
-
bboxhi2
);
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
<
static_cast
<
flt_t
>
(
cutneighmaxsq
))
error
->
one
(
FLERR
,
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."
);
}
#ifdef _LMP_INTEL_OFFLOAD
const
int
*
restrict
const
binhead
=
this
->
binhead
;
const
int
*
restrict
const
special_flag
=
this
->
special_flag
;
const
int
nbinx
=
this
->
nbinx
;
const
int
nbiny
=
this
->
nbiny
;
const
int
nbinz
=
this
->
nbinz
;
const
int
mbinxlo
=
this
->
mbinxlo
;
const
int
mbinylo
=
this
->
mbinylo
;
const
int
mbinzlo
=
this
->
mbinzlo
;
const
int
mbinx
=
this
->
mbinx
;
const
int
mbiny
=
this
->
mbiny
;
const
int
mbinz
=
this
->
mbinz
;
const
int
*
restrict
const
bins
=
this
->
bins
;
const
int
cop
=
fix
->
coprocessor_number
();
const
int
separate_buffers
=
fix
->
separate_buffers
();
#pragma offload target(mic:cop) if(offload) \
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
in(bins:length(nall) alloc_if(0) free_if(0)) \
in(binhead:length(mbins) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
in(special_flag:length(0) alloc_if(0) free_if(0)) \
in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,offload_end) \
in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,e_nall,offload) \
in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(numneigh)
#endif
{
#ifdef __MIC__
*
timer_compute
=
MIC_Wtime
();
#endif
#ifdef _LMP_INTEL_OFFLOAD
overflow
[
LMP_LOCAL_MIN
]
=
astart
;
overflow
[
LMP_LOCAL_MAX
]
=
aend
-
1
;
overflow
[
LMP_GHOST_MIN
]
=
e_nall
;
overflow
[
LMP_GHOST_MAX
]
=
-
1
;
#endif
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(numneigh, overflow)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
int
lmin
=
e_nall
,
lmax
=
-
1
,
gmin
=
e_nall
,
gmax
=
-
1
;
#endif
const
int
num
=
aend
-
astart
;
int
tid
,
ifrom
,
ito
;
IP_PRE_omp_range_id
(
ifrom
,
ito
,
tid
,
num
,
nthreads
);
ifrom
+=
astart
;
ito
+=
astart
;
int
which
;
const
int
list_size
=
(
ito
+
tid
+
1
)
*
maxnbors
;
int
ct
=
(
ifrom
+
tid
)
*
maxnbors
;
int
*
neighptr
=
firstneigh
+
ct
;
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
{
int
j
,
k
,
n
,
n2
,
itype
,
jtype
,
ibin
;
double
xtmp
,
ytmp
,
ztmp
,
delx
,
dely
,
delz
,
rsq
;
n
=
0
;
n2
=
maxnbors
;
xtmp
=
x
[
i
].
x
;
ytmp
=
x
[
i
].
y
;
ztmp
=
x
[
i
].
z
;
itype
=
x
[
i
].
w
;
const
int
ioffset
=
ntypes
*
itype
;
// loop over all atoms in bins in stencil
// pairs for atoms j "below" i are excluded
// below = lower z or (equal z and lower y) or (equal zy and lower x)
// (equal zyx and j <= i)
// latter excludes self-self interaction but allows superposed atoms
ibin
=
mcoord2bin
(
x
[
i
].
x
,
x
[
i
].
y
,
x
[
i
].
z
,
bboxlo0
,
bboxlo1
,
bboxlo2
,
bboxhi0
,
bboxhi1
,
bboxhi2
,
bininvx
,
bininvy
,
bininvz
,
nbinx
,
nbiny
,
nbinz
,
mbinx
,
mbiny
,
mbinz
,
mbinxlo
,
mbinylo
,
mbinzlo
);
for
(
k
=
0
;
k
<
nstencil
;
k
++
)
{
for
(
j
=
binhead
[
ibin
+
stencil
[
k
]];
j
>=
0
;
j
=
bins
[
j
])
{
if
(
offload_noghost
)
{
if
(
j
<
nlocal
)
{
if
(
i
<
offload_end
)
continue
;
}
else
if
(
offload
)
continue
;
}
if
(
x
[
j
].
z
<
ztmp
)
continue
;
if
(
x
[
j
].
z
==
ztmp
)
{
if
(
x
[
j
].
y
<
ytmp
)
continue
;
if
(
x
[
j
].
y
==
ytmp
)
{
if
(
x
[
j
].
x
<
xtmp
)
continue
;
if
(
x
[
j
].
x
==
xtmp
&&
j
<=
i
)
continue
;
}
}
jtype
=
x
[
j
].
w
;
#ifndef _LMP_INTEL_OFFLOAD
if
(
exclude
&&
exclusion
(
i
,
j
,
itype
,
jtype
,
mask
,
molecule
))
continue
;
#endif
delx
=
xtmp
-
x
[
j
].
x
;
dely
=
ytmp
-
x
[
j
].
y
;
delz
=
ztmp
-
x
[
j
].
z
;
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
if
(
rsq
<=
cutneighsq
[
ioffset
+
jtype
])
{
if
(
j
<
nlocal
)
{
neighptr
[
n
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
lmin
)
lmin
=
j
;
if
(
j
>
lmax
)
lmax
=
j
;
#endif
}
else
{
neighptr
[
n2
++
]
=
j
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
<
gmin
)
gmin
=
j
;
if
(
j
>
gmax
)
gmax
=
j
;
#endif
}
}
}
}
ilist
[
i
]
=
i
;
cnumneigh
[
i
]
=
ct
;
if
(
n
>
maxnbors
)
*
overflow
=
1
;
for
(
k
=
maxnbors
;
k
<
n2
;
k
++
)
neighptr
[
n
++
]
=
neighptr
[
k
];
while
(
(
n
%
pad_width
)
!=
0
)
neighptr
[
n
++
]
=
e_nall
;
numneigh
[
i
]
=
n
;
while
((
n
%
(
INTEL_DATA_ALIGN
/
sizeof
(
int
)))
!=
0
)
n
++
;
ct
+=
n
;
neighptr
+=
n
;
if
(
ct
+
n
+
maxnbors
>
list_size
)
{
*
overflow
=
1
;
ct
=
(
ifrom
+
tid
)
*
maxnbors
;
}
}
if
(
*
overflow
==
1
)
for
(
int
i
=
ifrom
;
i
<
ito
;
i
++
)
numneigh
[
i
]
=
0
;
#ifdef _LMP_INTEL_OFFLOAD
if
(
separate_buffers
)
{
#if defined(_OPENMP)
#pragma omp critical
#endif
{
if
(
lmin
<
overflow
[
LMP_LOCAL_MIN
])
overflow
[
LMP_LOCAL_MIN
]
=
lmin
;
if
(
lmax
>
overflow
[
LMP_LOCAL_MAX
])
overflow
[
LMP_LOCAL_MAX
]
=
lmax
;
if
(
gmin
<
overflow
[
LMP_GHOST_MIN
])
overflow
[
LMP_GHOST_MIN
]
=
gmin
;
if
(
gmax
>
overflow
[
LMP_GHOST_MAX
])
overflow
[
LMP_GHOST_MAX
]
=
gmax
;
}
#pragma omp barrier
}
int
ghost_offset
=
0
,
nall_offset
=
e_nall
;
if
(
separate_buffers
)
{
int
nghost
=
overflow
[
LMP_GHOST_MAX
]
+
1
-
overflow
[
LMP_GHOST_MIN
];
if
(
nghost
<
0
)
nghost
=
0
;
if
(
offload
)
{
ghost_offset
=
overflow
[
LMP_GHOST_MIN
]
-
overflow
[
LMP_LOCAL_MAX
]
-
1
;
nall_offset
=
overflow
[
LMP_LOCAL_MAX
]
+
1
+
nghost
;
}
else
{
ghost_offset
=
overflow
[
LMP_GHOST_MIN
]
-
nlocal
;
nall_offset
=
nlocal
+
nghost
;
}
}
#endif
if
(
molecular
)
{
for
(
int
i
=
ifrom
;
i
<
ito
;
++
i
)
{
int
*
restrict
jlist
=
firstneigh
+
cnumneigh
[
i
];
const
int
jnum
=
numneigh
[
i
];
for
(
int
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
const
int
j
=
jlist
[
jj
];
ofind_special
(
which
,
special
,
nspecial
,
i
,
tag
[
j
],
special_flag
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
j
>=
nlocal
)
{
if
(
j
==
e_nall
)
jlist
[
jj
]
=
nall_offset
;
else
if
(
which
>
0
)
jlist
[
jj
]
=
(
j
-
ghost_offset
)
^
(
which
<<
SBBITS
);
else
jlist
[
jj
]
-=
ghost_offset
;
}
else
#endif
if
(
which
>
0
)
jlist
[
jj
]
=
j
^
(
which
<<
SBBITS
);
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
else
if
(
separate_buffers
)
{
for
(
int
i
=
ifrom
;
i
<
ito
;
++
i
)
{
int
*
restrict
jlist
=
firstneigh
+
cnumneigh
[
i
];
const
int
jnum
=
numneigh
[
i
];
int
jj
=
0
;
for
(
jj
=
0
;
jj
<
jnum
;
jj
++
)
if
(
jlist
[
jj
]
>=
nlocal
)
break
;
while
(
jj
<
jnum
)
{
if
(
jlist
[
jj
]
==
e_nall
)
jlist
[
jj
]
=
nall_offset
;
else
jlist
[
jj
]
-=
ghost_offset
;
jj
++
;
}
}
}
#endif
}
// end omp
#ifdef __MIC__
*
timer_compute
=
MIC_Wtime
()
-
*
timer_compute
;
#endif
}
// end offload
if
(
offload
)
{
fix
->
stop_watch
(
TIME_OFFLOAD_LATENCY
);
#ifdef _LMP_INTEL_OFFLOAD
for
(
int
n
=
0
;
n
<
aend
;
n
++
)
{
ilist
[
n
]
=
n
;
numneigh
[
n
]
=
0
;
}
#endif
}
else
{
for
(
int
i
=
astart
;
i
<
aend
;
i
++
)
list
->
firstneigh
[
i
]
=
firstneigh
+
cnumneigh
[
i
];
fix
->
stop_watch
(
TIME_HOST_NEIGHBOR
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
separate_buffers
)
{
fix
->
start_watch
(
TIME_PACK
);
fix
->
set_neighbor_host_sizes
();
buffers
->
pack_sep_from_single
(
fix
->
host_min_local
(),
fix
->
host_used_local
(),
fix
->
host_min_ghost
(),
fix
->
host_used_ghost
());
fix
->
stop_watch
(
TIME_PACK
);
}
#endif
}
}
Event Timeline
Log In to Comment