pair_tersoff_intel.cpp (rLAMMPS lammps)
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "pair_tersoff_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
// Currently Intel compiler is required for this pair style.
// For convenience, base class routines are called if not using Intel compiler.
#ifndef __INTEL_COMPILER
using namespace LAMMPS_NS;

PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
}

void PairTersoffIntel::compute(int eflag, int vflag)
{
  PairTersoff::compute(eflag, vflag);
}

void PairTersoffIntel::init_style()
{
  if (comm->me == 0) {
    error->warning(FLERR,
                   "Tersoff/intel currently requires intel compiler. "
                   "Using MANYBODY version.");
  }
  PairTersoff::init_style();
}
#else
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#include "intel_intrinsics.h"
#include "math_const.h"
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include "group.h"
#include "kspace.h"
#include "modify.h"
#include "suffix.h"
using namespace LAMMPS_NS;
using namespace MathConst;
/* ---------------------------------------------------------------------- */
PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
  suffix_flag |= Suffix::INTEL;
  respa_enable = 0;
}
/* ---------------------------------------------------------------------- */
// Dispatch the requested precision
void PairTersoffIntel::compute(int eflag, int vflag)
{
  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                          force_const_single);
  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                           force_const_double);
  } else {
    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                         force_const_single);
  }
  fix->balance_stamp();
  vflag_fdotr = 0;
}
// Dispatch the extent of computation:
// do we need to calculate energy/virial
template <class flt_t, class acc_t>
void PairTersoffIntel::compute(int eflag, int vflag,
                               IntelBuffers<flt_t,acc_t> *buffers,
                               const ForceConst<flt_t> &fc)
{
  if (eflag || vflag) {
    ev_setup(eflag, vflag);
  } else evflag = vflag_fdotr = 0;

  const int inum = list->inum;
  const int nthreads = comm->nthreads;
  const int host_start = fix->host_start_pair();
  const int offload_end = fix->offload_end_pair();
  const int ago = neighbor->ago;

  if (ago != 0 && fix->separate_buffers() == 0) {
    fix->start_watch(TIME_PACK);
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
    {
      int ifrom, ito, tid;
      IP_PRE_omp_range_id_align(ifrom, ito, tid,
                                atom->nlocal + atom->nghost,
                                nthreads, sizeof(ATOM_T));
      buffers->thr_pack(ifrom, ito, ago);
    }
    fix->stop_watch(TIME_PACK);
  }

  if (evflag || vflag_fdotr) {
    int ovflag = 0;
    if (vflag_fdotr) ovflag = 2;
    else if (vflag) ovflag = 1;
    if (eflag) {
      eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
      eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
    } else {
      eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
      eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
    }
  } else {
    eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
    eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
  }
}
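
// Illustrative note: each eval<...> instantiation above runs twice, first
// with offload = 1 over [0, offload_end) (the part that may go to the
// coprocessor) and then with offload = 0 over the host range
// [host_start, inum). When offloading is not in use, the offload range is
// typically empty and the first call returns immediately via the
// inum == 0 check at the top of eval().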
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
// The complete Tersoff computation kernel is encapsulated here
// everything is static, the class just serves as a unit of organization
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> {
  // instantiate the vector library and import the types
  typedef typename lmp_intel::vector_routines<flt_t, acc_t, mic> v;
  typedef typename v::fvec fvec;
  typedef typename v::ivec ivec;
  typedef typename v::bvec bvec;
  typedef typename v::avec avec;
  typedef typename v::iarr iarr;
  typedef typename v::farr farr;
  typedef typename v::aarr aarr;
  typedef typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t c_inner_t;
  typedef typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t c_outer_t;

  // for descriptions of these methods, please have a look at the original code
  // what's done in here is that they are inlined and vectorized
  // attractive() also provides an option to compute zeta as well
  static fvec zeta_vector(
      const c_inner_t *param, ivec xjw, bvec mask, fvec vrij, fvec rsq2,
      fvec vdijx, fvec vdijy, fvec vdijz,
      fvec dikx, fvec diky, fvec dikz);
  static void force_zeta_vector(
      const c_outer_t *param, ivec xjw, bvec mask, fvec vrijsq, fvec vzeta_ij,
      fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
      bvec vmask_repulsive);
  template<bool ZETA>
  static void attractive_vector(
      const c_inner_t *param, ivec xjw, bvec mask, fvec vprefactor,
      fvec vrijsq, fvec rsq2,
      fvec vdijx, fvec vdijy, fvec vdijz,
      fvec dikx, fvec diky, fvec dikz,
      fvec *fix, fvec *fiy, fvec *fiz,
      fvec *fjx, fvec *fjy, fvec *fjz,
      fvec *fkx, fvec *fky, fvec *fkz,
      fvec *zeta);
  // perform the actual computation
  template<bool EVFLAG, bool EFLAG>
  static void kernel(
      int iito, int iifrom, int eatom, int vflag,
      const int * _noalias const numneigh,
      const int * _noalias const numneighhalf,
      const int * _noalias const cnumneigh,
      const int * _noalias const firstneigh, int ntypes,
      typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
      const c_inner_t * _noalias const c_inner,
      const c_outer_t * _noalias const c_outer,
      typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
      acc_t *evdwl, acc_t *ov0, acc_t *ov1, acc_t *ov2,
      acc_t *ov3, acc_t *ov4, acc_t *ov5);
  // perform one step of calculation, pass in i-j pairs of atoms (is, js)
  template<int EVFLAG, int EFLAG>
  static void kernel_step(
      int eatom, int vflag,
      const int * _noalias const numneigh,
      const int * _noalias const cnumneigh,
      const int * _noalias const firstneigh, int ntypes,
      typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
      const c_inner_t * _noalias const c_inner,
      const c_outer_t * _noalias const c_outer,
      typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
      avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2,
      avec *vsv3, avec *vsv4, avec *vsv5,
      int compress_idx, iarr is, iarr js, bvec vmask_repulsive);
  // perform one step of calculation, as opposed to the previous method now
  // with fixed i and a number of js
  template<int EVFLAG, int EFLAG>
  static void kernel_step_const_i(
      int eatom, int vflag,
      const int * _noalias const numneigh,
      const int * _noalias const cnumneigh,
      const int * _noalias const firstneigh, int ntypes,
      typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
      const c_inner_t * _noalias const c_inner,
      const c_outer_t * _noalias const c_outer,
      typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
      avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2,
      avec *vsv3, avec *vsv4, avec *vsv5,
      int compress_idx, int i, iarr js, bvec vmask_repulsive);
};
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
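
// ----------------------------------------------------------------------
// Illustrative sketch (not part of the original file, kept compiled-out):
// the scalar quantity the vector kernels below accumulate per i-j pair is
// the Tersoff bond-order argument
//   zeta_ij = sum_k fC(r_ik) * g(theta_ijk) * exp(arg),
//   arg     = lam3*(r_ij - r_ik), or its cube when powermint == 3,
//   g(th)   = gamma * (1 + c^2/d^2 - c^2/(d^2 + (h - cos th)^2)),
//   fC(r)   = 1 for r < R-D, else 0.5*(1 - sin(pi/2*(r-R)/D))
// (neighbors beyond R+D never enter because of the cutoff test).
// Parameter names mirror c_inner_t; treat this as documentation for
// zeta_vector()/attractive_vector(), not as tested code.
#if 0
static double zeta_scalar(double rij, double rik, double costheta,
                          double gamma, double c2, double d2, double h,
                          double lam3, int powermint,
                          double bigr, double bigd) {
  double fc = 1.0;                              // smooth cutoff fC(r_ik)
  if (rik > bigr - bigd)
    fc = 0.5 * (1.0 - sin(MY_PI2 * (rik - bigr) / bigd));
  double hcth = h - costheta;                   // angular term g(theta)
  double g = gamma * (1.0 + c2 / d2 - c2 / (d2 + hcth * hcth));
  double arg = lam3 * (rij - rik);              // exponential distance term
  if (powermint == 3) arg = arg * arg * arg;
  return fc * g * exp(arg);
}
#endif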
/* ---------------------------------------------------------------------- */
// Dispatch to the correct kernel instantiation and perform all the work
// necessary for offloading. In this routine we enter the Phi.
// This method is nearly identical to what happens in the other /intel styles
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairTersoffIntel::eval(const int offload, const int vflag,
                            IntelBuffers<flt_t,acc_t> *buffers,
                            const ForceConst<flt_t> &fc,
                            const int astart, const int aend)
{
  const int inum = aend - astart;
  if (inum == 0) return;
  int nlocal, nall, minlocal;
  fix->get_buffern(offload, nlocal, nall, minlocal);

  const int ago = neighbor->ago;
  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);

  ATOM_T * _noalias const x = buffers->get_x(offload);
  tagint * _noalias tag = this->atom->tag;
  flt_t * _noalias const q = buffers->get_q(offload);

  const int * _noalias const numneigh = list->numneigh;
  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int * _noalias const numneighhalf = buffers->get_atombin();
  const int * _noalias const firstneigh = buffers->firstneigh(list);

  typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
  typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
  typedef typename ForceConst<flt_t>::c_cutoff_t c_cutoff_t;
  const c_outer_t * _noalias const c_outer = fc.c_outer[0];
  const c_inner_t * _noalias const c_inner = fc.c_inner[0][0];
  const c_cutoff_t * _noalias const c_inner_cutoff = fc.c_cutoff_inner[0][0];

  const int ntypes = atom->ntypes + 1;
  const int eatom = this->eflag_atom;

  // Determine how much data to transfer
  int x_size, q_size, f_stride, ev_size, separate_flag;
  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
                       buffers, offload, fix, separate_flag,
                       x_size, q_size, ev_size, f_stride);

  int tc;
  FORCE_T * _noalias f_start;
  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);

  const int nthreads = tc;
#ifdef _LMP_INTEL_OFFLOAD
  int *overflow = fix->get_off_overflow_flag();
  double *timer_compute = fix->off_watch_pair();
  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);

#pragma offload target(mic:_cop) if(offload) \
    in(c_inner, c_outer :length(0) alloc_if(0) free_if(0)) \
    in(c_inner_cutoff :length(0) alloc_if(0) free_if(0)) \
    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
    in(numneigh:length(0) alloc_if(0) free_if(0)) \
    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
    in(x:length(x_size) alloc_if(0) free_if(0)) \
    in(overflow:length(0) alloc_if(0) free_if(0)) \
    in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
    in(f_stride,nlocal,minlocal,separate_flag,offload) \
    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
    signal(f_start)
#endif
  {
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
    *timer_compute = MIC_Wtime();
#endif
#endif
    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                              f_stride, x, 0);

    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
    if (EVFLAG) {
      oevdwl = oecoul = (acc_t)0;
      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
    }

    // loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel default(none) \
    shared(f_start,f_stride,nlocal,nall,minlocal) \
    reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
    {
      int iifrom, iito, tid;
      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
      iifrom += astart;
      iito += astart;

      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

      {
        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
        sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
#define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
  firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
        // Pick the variable i algorithm under specific conditions;
        // use the scalar algorithm with very short vectors.
        int VL = lmp_intel::vector_routines<flt_t, acc_t, lmp_intel::mode>::VL;
        bool pack_i = VL >= 8 &&
            lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
        bool use_scalar = VL < 4;
        if (use_scalar) {
          IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
        } else if (pack_i) {
          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true>::kernel<EVFLAG,EFLAG>(ARGS);
        } else {
          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
        }
        if (EVFLAG) {
          if (EFLAG) oevdwl += sevdwl;
          if (vflag == 1) {
            ov0 += sv0;
            ov1 += sv1;
            ov2 += sv2;
            ov3 += sv3;
            ov4 += sv4;
            ov5 += sv5;
          }
        }
      }

#ifndef _LMP_INTEL_OFFLOAD
      if (vflag == 2)
#endif
      {
#if defined(_OPENMP)
#pragma omp barrier
#endif
        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
                               nlocal, minlocal, nthreads, f_start, f_stride,
                               x, offload);
      }
    } // end of omp parallel region

    if (EVFLAG) {
      if (EFLAG) {
        ev_global[0] = oevdwl;
        ev_global[1] = 0.0;
      }
      if (vflag) {
        ev_global[2] = ov0;
        ev_global[3] = ov1;
        ev_global[4] = ov2;
        ev_global[5] = ov3;
        ev_global[6] = ov4;
        ev_global[7] = ov5;
      }
    }
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
    *timer_compute = MIC_Wtime() - *timer_compute;
#endif
#endif
  } // end of offload region

  if (offload) fix->stop_watch(TIME_OFFLOAD_LATENCY);
  else fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
// As in any other /intel pair style
void PairTersoffIntel::init_style()
{
  if (atom->tag_enable == 0)
    error->all(FLERR,"Pair style Tersoff requires atom IDs");
  if (force->newton_pair == 0)
    error->all(FLERR,"Pair style Tersoff requires newton pair on");

  // need a full neighbor list
  int irequest = neighbor->request(this);
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;
  neighbor->requests[irequest]->intel = 1;

  int ifix = modify->find_fix("package_intel");
  if (ifix < 0)
    error->all(FLERR,
               "The 'package intel' command is required for /intel styles");
  fix = static_cast<FixIntel *>(modify->fix[ifix]);

  fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
  _cop = fix->coprocessor_number();
#endif

  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
    pack_force_const(force_const_single, fix->get_mixed_buffers());
    fix->get_mixed_buffers()->need_tag(1);
  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
    fix->get_double_buffers()->need_tag(1);
    pack_force_const(force_const_double, fix->get_double_buffers());
  } else {
    pack_force_const(force_const_single, fix->get_single_buffers());
    fix->get_single_buffers()->need_tag(1);
  }
}
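
// Illustrative usage (assumption, not from this file): an input script that
// reaches this init path typically contains something like
//
//   package intel 0 mode mixed
//   suffix intel
//   pair_style tersoff
//   pair_coeff * * Si.tersoff Si
//
// where "mode single|mixed|double" selects the PREC_MODE_* branch above and
// the "package intel" command provides the fix that find_fix("package_intel")
// looks up; "Si.tersoff" is a placeholder potential file name.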
// As in any other /intel pair style
template <class flt_t, class acc_t>
void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
                                        IntelBuffers<flt_t,acc_t> *buffers)
{
  int tp1 = atom->ntypes + 1;
  fc.set_ntypes(tp1, memory, _cop);
  buffers->set_ntypes(tp1);
  flt_t **cutneighsq = buffers->get_cutneighsq();

  // Repeat cutsq calculation because done after call to init_style
  double cut, cutneigh;
  for (int i = 1; i <= atom->ntypes; i++) {
    for (int j = i; j <= atom->ntypes; j++) {
      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
        cut = init_one(i, j);
        cutneigh = cut + neighbor->skin;
        cutsq[i][j] = cutsq[j][i] = cut * cut;
        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
      }
    }
  }
  for (int i = 1; i < tp1; i++) {
    for (int j = 1; j < tp1; j++) {
      fc.c_inner_loop[i][j][0].d2 = 1.0;
      fc.c_inner_loop[i][0][j].d2 = 1.0;
      fc.c_inner_loop[0][i][j].d2 = 1.0;
      for (int k = 1; k < tp1; k++) {
        Param *param = &params[elem2param[map[i]][map[j]][map[k]]];
        fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
        fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
        fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
        fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
        fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
        fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
        fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
        fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
        fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);

        fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
        fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
        fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
        fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
        fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
        fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
        fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
        fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
        fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
      }
      Param *param = &params[elem2param[map[i]][map[j]][map[j]]];
      fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
      fc.c_first_loop[i][j].bigr = static_cast<flt_t>(param->bigr);
      fc.c_first_loop[i][j].bigd = static_cast<flt_t>(param->bigd);
      fc.c_first_loop[i][j].lam1 = static_cast<flt_t>(param->lam1);
      fc.c_first_loop[i][j].biga = static_cast<flt_t>(param->biga);
      fc.c_second_loop[i][j].lam2 = static_cast<flt_t>(param->lam2);
      fc.c_second_loop[i][j].beta = static_cast<flt_t>(param->beta);
      fc.c_second_loop[i][j].bigb = static_cast<flt_t>(param->bigb);
      fc.c_second_loop[i][j].powern = static_cast<flt_t>(param->powern);
      fc.c_second_loop[i][j].c1 = static_cast<flt_t>(param->c1);
      fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
      fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
      fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
      fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
      fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
      fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
      fc.c_outer[i][j].lam1 = static_cast<flt_t>(param->lam1);
      fc.c_outer[i][j].biga = static_cast<flt_t>(param->biga);
      fc.c_outer[i][j].lam2 = static_cast<flt_t>(param->lam2);
      fc.c_outer[i][j].beta = static_cast<flt_t>(param->beta);
      fc.c_outer[i][j].bigb = static_cast<flt_t>(param->bigb);
      fc.c_outer[i][j].powern = static_cast<flt_t>(param->powern);
      fc.c_outer[i][j].c1 = static_cast<flt_t>(param->c1);
      fc.c_outer[i][j].c2 = static_cast<flt_t>(param->c2);
      fc.c_outer[i][j].c3 = static_cast<flt_t>(param->c3);
      fc.c_outer[i][j].c4 = static_cast<flt_t>(param->c4);
    }
    fc.c_outer[i][0].cutsq = 0.;
  }

#ifdef _LMP_INTEL_OFFLOAD
  if (_cop < 0) return;
  typename ForceConst<flt_t>::c_first_loop_t *c_first_loop = fc.c_first_loop[0];
  typename ForceConst<flt_t>::c_cutoff_t *c_cutoff_outer = fc.c_cutoff_outer[0];
  typename ForceConst<flt_t>::c_outer_t *c_outer = fc.c_outer[0];
  typename ForceConst<flt_t>::c_second_loop_t *c_second_loop = fc.c_second_loop[0];
  typename ForceConst<flt_t>::c_inner_loop_t *c_inner_loop = fc.c_inner_loop[0][0];
  typename ForceConst<flt_t>::c_cutoff_t *c_cutoff_inner = fc.c_cutoff_inner[0][0];
  typename ForceConst<flt_t>::c_inner_t *c_inner = fc.c_inner[0][0];
  flt_t *ocutneighsq = cutneighsq[0];
  size_t VL = 512 / 8 / sizeof(flt_t);
  int ntypes = tp1;
  int ntypes_pad = ntypes + VL - ntypes % VL;
  int tp1sq = tp1 * tp1;
  int tp1cb = tp1 * tp1 * tp1;
  int tp1cb_pad = tp1 * tp1 * ntypes_pad;
#pragma offload_transfer target(mic:_cop) \
    in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
    in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
    in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
#endif
}
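
// Illustrative note (assumption): VL above is the number of flt_t elements
// in a 512-bit register (16 in single, 8 in double precision), and
// c_cutoff_inner pads its innermost dimension to a multiple of VL so that
// full-width vector reads of a row stay inside the allocation.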
/* ---------------------------------------------------------------------- */
// As in any other /intel pair style
template <class flt_t>
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                     Memory *memory,
                                                     const int cop)
{
  if (ntypes != _ntypes) {
    if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
      c_first_loop_t *oc_first_loop = c_first_loop[0];
      c_second_loop_t *oc_second_loop = c_second_loop[0];
      c_inner_loop_t *oc_inner_loop = c_inner_loop[0][0];
      c_cutoff_t *oc_cutoff_inner = c_cutoff_inner[0][0];
      c_cutoff_t *oc_cutoff_outer = c_cutoff_outer[0];
      c_inner_t *oc_inner = c_inner[0][0];
      c_outer_t *oc_outer = c_outer[0];
      if (c_first_loop != NULL && c_second_loop != NULL &&
          c_inner_loop != NULL && _cop >= 0) {
#pragma offload_transfer target(mic:cop) \
    nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
    nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
    nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
      }
#endif
      _memory->destroy(c_first_loop);
      _memory->destroy(c_second_loop);
      _memory->destroy(c_inner_loop);
      _memory->destroy(c_cutoff_outer);
      _memory->destroy(c_cutoff_inner);
      _memory->destroy(c_inner);
      _memory->destroy(c_outer);
    }
    if (ntypes > 0) {
      _cop = cop;
      size_t VL = 512 / 8 / sizeof(flt_t);
      int ntypes_pad = ntypes + VL - ntypes % VL;
      memory->create(c_first_loop, ntypes, ntypes, "fc.c_first_loop");
      memory->create(c_second_loop, ntypes, ntypes, "fc.c_second_loop");
      memory->create(c_cutoff_outer, ntypes, ntypes, "fc.c_cutoff_outer");
      memory->create(c_inner_loop, ntypes, ntypes, ntypes, "fc.c_inner_loop");
      memory->create(c_cutoff_inner, ntypes, ntypes, ntypes_pad,
                     "fc.c_cutoff_inner");
      memory->create(c_inner, ntypes, ntypes, ntypes, "fc.c_inner");
      memory->create(c_outer, ntypes, ntypes, "fc.c_outer");
#ifdef _LMP_INTEL_OFFLOAD
      c_first_loop_t *oc_first_loop = c_first_loop[0];
      c_second_loop_t *oc_second_loop = c_second_loop[0];
      c_cutoff_t *oc_cutoff_outer = c_cutoff_outer[0];
      c_inner_loop_t *oc_inner_loop = c_inner_loop[0][0];
      c_cutoff_t *oc_cutoff_inner = c_cutoff_inner[0][0];
      c_inner_t *oc_inner = c_inner[0][0];
      c_outer_t *oc_outer = c_outer[0];
      int tp1sq = ntypes * ntypes;
      int tp1cb = ntypes * ntypes * ntypes;
      int tp1cb_pad = ntypes * ntypes * ntypes_pad;
      if (oc_first_loop != NULL && oc_second_loop != NULL &&
          oc_inner_loop != NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \
    nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
    nocopy(oc_second_loop: length(tp1sq) alloc_if(1) free_if(0)) \
    nocopy(oc_cutoff_outer: length(tp1sq) alloc_if(1) free_if(0)) \
    nocopy(oc_outer: length(tp1sq) alloc_if(1) free_if(0)) \
    nocopy(oc_inner_loop: length(tp1cb) alloc_if(1) free_if(0)) \
    nocopy(oc_inner: length(tp1cb) alloc_if(1) free_if(0)) \
    nocopy(oc_cutoff_inner: length(tp1cb_pad) alloc_if(1) free_if(0))
      }
#endif
    }
  }
  _ntypes = ntypes;
  _memory = memory;
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
// The factor up to which we do caching
static const int N_CACHE = 8;
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
    int eatom, int vflag,
    const int * _noalias const numneigh,
    const int * _noalias const cnumneigh,
    const int * _noalias const firstneigh, int ntypes,
    typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
    typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
    avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2,
    avec *vsv3, avec *vsv4, avec *vsv5,
    int compress_idx, iarr is, iarr js, bvec vmask_repulsive)
{
  ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
  ivec v_i1(1);
  fvec v_2(0.);
  fvec v_0_5(0.5);
  ivec v_i0(0);
  ivec v_i_ntypes(ntypes);
  ivec v_i_NEIGHMASK(NEIGHMASK);
  farr fx, fy, fz, fw;
  int cache_idx = 0;
  fvec vfkx_cache[N_CACHE];
  fvec vfky_cache[N_CACHE];
  fvec vfkz_cache[N_CACHE];
  ivec vks_cache[N_CACHE];
  bvec vmask_cache[N_CACHE];
  ivec vkks_final_cache;
  bvec vmask_final_cache;
  iarr ts;
  // compute all the stuff we know from i and j
  // TODO: We could extract this from the driver routine
  ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
  ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
  bvec vmask = v::mask_enable_lower(compress_idx);
  fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
  ivec vw_i = v_i0;
  v::gather_x(vis, vmask, x, &vx_i, &vy_i, &vz_i, &vw_i);
  fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
  ivec vw_j = v_i0;
  v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
  fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
  fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
  fvec vrij = sqrt(vrijsq);
  ivec vis_orig = v::int_load_vl(is);
  ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
  ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
  ivec vc_idx_ij = v::int_mullo(v_i4floats,
                                vw_j + v::int_mullo(v_i_ntypes, vw_i));
  fvec vzeta = v::zero();
  fvec vfxtmp = v::zero(), vfytmp = v::zero(), vfztmp = v::zero();
  fvec vfjxtmp = v::zero(), vfjytmp = v::zero(), vfjztmp = v::zero();
  // This piece of code facilitates the traversal of the k loop assuming
  // nothing about i. As such, it uses masking to avoid superfluous loads
  // and fast-forwards each lane until work is available.
  // This is useful because we can not make assumptions as to where in the
  // neighbor list the atoms within the cutoff might be.
  // We also implement the caching in here, i.e. collect force contributions
  // due to zeta.
  // This means that you will see four loops:
  // 1. the loop that does zeta calculation and caches the force contributions
  // 2. the loop that processes the remaining zeta calculations
  // 3. the loop that updates the force based on the cached force contributions
  // 4. the loop that computes force contributions for the remainder
  {
    ivec vkks = v_i0;
    bvec vactive_mask = vmask;
    bvec veff_old_mask(0);
    ivec vks, vw_k;
    fvec vx_k, vy_k, vz_k, vcutsq;
    while (!v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
      bvec vnew_mask = vactive_mask & ~veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = (vx_k - vx_i);
      fvec vdy_ik = (vy_k - vy_i);
      fvec vdz_ik = (vz_k - vz_i);
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) +
                    v::int_mullo(v_i_ntypes, vc_idx_ij);
      vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
      if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
        fvec vzeta_contrib;
        fvec vfix, vfiy, vfiz;
        fvec vfjx, vfjy, vfjz;
        fvec vfkx, vfky, vfkz;
        attractive_vector<true>(c_inner, vc_idx, veff_mask, fvec(1.), vrij,
                                vrsq, vdx_ij, vdy_ij, vdz_ij,
                                vdx_ik, vdy_ik, vdz_ik,
                                &vfix, &vfiy, &vfiz, &vfjx, &vfjy, &vfjz,
                                &vfkx, &vfky, &vfkz, &vzeta_contrib);
        vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
        vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
        vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
        vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
        vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
        vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
        vfkx_cache[cache_idx] = vfkx;
        vfky_cache[cache_idx] = vfky;
        vfkz_cache[cache_idx] = vfkz;
        vks_cache[cache_idx] = vks;
        vmask_cache[cache_idx] = veff_mask;
        cache_idx += 1;
        vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
        vkks = vkks + v_i1;
        veff_old_mask = bvec(0);
      } else {
        vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
        veff_old_mask = veff_mask;
      }
      vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
    }
    vkks_final_cache = vkks;
    vmask_final_cache = vactive_mask;
    while (!v::mask_testz(vactive_mask)) {
      bvec vnew_mask = vactive_mask & ~veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = (vx_k - vx_i);
      fvec vdy_ik = (vy_k - vy_i);
      fvec vdz_ik = (vz_k - vz_i);
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) +
                    v::int_mullo(v_i_ntypes, vc_idx_ij);
      vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
      if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
        fvec vzeta_contrib;
        vzeta_contrib = zeta_vector(c_inner, vc_idx, veff_mask, vrij, vrsq,
                                    vdx_ij, vdy_ij, vdz_ij,
                                    vdx_ik, vdy_ik, vdz_ik);
        vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
        vkks = vkks + v_i1;
        veff_old_mask = bvec(0);
      } else {
        vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
        veff_old_mask = veff_mask;
      }
      vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
    }
  }
  fvec vfpair, vevdwl, vprefactor, vfwtmp, vfjwtmp;
  force_zeta_vector(c_outer, vc_idx_ij, vmask, vrij, vzeta, &vfpair,
                    &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
  vfxtmp = vfxtmp * vprefactor + vdx_ij * vfpair;
  vfytmp = vfytmp * vprefactor + vdy_ij * vfpair;
  vfztmp = vfztmp * vprefactor + vdz_ij * vfpair;
  vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
  vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
  vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
  if (EVFLAG) {
    if (EFLAG) {
      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
      if (eatom) {
        v::store(fw, (v_0_5 * vevdwl));
      }
    }
    if (vflag == 1) {
      *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
      *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
      *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
      *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
      *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
      *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
    }
  }
  {
    while (cache_idx-- > 0) {
      fvec vfkx = vprefactor * vfkx_cache[cache_idx];
      fvec vfky = vprefactor * vfky_cache[cache_idx];
      fvec vfkz = vprefactor * vfkz_cache[cache_idx];
      ivec vks = vks_cache[cache_idx];
      bvec veff_mask = vmask_cache[cache_idx];
      v::store(fx, vfkx);
      v::store(fy, vfky);
      v::store(fz, vfkz);
      v::int_store(ts, vks);
      for (int t = 0; t < v::VL; t++) {
        if (v::mask_test_at(veff_mask, t)) {
          int t_ = ts[t] / (4 * sizeof(typename v::fscal));
          f[t_].x += fx[t];
          f[t_].y += fy[t];
          f[t_].z += fz[t];
        }
      }
    }
    ivec vkks = vkks_final_cache;
    bvec vactive_mask = vmask_final_cache;
    bvec veff_old_mask(0);
    ivec vks, vw_k;
    fvec vx_k, vy_k, vz_k, vcutsq;
    while (!v::mask_testz(vactive_mask)) {
      bvec vnew_mask = vactive_mask & ~veff_old_mask;
      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
          v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
      v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
      fvec vdx_ik = vx_k - vx_i;
      fvec vdy_ik = vy_k - vy_i;
      fvec vdz_ik = vz_k - vz_i;
      fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
      ivec vc_idx = v::int_mullo(v_i4floats, vw_k) +
                    v::int_mullo(v_i_ntypes, vc_idx_ij);
      vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
      bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
      bvec vsame_mask = v::int_cmpneq(vjs, vks);
      bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
      if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
        fvec vfix, vfiy, vfiz;
        fvec vfjx, vfjy, vfjz;
        fvec vfkx, vfky, vfkz;
        attractive_vector<false>(c_inner, vc_idx, veff_mask, vprefactor, vrij,
                                 vrsq, vdx_ij, vdy_ij, vdz_ij,
                                 vdx_ik, vdy_ik, vdz_ik,
                                 &vfix, &vfiy, &vfiz, &vfjx, &vfjy, &vfjz,
                                 &vfkx, &vfky, &vfkz, 0);
        vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
        vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
        vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
        vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
        vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
        vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
        v::store(fx, vfkx);
        v::store(fy, vfky);
        v::store(fz, vfkz);
        v::int_store(ts, vks);
        for (int t = 0; t < v::VL; t++) {
          if (v::mask_test_at(veff_mask, t)) {
            int t_ = ts[t] / (4 * sizeof(typename v::fscal));
            f[t_].x += fx[t];
            f[t_].y += fy[t];
            f[t_].z += fz[t];
          }
        }
        vkks = vkks + v_i1;
        veff_old_mask = bvec(0);
      } else {
        vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
        veff_old_mask = veff_mask;
      }
      vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
    } // while (vactive_mask != 0)
  } // section k
  // We can not make any assumptions regarding conflicts.
  // So we sequentialize this.
  // TODO: Once AVX-512 is around check out VPCONFLICT
  v::store(fx, vfjxtmp);
  v::store(fy, vfjytmp);
  v::store(fz, vfjztmp);
  for (int t = 0; t < compress_idx; t++) {
    int t_ = js[t];
    f[t_].x += fx[t];
    f[t_].y += fy[t];
    f[t_].z += fz[t];
    if (EVFLAG && EFLAG && eatom) {
      f[t_].w += fw[t];
    }
  }
  v::store(fx, vfxtmp);
  v::store(fy, vfytmp);
  v::store(fz, vfztmp);
  for (int t = 0; t < compress_idx; t++) {
    int t_ = is[t];
    f[t_].x += fx[t];
    f[t_].y += fy[t];
    f[t_].z += fz[t];
    if (EVFLAG && EFLAG && eatom) {
      f[t_].w += fw[t];
    }
  }
}
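
// Illustrative note: the "fast-forward" idiom above advances vkks only in
// lanes whose current k failed the cutoff/self test (int_mask_add under
// ~veff_mask), so each lane independently skips unproductive neighbors; the
// expensive vector math runs only once every active lane holds a
// contributing k.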
// Specialized kernel step for fixed i; this means we don't have to use the
// convoluted iteration scheme above, as the loop variables are uniform.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step_const_i(
    int eatom, int vflag,
    const int * _noalias const numneigh,
    const int * _noalias const cnumneigh,
    const int * _noalias const firstneigh, int ntypes,
    typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
    typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
    avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2,
    avec *vsv3, avec *vsv4, avec *vsv5,
    int compress_idx, int i, iarr js, bvec vmask_repulsive)
{
  typedef typename v::fvec fvec;
  typedef typename v::ivec ivec;
  typedef typename v::bvec bvec;
  typedef typename v::farr farr;
  typedef typename v::iarr iarr;
  typedef typename v::avec avec;
  typedef typename v::aarr aarr;

  ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
  ivec v_i1(1), v_i0(0), v_i_ntypes(ntypes), v_i_NEIGHMASK(NEIGHMASK);
  fvec v_0_5(0.5);
  int cache_idx = 0;
  fvec vfkx_cache[N_CACHE];
  fvec vfky_cache[N_CACHE];
  fvec vfkz_cache[N_CACHE];
  int k_cache[N_CACHE];
  bvec vmask_cache[N_CACHE];
  int kk_final_cache;
  aarr fx, fy, fz, fw;
  iarr ts;
  bvec vmask = v::mask_enable_lower(compress_idx);
  fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
  int w_i = x[i].w;
  ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
  fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
  ivec vw_j = v_i0;
  v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
  fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
  fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
  fvec vrij = sqrt(vrijsq);
  int cnumneigh_i = cnumneigh[i];
  int numneigh_i = numneigh[i];
  ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
  ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
  avec vzeta = v::acc_zero();
  avec vfxtmp = v::acc_zero(), vfytmp = v::acc_zero(), vfztmp = v::acc_zero();
  avec vfjxtmp = v::acc_zero(), vfjytmp = v::acc_zero(), vfjztmp = v::acc_zero();
  // Same structure as kernel_step, just simpler as the loops all iterate over
  // the same k
  int kk = 0;
  for (; kk < numneigh_i && cache_idx < N_CACHE; kk++) {
    int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
    fvec vx_k(x[k].x);
    fvec vy_k(x[k].y);
    fvec vz_k(x[k].z);
    int w_k = x[k].w;
    fvec vdx_ik = vx_k - vx_i;
    fvec vdy_ik = vy_k - vy_i;
    fvec vdz_ik = vz_k - vz_i;
    fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
    fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes,
                               &c_inner[ntypes * ntypes * w_i + w_k]);
    bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
    bvec vsame_mask = v::int_cmpneq(vjs,
        ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
    bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
    if (!v::mask_testz(veff_mask)) {
      fvec vzeta_contrib;
      fvec vfix, vfiy, vfiz;
      fvec vfjx, vfjy, vfjz;
      fvec vfkx, vfky, vfkz;
      attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],
                              vc_idx_j_ntypes, veff_mask, fvec(1.), vrij,
                              vrsq, vdx_ij, vdy_ij, vdz_ij,
                              vdx_ik, vdy_ik, vdz_ik,
                              &vfix, &vfiy, &vfiz, &vfjx, &vfjy, &vfjz,
                              &vfkx, &vfky, &vfkz, &vzeta_contrib);
      vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
      vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
      vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
      vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
      vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
      vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
      vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
      vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
      vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
      vmask_cache[cache_idx] = veff_mask;
      k_cache[cache_idx] = k;
      cache_idx += 1;
      vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
    }
  }
  kk_final_cache = kk;
  for (; kk < numneigh_i; kk++) {
    int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
    fvec vx_k(x[k].x);
    fvec vy_k(x[k].y);
    fvec vz_k(x[k].z);
    int w_k = x[k].w;
    fvec vdx_ik = vx_k - vx_i;
    fvec vdy_ik = vy_k - vy_i;
    fvec vdz_ik = vz_k - vz_i;
    fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
    fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes,
                               &c_inner[ntypes * ntypes * w_i + w_k]);
    bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
    bvec vsame_mask = v::int_cmpneq(vjs,
        ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
    bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
    if (!v::mask_testz(veff_mask)) {
      fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k],
                                       vc_idx_j_ntypes, veff_mask, vrij, vrsq,
                                       vdx_ij, vdy_ij, vdz_ij,
                                       vdx_ik, vdy_ik, vdz_ik);
      vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
    }
  }
  fvec vfpair, vevdwl, vprefactor, vfwtmp;
  force_zeta_vector(&c_outer[ntypes * w_i], vc_idx_j, vmask, vrij, vzeta,
                    &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
  avec vaprefactor(vprefactor);
  vfxtmp = vfxtmp * vaprefactor + avec(vdx_ij * vfpair);
  vfytmp = vfytmp * vaprefactor + avec(vdy_ij * vfpair);
  vfztmp = vfztmp * vaprefactor + avec(vdz_ij * vfpair);
  vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
  vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
  vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
  if (EVFLAG) {
    if (EFLAG) {
      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
      if (eatom) {
        vfwtmp = v_0_5 * vevdwl;
        v::store(fw, vfwtmp);
      }
    }
    if (vflag == 1) {
      *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
      *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
      *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
      *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
      *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
      *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
    }
  }
  while (cache_idx-- > 0) {
    fvec vfkx = vprefactor * vfkx_cache[cache_idx];
    fvec vfky = vprefactor * vfky_cache[cache_idx];
    fvec vfkz = vprefactor * vfkz_cache[cache_idx];
    int k = k_cache[cache_idx];
    bvec veff_mask = vmask_cache[cache_idx];
    f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
    f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
    f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
  }
  for (int kk = kk_final_cache; kk < numneigh_i; kk++) {
    int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
    fvec vx_k(x[k].x);
    fvec vy_k(x[k].y);
    fvec vz_k(x[k].z);
    int w_k = x[k].w;
    fvec vdx_ik = vx_k - vx_i;
    fvec vdy_ik = vy_k - vy_i;
    fvec vdz_ik = vz_k - vz_i;
    fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
    fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes,
                               &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
    bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
    bvec vsame_mask = v::int_cmpneq(vjs,
        ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
    bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
    if (!v::mask_testz(veff_mask)) {
      fvec vfix, vfiy, vfiz;
      fvec vfjx, vfjy, vfjz;
      fvec vfkx, vfky, vfkz;
      attractive_vector<false>(&c_inner[ntypes * ntypes * w_i + w_k],
                               vc_idx_j_ntypes, veff_mask, vprefactor, vrij,
                               vrsq, vdx_ij, vdy_ij, vdz_ij,
                               vdx_ik, vdy_ik, vdz_ik,
                               &vfix, &vfiy, &vfiz, &vfjx, &vfjy, &vfjz,
                               &vfkx, &vfky, &vfkz, 0);
      vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
      vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
      vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
      vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
      vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
      vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
      f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
      f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
      f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
    }
  }
  // TODO: This could be a scatter
  v::acc_store(fx, vfjxtmp);
  v::acc_store(fy, vfjytmp);
  v::acc_store(fz, vfjztmp);
  for (int t = 0; t < compress_idx; t++) {
    int t_ = js[t];
    f[t_].x += fx[t];
    f[t_].y += fy[t];
    f[t_].z += fz[t];
    if (EVFLAG && EFLAG && eatom) {
      f[t_].w += fw[t];
    }
  }
  f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
  f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
  f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
  if (EVFLAG && EFLAG && eatom) {
    f[i].w += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
  }
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool EVFLAG, bool EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel(
    int iito, int iifrom, int eatom, int vflag,
    const int * _noalias const numneigh,
    const int * _noalias const numneighhalf,
    const int * _noalias const cnumneigh,
    const int * _noalias const firstneigh, int ntypes,
    typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
    const c_inner_t * _noalias const c_inner,
    const c_outer_t * _noalias const c_outer,
    typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
    acc_t *evdwl, acc_t *ov0, acc_t *ov1, acc_t *ov2,
    acc_t *ov3, acc_t *ov4, acc_t *ov5)
{
  int compress_idx = 0;
  int ii, jj;
  iarr is, js;
  avec vsevdwl = v::acc_zero();
  avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
  avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
  ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
  ivec vj, v_NEIGHMASK(NEIGHMASK);
  bvec vmask_repulsive(0);
  iarr repulsive_flag = {0};
  // If you want to get the very most out of this, please uncomment.
  // Consider getting a coffee or doing something else.
  // Also good for heating.
  //#pragma forceinline recursive
  for (ii = iifrom; ii < iito; ii++) {
    // Right now this loop is scalar, to allow for the compiler to do
    // its prefetching magic.
    int i = ii;
    int w_i = x[i].w;
    flt_t x_i = x[i].x;
    flt_t y_i = x[i].y;
    flt_t z_i = x[i].z;
    int jlist_off_i = cnumneigh[i];
    int jnum = numneigh[ii];
    for (jj = 0; jj < jnum; jj++) {
      int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
      int w_j = x[j].w;
      flt_t dx_ij = x[j].x - x_i;
      flt_t dy_ij = x[j].y - y_i;
      flt_t dz_ij = x[j].z - z_i;
      flt_t rsq = dx_ij * dx_ij + dy_ij * dy_ij + dz_ij * dz_ij;
      flt_t cutsq = c_outer[w_i * ntypes + w_j].cutsq;
      if (rsq < cutsq) {
        is[compress_idx] = ii;
        js[compress_idx] = j;
        if (jj < numneighhalf[i]) repulsive_flag[compress_idx] = 1;
        compress_idx += 1;
      }
      if (pack_i) {
        if (compress_idx == v::VL) {
          vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag),
                                          ivec(0));
          kernel_step<EVFLAG, EFLAG>(
              eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes,
              x, c_inner, c_outer, f,
              &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5,
              compress_idx, is, js, vmask_repulsive);
          compress_idx = 0;
          v::int_clear_arr(repulsive_flag);
        }
      } else {
        if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum - 1)) {
          vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag),
                                          ivec(0));
          kernel_step_const_i<EVFLAG, EFLAG>(
              eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes,
              x, c_inner, c_outer, f,
              &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5,
              compress_idx, i, js, vmask_repulsive);
          compress_idx = 0;
          v::int_clear_arr(repulsive_flag);
        }
      }
    }
  }
  if (compress_idx > 0) {
    vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
    IntelKernelTersoff::kernel_step<EVFLAG, EFLAG>(
        eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes,
        x, c_inner, c_outer, f,
        &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5,
        compress_idx, is, js, vmask_repulsive);
  }
  if (EVFLAG) {
    if (EFLAG) {
      *evdwl += v::acc_reduce_add(vsevdwl);
    }
    if (vflag == 1) {
      *ov0 += v::acc_reduce_add(vsv0);
      *ov1 += v::acc_reduce_add(vsv1);
      *ov2 += v::acc_reduce_add(vsv2);
      *ov3 += v::acc_reduce_add(vsv3);
      *ov4 += v::acc_reduce_add(vsv4);
      *ov5 += v::acc_reduce_add(vsv5);
    }
  }
}
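
// Illustrative note: the scalar i/j loop above is a compress step. Pairs
// that pass the cutoff test are queued into is[]/js[] until v::VL of them
// are available (or, for the const-i variant, until the neighbor list of i
// is exhausted), so the vector kernels almost always run with full lanes
// even when few neighbors of a given atom lie inside the cutoff.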
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::fvec
IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
    const c_inner_t *param, ivec xjw, bvec mask, fvec vrij, fvec rsq2,
    fvec vdijx, fvec vdijy, fvec vdijz, fvec dikx, fvec diky, fvec dikz)
{
  fvec v_1_0(1.0);
  fvec v_0_5(0.5);
  fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = v::zero(),
       vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(),
       vpbigr = v::zero(), vpbigd = v::zero();
  // TODO: Specialize on number of species
  v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr,
              &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
  fvec vrik = sqrt(rsq2);
  fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) *
                   v::recip(vrij * vrik);
  fvec vhcth = vph - vcostheta;
  fvec vgijk_a = vhcth * vhcth;
  fvec vgijk = vpgamma * (v_1_0 + vpc2 * vgijk_a *
                          v::recip(vpd2 * (vpd2 + vgijk_a)));
  fvec varg1 = vplam3 * (vrij - vrik);
  fvec varg3 = varg1 * varg1 * varg1;
  bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
  fvec varg = v::blend(mask_ex, varg1, varg3);
  fvec vex_delr = v::min(fvec(1.e30), exp(varg));
  bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
  fvec vfc = v_1_0;
  // It's kind of important to check the mask.
  // Some simulations never/rarely invoke this branch.
  if (!v::mask_testz(vmask_need_sine)) {
    vfc = v::blend(vmask_need_sine, vfc,
                   v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) *
                                        v::recip(vpbigd))));
  }
  return vgijk * vex_delr * vfc;
}
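
// Illustrative note: force_zeta_vector below evaluates the bond order
// b_ij = (1 + beta^n zeta^n)^(-1/(2n)) and its derivative. The c1..c4
// thresholds select series approximations of this expression for extreme
// values of beta*zeta, mirroring the branches of ters_bij()/ters_bij_d()
// in the scalar pair_tersoff implementation; only the general branch pays
// for the full pow() calls.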
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
    const c_outer_t *param, ivec xjw, bvec mask, fvec vrij, fvec vzeta_ij,
    fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
    bvec vmask_repulsive)
{
  fvec v_0_0(0.0);
  fvec v_0_5(0.5);
  fvec v_m0_5(-0.5);
  fvec v_1_0(1.0);
  fvec v_m1_0(-1.0);
  fvec v_2_0(2.0);
  fvec vpbigr = v::zero(), vpbigd = v::zero(), vplam1 = v::zero(),
       vpbiga = v::zero(), vplam2 = v::zero(), vpbeta = v::zero(),
       vpbigb = v::zero(), vppowern = v::zero();
  v::gather_8(xjw, mask, &param[0].bigr, &vpbigr, &vpbigd, &vplam1, &vpbiga,
              &vplam2, &vpbeta, &vpbigb, &vppowern);
  fvec vfccos;

  // This is pretty much a literal translation.
  bvec vmask_need_sine = v::cmpnle(vrij, vpbigr - vpbigd) & mask;
  fvec vfc = v_1_0;
  fvec vfc_d = v_0_0;
  if (!v::mask_testz(vmask_need_sine)) {
    fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
    vfc = v::blend(vmask_need_sine, vfc,
                   v_0_5 * (v_1_0 - v::sincos(&vfccos, vtmp * (vrij - vpbigr))));
    vfc_d = v::blend(vmask_need_sine, vfc_d, v_m0_5 * vtmp * vfccos);
  }
  fvec vpminus_lam2 = -vplam2;
  fvec vpminus_bigb = -vpbigb;
  fvec vexp = exp(vpminus_lam2 * vrij);
  fvec vfa = vpminus_bigb * vexp * vfc;
  fvec vfa_d = vpminus_lam2 * vfa + vpminus_bigb * vexp * vfc_d;

  fvec vpc1 = v::zero(), vpc2 = v::zero(), vpc3 = v::zero(), vpc4 = v::zero();
  v::gather_4(xjw, mask, &param[0].c1, &vpc1, &vpc2, &vpc3, &vpc4);
  fvec vpminus_powern = -vppowern;
  fvec vbij(0.), vbij_d(0.);
  fvec vtmp = vpbeta * vzeta_ij;
  bvec vmc1 = v::cmple(vpc1, vtmp) & mask;
  if (!v::mask_testz(vmc1)) {
    vbij = v::invsqrt(vtmp);
    vbij_d = vpbeta * v_m0_5 * vbij * v::recip(vtmp);
  }
  bvec vmc2 = v::cmple(vpc2, vtmp) & ~vmc1 & mask;
  if (!v::mask_testz(vmc2)) {
    fvec vpowminus_powern = pow(vtmp, vpminus_powern);
    fvec vinvsqrt = v::invsqrt(vtmp);
    fvec vrcp2powern = v::recip(v_2_0 * vppowern);
    fvec va = (v_1_0 - vpowminus_powern * vrcp2powern) * vinvsqrt;
    fvec va_d = vpbeta * v_m0_5 * vinvsqrt * v::recip(vtmp) *
                (v_1_0 + v_m0_5 * vpowminus_powern * (v_1_0 + vrcp2powern));
    vbij = v::blend(vmc2, vbij, va);
    vbij_d = v::blend(vmc2, vbij_d, va_d);
  }
  bvec vmc3 = v::cmplt(vtmp, vpc4) & ~vmc2 & ~vmc1 & mask;
  if (!v::mask_testz(vmc3)) {
    vbij = v::blend(vmc3, vbij, v_1_0);
    vbij_d = v::blend(vmc3, vbij_d, v_0_0);
  }
  bvec vmc4 = v::cmple(vtmp, vpc3) & ~vmc3 & ~vmc2 & ~vmc1 & mask;
  if (!v::mask_testz(vmc4)) {
    fvec vpowm1 = pow(vtmp, vppowern - v_1_0);
    fvec vrcp2powern = v::recip(v_2_0 * vppowern);
    fvec va = v_1_0 - vtmp * vrcp2powern * vpowm1;
    fvec va_d = v_m0_5 * vpbeta * vpowm1;
    vbij = v::blend(vmc4, vbij, va);
    vbij_d = v::blend(vmc4, vbij_d, va_d);
  }
  bvec vmc5 = mask & ~vmc1 & ~vmc2 & ~vmc3 & ~vmc4;
  if (!v::mask_testz(vmc5)) {
    fvec vtmp_n = pow(vtmp, vppowern);
    fvec vpow2 = pow(v_1_0 + vtmp_n, v_m1_0 - v::recip(v_2_0 * vppowern));
    fvec va = (v_1_0 + vtmp_n) * vpow2;
    fvec va_d = v_m0_5 * vpow2 * vtmp_n * v::recip(vzeta_ij);
    vbij = v::blend(vmc5, vbij, va);
    vbij_d = v::blend(vmc5, vbij_d, va_d);
  }
  fvec vtmp_exp = exp(-vplam1 * vrij);
  fvec vrep_fforce = vpbiga * vtmp_exp * (vfc_d - vfc * vplam1);
  fvec vfz_fforce = v_0_5 * vbij * vfa_d;

  *vfpair = v::mask_add(vfz_fforce, vmask_repulsive, vfz_fforce, vrep_fforce) *
            v::recip(vrij);
  *vprefactor = v_m0_5 * vfa * vbij_d;
  if (EVDWL) {
    fvec vrep_eng = vfc * vpbiga * vtmp_exp;
    fvec vfz_eng = v_0_5 * vfa * vbij;
    *vevdwl = v::mask_add(vfz_eng, vmask_repulsive, vfz_eng, vrep_eng);
  }
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool ZETA>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::attractive_vector(
    const c_inner_t *param, ivec xjw, bvec mask, fvec vprefactor,
    fvec vrij, fvec rsq2,
    fvec vdijx, fvec vdijy, fvec vdijz,
    fvec dikx, fvec diky, fvec dikz,
    fvec *fix, fvec *fiy, fvec *fiz,
    fvec *fjx, fvec *fjy, fvec *fjz,
    fvec *fkx, fvec *fky, fvec *fkz,
    fvec *zeta)
{
  fvec v_1_0 = fvec(1.0);
  fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = fvec(1.0),
       vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(),
       vpbigr = v::zero(), vpbigd = fvec(1.0);
  v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr,
              &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
  fvec vrijinv = v::recip(vrij);
  fvec vrij_hatx = vrijinv * vdijx;
  fvec vrij_haty = vrijinv * vdijy;
  fvec vrij_hatz = vrijinv * vdijz;
  fvec rikinv = invsqrt(rsq2);
  fvec rik_hatx = rikinv * dikx;
  fvec rik_haty = rikinv * diky;
  fvec rik_hatz = rikinv * dikz;
  fvec vrik = sqrt(rsq2);
  fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) *
                   v::recip(vrij * vrik);
  fvec vhcth = vph - vcostheta;
  fvec vdenominator = v::recip(vpd2 + vhcth * vhcth);
  fvec vgijk = vpgamma * (v_1_0 + vpc2 * v::recip(vpd2) -
                          vpc2 * vdenominator);
  fvec vnumerator = fvec(-2.) * vpc2 * vhcth;
  fvec vgijk_d = vpgamma * vnumerator * vdenominator * vdenominator;
  fvec varg1 = vplam3 * (vrij - vrik);
  fvec varg3 = varg1 * varg1 * varg1;
  bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
  fvec varg = v::blend(mask_ex, varg1, varg3);
  fvec vex_delr = min(fvec(1.e30), exp(varg));
  fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
  fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
  bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
  fvec vfccos;
  fvec vfc = v_1_0;
  fvec vfc_d = v::zero();
  if (!v::mask_testz(vmask_need_sine)) {
    fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
    vfc = v::blend(vmask_need_sine, vfc,
                   fvec(0.5) * (v_1_0 - v::sincos(&vfccos,
                                                  vtmp * (vrik - vpbigr))));
    vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
  }
  fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
  fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
  fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
  if (ZETA) *zeta = vfc * vgijk * vex_delr;

  fvec vminus_costheta = -vcostheta;
  fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
  fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
  fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
  fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
  fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
  fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
  fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
  fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
  fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);

  *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr *
                       (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
  *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr *
                       (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
  *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr *
                       (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
  *fjx = vprefactor * (vzeta_d_gijk * vdcosdrjx + vzeta_d_ex_delr * vrij_hatx);
  *fjy = vprefactor * (vzeta_d_gijk * vdcosdrjy + vzeta_d_ex_delr * vrij_haty);
  *fjz = vprefactor * (vzeta_d_gijk * vdcosdrjz + vzeta_d_ex_delr * vrij_hatz);
  *fkx = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatx +
                       vzeta_d_gijk * vdcosdrkx);
  *fky = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_haty +
                       vzeta_d_gijk * vdcosdrky);
  *fkz = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatz +
                       vzeta_d_gijk * vdcosdrkz);
}
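
// Illustrative note: the nine force components returned above satisfy
// fi + fj + fk = 0 by construction (vdcosdri* is defined as
// -(vdcosdrj* + vdcosdrk*), and the fc and ex_delr contributions to i and
// k cancel pairwise), so each ijk triple conserves momentum.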
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#endif