Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F106899272
pair_airebo_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Apr 2, 06:26
Size
172 KB
Mime Type
text/x-c
Expires
Fri, Apr 4, 06:26 (2 d)
Engine
blob
Format
Raw Data
Handle
25301113
Attached To
rLAMMPS lammps
pair_airebo_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Hohnerbach (RWTH)
------------------------------------------------------------------------- */
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <stddef.h>
#include "lmptype.h"
#include "intel_preprocess.h"
#include "intel_intrinsics_airebo.h"
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include <omp.h>
#include <string.h>
#include "pair_airebo_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
#include "group.h"
#include "kspace.h"
#include "modify.h"
#include "suffix.h"
using
namespace
LAMMPS_NS
;
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
template
<
typename
flt_t
,
typename
acc_t
>
struct
LAMMPS_NS
::
PairAIREBOIntelParam
{
flt_t
cutlj
,
cutljrebosq
,
cut3rebo
;
flt_t
sigmin
,
sigcut
;
flt_t
cutljsq
[
2
][
2
];
flt_t
lj1
[
2
][
2
],
lj2
[
2
][
2
],
lj3
[
2
][
2
],
lj4
[
2
][
2
];
flt_t
smin
,
Nmin
,
Nmax
,
NCmin
,
NCmax
,
thmin
,
thmax
;
flt_t
rcmin
[
2
][
2
],
rcmax
[
2
][
2
],
rcmaxsq
[
2
][
2
],
rcmaxp
[
2
][
2
];
flt_t
Q
[
2
][
2
],
alpha
[
2
][
2
],
A
[
2
][
2
],
rho
[
2
][
2
],
BIJc
[
2
][
2
][
3
],
Beta
[
2
][
2
][
3
];
flt_t
rcLJmin
[
2
][
2
],
rcLJmax
[
2
][
2
],
rcLJmaxsq
[
2
][
2
],
bLJmin
[
2
][
2
],
bLJmax
[
2
][
2
];
flt_t
epsilon
[
2
][
2
],
sigma
[
2
][
2
],
epsilonT
[
2
][
2
];
// spline coefficients
flt_t
gCdom
[
5
],
gC1
[
4
][
6
],
gC2
[
4
][
6
],
gHdom
[
4
],
gH
[
3
][
6
];
flt_t
gDom
[
5
+
4
];
flt_t
gVal
[(
4
+
4
+
3
)
*
6
];
flt_t
pCCdom
[
2
][
2
],
pCHdom
[
2
][
2
],
pCC
[
4
][
4
][
16
],
pCH
[
4
][
4
][
16
];
flt_t
piCCdom
[
3
][
2
],
piCHdom
[
3
][
2
],
piHHdom
[
3
][
2
];
acc_t
piCC
[
4
][
4
][
9
][
64
],
piCH
[
4
][
4
][
9
][
64
],
piHH
[
4
][
4
][
9
][
64
];
flt_t
Tijdom
[
3
][
2
];
acc_t
Tijc
[
4
][
4
][
9
][
64
];
// spline knot values
flt_t
PCCf
[
5
][
5
],
PCCdfdx
[
5
][
5
],
PCCdfdy
[
5
][
5
],
PCHf
[
5
][
5
];
flt_t
PCHdfdx
[
5
][
5
],
PCHdfdy
[
5
][
5
];
flt_t
piCCf
[
5
][
5
][
11
],
piCCdfdx
[
5
][
5
][
11
];
flt_t
piCCdfdy
[
5
][
5
][
11
],
piCCdfdz
[
5
][
5
][
11
];
flt_t
piCHf
[
5
][
5
][
11
],
piCHdfdx
[
5
][
5
][
11
];
flt_t
piCHdfdy
[
5
][
5
][
11
],
piCHdfdz
[
5
][
5
][
11
];
flt_t
piHHf
[
5
][
5
][
11
],
piHHdfdx
[
5
][
5
][
11
];
flt_t
piHHdfdy
[
5
][
5
][
11
],
piHHdfdz
[
5
][
5
][
11
];
flt_t
Tf
[
5
][
5
][
10
],
Tdfdx
[
5
][
5
][
10
],
Tdfdy
[
5
][
5
][
10
],
Tdfdz
[
5
][
5
][
10
];
};
namespace
{
struct
NeighListAIREBO
{
int
*
num
;
/* num_all */
int
*
num_half
;
/* num_all */
int
*
offset
;
/* num_all */
int
*
entries
;
/* num_all * num_neighs_per_atom */
};
template
<
typename
flt_t
>
struct
AtomAIREBOT
{
flt_t
x
,
y
,
z
;
int
w
;
};
template
<
typename
acc_t
>
struct
ResultForceT
{
acc_t
x
,
y
,
z
,
w
;
};
template
<
typename
flt_t
,
typename
acc_t
>
struct
KernelArgsAIREBOT
{
int
num_local
;
int
num_all
;
int
num_neighs_per_atom
;
int
num_types
;
int
frebo_from_atom
,
frebo_to_atom
;
int
neigh_from_atom
,
neigh_to_atom
;
int
rebuild_flag
;
flt_t
skin
;
struct
NeighListAIREBO
neigh_lmp
;
struct
NeighListAIREBO
neigh_rebo
;
PairAIREBOIntelParam
<
flt_t
,
acc_t
>
params
;
struct
AtomAIREBOT
<
flt_t
>
*
x
;
/* num_all */
int
*
tag
;
/* num_all */
flt_t
*
nC
,
*
nH
;
/* num_all */
int
*
map
;
/* num_types+1 */
struct
ResultForceT
<
acc_t
>
*
result_f
;
/* num_all */
acc_t
result_eng
;
};
template
<
typename
flt_t
,
typename
acc_t
>
void
aut_lennard_jones
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
morseflag
);
template
<
typename
flt_t
,
typename
acc_t
>
void
aut_rebo_neigh
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
);
template
<
typename
flt_t
,
typename
acc_t
>
void
aut_frebo
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
torsion_flag
);
}
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
/* ---------------------------------------------------------------------- */
PairAIREBOIntel
::
PairAIREBOIntel
(
LAMMPS
*
lmp
)
:
PairAIREBO
(
lmp
)
{
suffix_flag
|=
Suffix
::
INTEL
;
REBO_cnumneigh
=
NULL
;
REBO_num_skin
=
NULL
;
REBO_list_data
=
NULL
;
fix
=
NULL
;
}
/* ---------------------------------------------------------------------- */
PairAIREBOIntel
::~
PairAIREBOIntel
()
{
memory
->
destroy
(
REBO_cnumneigh
);
memory
->
destroy
(
REBO_num_skin
);
memory
->
destroy
(
REBO_list_data
);
}
/* ---------------------------------------------------------------------- */
void
PairAIREBOIntel
::
init_style
()
{
PairAIREBO
::
init_style
();
neighbor
->
requests
[
neighbor
->
nrequest
-
1
]
->
intel
=
1
;
int
ifix
=
modify
->
find_fix
(
"package_intel"
);
if
(
ifix
<
0
)
error
->
all
(
FLERR
,
"The 'package intel' command is required for /intel styles"
);
fix
=
static_cast
<
FixIntel
*>
(
modify
->
fix
[
ifix
]);
fix
->
pair_init_check
();
#ifdef _LMP_INTEL_OFFLOAD
_cop
=
fix
->
coprocessor_number
();
#endif
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_MIXED
)
{
pack_force_const
(
fix
->
get_mixed_buffers
());
fix
->
get_mixed_buffers
()
->
need_tag
(
1
);
}
else
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_DOUBLE
)
{
pack_force_const
(
fix
->
get_double_buffers
());
fix
->
get_double_buffers
()
->
need_tag
(
1
);
}
else
{
pack_force_const
(
fix
->
get_single_buffers
());
fix
->
get_single_buffers
()
->
need_tag
(
1
);
}
#ifdef _LMP_INTEL_OFFLOAD
if
(
fix
->
offload_noghost
())
error
->
all
(
FLERR
,
"The 'ghost no' option cannot be used with airebo/intel."
);
#endif
}
/* ---------------------------------------------------------------------- */
template
<
typename
T
>
T
*
calloc_it
(
size_t
size
)
{
return
static_cast
<
T
*>
(
calloc
(
size
,
sizeof
(
T
)));
}
void
PairAIREBOIntel
::
compute
(
int
eflag
,
int
vflag
)
{
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_MIXED
)
compute
<
float
,
double
>
(
eflag
,
vflag
,
fix
->
get_mixed_buffers
());
else
if
(
fix
->
precision
()
==
FixIntel
::
PREC_MODE_DOUBLE
)
compute
<
double
,
double
>
(
eflag
,
vflag
,
fix
->
get_double_buffers
());
else
compute
<
float
,
float
>
(
eflag
,
vflag
,
fix
->
get_single_buffers
());
fix
->
balance_stamp
();
vflag_fdotr
=
0
;
}
/* ---------------------------------------------------------------------- */
template
<
class
flt_t
,
class
acc_t
>
PairAIREBOIntelParam
<
flt_t
,
acc_t
>
PairAIREBOIntel
::
get_param
()
{
PairAIREBOIntelParam
<
flt_t
,
acc_t
>
fc
;
#define A(a) \
for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \
reinterpret_cast<flt_t*>(&fc.a)[i] = \
reinterpret_cast<double*>(&this->a)[i]; \
}
#define A0(a) \
for (int i = 0; i < sizeof(fc.a)/sizeof(flt_t); i++) { \
reinterpret_cast<flt_t*>(&fc.a)[i] = \
reinterpret_cast<double*>(this->a[0])[i]; \
}
#define B(a) \
for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \
reinterpret_cast<acc_t*>(&fc.a)[i] = \
reinterpret_cast<double*>(&this->a)[i]; \
}
A
(
cutlj
)
A
(
cutljrebosq
)
A
(
cut3rebo
)
A
(
sigmin
);
A
(
sigcut
)
A0
(
cutljsq
)
A0
(
lj1
)
A0
(
lj2
)
A0
(
lj3
);
A0
(
lj4
)
A
(
smin
)
A
(
Nmin
)
A
(
Nmax
)
A
(
NCmin
)
A
(
NCmax
)
A
(
thmin
)
A
(
thmax
);
A
(
rcmin
)
A
(
rcmax
)
A
(
rcmaxsq
)
A
(
rcmaxp
)
A
(
Q
)
A
(
alpha
)
A
(
A
)
A
(
rho
)
A
(
BIJc
);
A
(
Beta
)
A
(
rcLJmin
)
A
(
rcLJmax
)
A
(
rcLJmaxsq
)
A
(
bLJmin
)
A
(
bLJmax
)
A
(
epsilon
);
A
(
sigma
)
A
(
epsilonT
)
A
(
gCdom
)
A
(
gC1
)
A
(
gC2
)
A
(
gHdom
)
A
(
gH
)
A
(
pCCdom
);
A
(
pCHdom
)
A
(
pCC
)
A
(
pCH
)
A
(
piCCdom
)
A
(
piCHdom
)
A
(
piHHdom
)
B
(
piCC
);
B
(
piCH
)
B
(
piHH
)
A
(
Tijdom
)
B
(
Tijc
)
A
(
PCCf
)
A
(
PCCdfdx
)
A
(
PCCdfdy
)
A
(
PCHf
);
A
(
PCHdfdx
)
A
(
PCHdfdy
)
A
(
piCCf
)
A
(
piCCdfdx
)
A
(
piCCdfdy
)
A
(
piCCdfdz
);
A
(
piCHf
)
A
(
piCHdfdx
)
A
(
piCHdfdy
)
A
(
piCHdfdz
)
A
(
piHHf
)
A
(
piHHdfdx
);
A
(
piHHdfdy
)
A
(
piHHdfdz
)
A
(
Tf
)
A
(
Tdfdx
)
A
(
Tdfdy
)
A
(
Tdfdz
);
#undef A
#undef A0
#undef B
for
(
int
i
=
0
;
i
<
5
;
i
++
)
fc
.
gDom
[
i
]
=
fc
.
gCdom
[
i
];
for
(
int
i
=
0
;
i
<
4
;
i
++
)
fc
.
gDom
[
5
+
i
]
=
fc
.
gHdom
[
i
];
for
(
int
i
=
0
;
i
<
4
;
i
++
)
for
(
int
j
=
0
;
j
<
6
;
j
++
)
fc
.
gVal
[
6
*
i
+
j
]
=
fc
.
gC1
[
i
][
j
];
for
(
int
i
=
0
;
i
<
4
;
i
++
)
for
(
int
j
=
0
;
j
<
6
;
j
++
)
fc
.
gVal
[
4
*
6
+
6
*
i
+
j
]
=
fc
.
gC2
[
i
][
j
];
for
(
int
i
=
0
;
i
<
3
;
i
++
)
for
(
int
j
=
0
;
j
<
6
;
j
++
)
fc
.
gVal
[
8
*
6
+
6
*
i
+
j
]
=
fc
.
gH
[
i
][
j
];
return
fc
;
}
/* ---------------------------------------------------------------------- */
template
<
class
flt_t
,
class
acc_t
>
void
PairAIREBOIntel
::
compute
(
int
eflag
,
int
vflag
,
IntelBuffers
<
flt_t
,
acc_t
>
*
buffers
)
{
if
(
eflag
||
vflag
)
ev_setup
(
eflag
,
vflag
);
else
evflag
=
vflag_fdotr
=
vflag_atom
=
0
;
pvector
[
0
]
=
pvector
[
1
]
=
pvector
[
2
]
=
0.0
;
const
int
inum
=
list
->
inum
;
const
int
nthreads
=
comm
->
nthreads
;
const
int
host_start
=
fix
->
host_start_pair
();
const
int
offload_end
=
fix
->
offload_end_pair
();
const
int
ago
=
neighbor
->
ago
;
if
(
ago
!=
0
&&
fix
->
separate_buffers
()
==
0
)
{
fix
->
start_watch
(
TIME_PACK
);
int
packthreads
;
if
(
nthreads
>
INTEL_HTHREADS
)
packthreads
=
nthreads
;
else
packthreads
=
1
;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int
ifrom
,
ito
,
tid
;
IP_PRE_omp_range_id_align
(
ifrom
,
ito
,
tid
,
atom
->
nlocal
+
atom
->
nghost
,
packthreads
,
sizeof
(
ATOM_T
));
buffers
->
thr_pack
(
ifrom
,
ito
,
ago
);
}
fix
->
stop_watch
(
TIME_PACK
);
}
if
(
atom
->
nmax
>
maxlocal
)
{
#ifdef LMP_INTEL_OFFLOAD
if
(
maxlocal
>
0
&&
_cop
>=
0
)
{
int
*
const
REBO_numneigh
=
this
->
REBO_numneigh
;
int
*
const
REBO_num_skin
=
this
->
REBO_num_skin
;
int
*
const
REBO_cnumneigh
=
this
->
REBO_cnumneigh
;
int
*
const
REBO_list_data
=
this
->
REBO_list_data
;
double
*
const
nC
=
this
->
nC
;
double
*
const
nH
=
this
->
nH
;
#pragma offload_transfer target(mic:_cop) \
nocopy(REBO_numneigh: alloc_if(0) free_if(1)) \
nocopy(REBO_cnumneigh: alloc_if(0) free_if(1)) \
nocopy(REBO_num_skin: alloc_if(0) free_if(1)) \
nocopy(REBO_list_data: alloc_if(0) free_if(1)) \
nocopy(nH: alloc_if(0) free_if(1)) \
nocopy(nC: alloc_if(0) free_if(1))
}
#endif
maxlocal
=
atom
->
nmax
;
memory
->
destroy
(
REBO_numneigh
);
memory
->
destroy
(
REBO_cnumneigh
);
memory
->
destroy
(
REBO_list_data
);
memory
->
sfree
(
REBO_firstneigh
);
memory
->
destroy
(
nC
);
memory
->
destroy
(
nH
);
memory
->
create
(
REBO_numneigh
,
maxlocal
,
"AIREBO:numneigh"
);
memory
->
create
(
REBO_cnumneigh
,
maxlocal
,
"AIREBO:cnumneigh"
);
memory
->
create
(
REBO_num_skin
,
maxlocal
,
"AIREBO:cnumneigh"
);
int
max_nbors
=
buffers
->
get_max_nbors
();
memory
->
create
(
REBO_list_data
,
maxlocal
*
max_nbors
,
"AIREBO:list_data"
);
REBO_firstneigh
=
(
int
**
)
memory
->
smalloc
(
maxlocal
*
sizeof
(
int
*
),
"AIREBO:firstneigh"
);
memory
->
create
(
nC
,
maxlocal
,
"AIREBO:nC"
);
memory
->
create
(
nH
,
maxlocal
,
"AIREBO:nH"
);
#ifdef _LMP_INTEL_OFFLOAD
if
(
_cop
>=
0
)
{
int
*
const
REBO_numneigh
=
this
->
REBO_numneigh
;
int
*
const
REBO_num_skin
=
this
->
REBO_num_skin
;
int
*
const
REBO_cnumneigh
=
this
->
REBO_cnumneigh
;
int
*
const
REBO_list_data
=
this
->
REBO_list_data
;
double
*
const
nC
=
this
->
nC
;
double
*
const
nH
=
this
->
nH
;
const
int
mnml
=
max_nbors
*
maxlocal
;
#pragma offload_transfer target(mic:_cop) \
nocopy(REBO_numneigh: length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(REBO_cnumneigh:length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(REBO_num_skin: length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(REBO_list_data:length(mnml) alloc_if(1) free_if(0)) \
nocopy(nH: length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(nC: length(maxlocal) alloc_if(1) free_if(0))
}
#endif
}
if
(
evflag
||
vflag_fdotr
)
{
int
ovflag
=
0
;
if
(
vflag_fdotr
)
ovflag
=
2
;
else
if
(
vflag
)
ovflag
=
1
;
if
(
eflag
)
{
eval
<
1
,
1
>
(
1
,
ovflag
,
buffers
,
0
,
offload_end
);
eval
<
1
,
1
>
(
0
,
ovflag
,
buffers
,
host_start
,
inum
);
}
else
{
eval
<
1
,
0
>
(
1
,
ovflag
,
buffers
,
0
,
offload_end
);
eval
<
1
,
0
>
(
0
,
ovflag
,
buffers
,
host_start
,
inum
);
}
}
else
{
eval
<
0
,
0
>
(
1
,
0
,
buffers
,
0
,
offload_end
);
eval
<
0
,
0
>
(
0
,
0
,
buffers
,
host_start
,
inum
);
}
}
/* ---------------------------------------------------------------------- */
template
<
int
EVFLAG
,
int
EFLAG
,
class
flt_t
,
class
acc_t
>
void
PairAIREBOIntel
::
eval
(
const
int
offload
,
const
int
vflag
,
IntelBuffers
<
flt_t
,
acc_t
>
*
buffers
,
const
int
astart
,
const
int
aend
)
{
const
int
inum
=
aend
-
astart
;
if
(
inum
==
0
)
{
return
;
}
int
nlocal
,
nall
,
minlocal
;
fix
->
get_buffern
(
offload
,
nlocal
,
nall
,
minlocal
);
const
int
ago
=
neighbor
->
ago
;
IP_PRE_pack_separate_buffers
(
fix
,
buffers
,
ago
,
offload
,
nlocal
,
nall
);
ATOM_T
*
_noalias
const
x
=
buffers
->
get_x
(
offload
);
const
int
*
_noalias
const
numneighhalf
=
buffers
->
get_atombin
();
const
int
*
_noalias
const
numneigh
=
list
->
numneigh
;
const
int
*
_noalias
const
cnumneigh
=
buffers
->
cnumneigh
(
list
);
const
int
*
_noalias
const
firstneigh
=
buffers
->
firstneigh
(
list
);
int
*
const
tag
=
atom
->
tag
;
const
int
ntypes
=
atom
->
ntypes
+
1
;
const
int
eatom
=
this
->
eflag_atom
;
int
x_size
,
q_size
,
f_stride
,
ev_size
,
separate_flag
;
IP_PRE_get_transfern
(
ago
,
1
/*NEWTON_PAIR*/
,
EFLAG
,
vflag
,
buffers
,
offload
,
fix
,
separate_flag
,
x_size
,
q_size
,
ev_size
,
f_stride
);
int
tc
;
FORCE_T
*
_noalias
f_start
;
acc_t
*
_noalias
ev_global
;
IP_PRE_get_buffers
(
offload
,
buffers
,
fix
,
tc
,
f_start
,
ev_global
);
const
int
nthreads
=
tc
;
const
double
skin
=
neighbor
->
skin
;
const
int
max_nbor
=
buffers
->
get_max_nbors
();
const
PairAIREBOIntelParam
<
flt_t
,
acc_t
>
param
=
get_param
<
flt_t
,
acc_t
>
();
// offload here
#ifdef _LMP_INTEL_OFFLOAD
int
*
overflow
=
fix
->
get_off_overflow_flag
();
double
*
timer_compute
=
fix
->
off_watch_pair
();
int
*
const
REBO_numneigh
=
this
->
REBO_numneigh
;
int
*
const
REBO_num_skin
=
this
->
REBO_num_skin
;
int
*
const
REBO_cnumneigh
=
this
->
REBO_cnumneigh
;
int
*
const
REBO_list_data
=
this
->
REBO_list_data
;
double
*
const
nC
=
this
->
nC
;
double
*
const
nH
=
this
->
nH
;
const
int
torflag
=
this
->
torflag
;
const
int
ljflag
=
this
->
ljflag
;
const
int
morseflag
=
this
->
morseflag
;
int
*
const
map
=
this
->
map
;
if
(
offload
)
fix
->
start_watch
(
TIME_OFFLOAD_LATENCY
);
#pragma offload target(mic:_cop) if(offload) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
in(f_stride,nlocal,minlocal,separate_flag,offload) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
in(param,skin,max_nbor) \
in(tag: length(0) alloc_if(0) free_if(0)) \
in(torflag, ljflag, morseflag, ago) \
in(nC: length(0) alloc_if(0) free_if(0)) \
in(nH: length(0) alloc_if(0) free_if(0)) \
in(REBO_numneigh: length(0) alloc_if(0) free_if(0)) \
in(REBO_cnumneigh: length(0) alloc_if(0) free_if(0)) \
in(REBO_num_skin: length(0) alloc_if(0) free_if(0)) \
in(REBO_list_data: length(0) alloc_if(0) free_if(0)) \
in(map: length(0) alloc_if(0) free_if(0)) \
signal(f_start)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*
timer_compute
=
MIC_Wtime
();
#endif
IP_PRE_repack_for_offload
(
1
/*NEWTON_PAIR*/
,
separate_flag
,
nlocal
,
nall
,
f_stride
,
x
,
0
/*q*/
);
acc_t
oevdwl
,
oecoul
,
ov0
,
ov1
,
ov2
,
ov3
,
ov4
,
ov5
;
if
(
EVFLAG
)
{
oevdwl
=
oecoul
=
(
acc_t
)
0
;
if
(
vflag
)
ov0
=
ov1
=
ov2
=
ov3
=
ov4
=
ov5
=
(
acc_t
)
0
;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int
iifrom
,
iito
,
tid
;
IP_PRE_omp_range_id
(
iifrom
,
iito
,
tid
,
inum
,
nthreads
);
iifrom
+=
astart
;
iito
+=
astart
;
int
neigh_iifrom
,
neigh_iito
;
IP_PRE_omp_range
(
neigh_iifrom
,
neigh_iito
,
tid
,
nall
,
nthreads
);
FORCE_T
*
_noalias
const
f
=
f_start
-
minlocal
+
(
tid
*
f_stride
);
memset
(
f
+
minlocal
,
0
,
f_stride
*
sizeof
(
FORCE_T
));
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
args
;
args
.
num_local
=
nlocal
;
args
.
num_all
=
nall
;
args
.
num_neighs_per_atom
=
max_nbor
;
args
.
num_types
=
ntypes
;
args
.
frebo_from_atom
=
0
;
args
.
frebo_to_atom
=
args
.
num_local
;
args
.
neigh_from_atom
=
0
;
args
.
neigh_to_atom
=
args
.
num_all
;
args
.
rebuild_flag
=
ago
==
0
;
args
.
skin
=
skin
;
args
.
neigh_lmp
.
num
=
const_cast
<
int
*>
(
numneigh
);
args
.
neigh_lmp
.
num_half
=
const_cast
<
int
*>
(
numneighhalf
);
args
.
neigh_lmp
.
offset
=
const_cast
<
int
*>
(
cnumneigh
);
args
.
neigh_lmp
.
entries
=
const_cast
<
int
*>
(
firstneigh
);
args
.
neigh_rebo
.
num
=
REBO_numneigh
;
args
.
neigh_rebo
.
num_half
=
REBO_num_skin
;
args
.
neigh_rebo
.
offset
=
REBO_cnumneigh
;
args
.
neigh_rebo
.
entries
=
REBO_list_data
;
args
.
params
=
param
;
args
.
tag
=
tag
;
args
.
nC
=
reinterpret_cast
<
flt_t
*>
(
nC
);
args
.
nH
=
reinterpret_cast
<
flt_t
*>
(
nH
);
args
.
map
=
map
;
args
.
result_eng
=
0
;
args
.
x
=
(
AtomAIREBOT
<
flt_t
>*
)
x
;
args
.
result_f
=
(
ResultForceT
<
acc_t
>
*
)
f
;
args
.
neigh_from_atom
=
neigh_iifrom
;
args
.
neigh_to_atom
=
neigh_iito
;
args
.
frebo_from_atom
=
iifrom
;
args
.
frebo_to_atom
=
iito
;
aut_rebo_neigh
(
&
args
);
#if defined(_OPENMP)
#pragma omp barrier
#endif
aut_frebo
(
&
args
,
torflag
);
if
(
ljflag
)
aut_lennard_jones
(
&
args
,
morseflag
);
oevdwl
+=
args
.
result_eng
;
IP_PRE_fdotr_reduce_omp
(
1
,
nall
,
minlocal
,
nthreads
,
f_start
,
f_stride
,
x
,
offload
,
vflag
,
ov0
,
ov1
,
ov2
,
ov3
,
ov4
,
ov5
);
}
// end of omp parallel region
IP_PRE_fdotr_reduce
(
1
,
nall
,
nthreads
,
f_stride
,
vflag
,
ov0
,
ov1
,
ov2
,
ov3
,
ov4
,
ov5
);
if
(
EVFLAG
)
{
if
(
EFLAG
)
{
ev_global
[
0
]
=
oevdwl
;
ev_global
[
1
]
=
oecoul
;
}
if
(
vflag
)
{
ev_global
[
2
]
=
ov0
;
ev_global
[
3
]
=
ov1
;
ev_global
[
4
]
=
ov2
;
ev_global
[
5
]
=
ov3
;
ev_global
[
6
]
=
ov4
;
ev_global
[
7
]
=
ov5
;
}
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*
timer_compute
=
MIC_Wtime
()
-
*
timer_compute
;
#endif
}
// end of offload region
if
(
offload
)
fix
->
stop_watch
(
TIME_OFFLOAD_LATENCY
);
else
fix
->
stop_watch
(
TIME_HOST_PAIR
);
if
(
EVFLAG
)
fix
->
add_result_array
(
f_start
,
ev_global
,
offload
,
eatom
,
0
,
vflag
);
else
fix
->
add_result_array
(
f_start
,
0
,
offload
);
}
/* ---------------------------------------------------------------------- */
template
<
class
flt_t
,
class
acc_t
>
void
PairAIREBOIntel
::
pack_force_const
(
IntelBuffers
<
flt_t
,
acc_t
>
*
buffers
)
{
int
tp1
=
atom
->
ntypes
+
1
;
buffers
->
set_ntypes
(
tp1
,
1
);
flt_t
**
cutneighsq
=
buffers
->
get_cutneighsq
();
flt_t
**
cutneighghostsq
=
buffers
->
get_cutneighghostsq
();
// Repeat cutsq calculation because done after call to init_style
double
cut
,
cutneigh
;
for
(
int
i
=
1
;
i
<=
atom
->
ntypes
;
i
++
)
{
for
(
int
j
=
i
;
j
<=
atom
->
ntypes
;
j
++
)
{
if
(
setflag
[
i
][
j
]
!=
0
||
(
setflag
[
i
][
i
]
!=
0
&&
setflag
[
j
][
j
]
!=
0
))
{
cut
=
init_one
(
i
,
j
);
cutneigh
=
cut
+
neighbor
->
skin
;
cutsq
[
i
][
j
]
=
cutsq
[
j
][
i
]
=
cut
*
cut
;
cutneighsq
[
i
][
j
]
=
cutneighsq
[
j
][
i
]
=
cutneigh
*
cutneigh
;
cut
=
cutghost
[
i
][
j
]
+
neighbor
->
skin
;
cutneighghostsq
[
i
][
j
]
=
cutneighghostsq
[
j
][
i
]
=
cut
*
cut
;
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
if
(
_cop
<
0
)
return
;
flt_t
*
ocutneighsq
=
cutneighsq
[
0
];
size_t
VL
=
512
/
8
/
sizeof
(
flt_t
);
int
ntypes
=
tp1
;
int
tp1sq
=
tp1
*
tp1
;
// TODO the lifecycle of "map" is currently not 100% correct
// it might not be freed if this method is called more than once
int
*
map
=
this
->
map
;
#pragma offload_transfer target(mic:_cop) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
in(map: length(tp1) alloc_if(1) free_if(0))
#endif
}
/* ----------------------------------------------------------------------
Implementation
---------------------------------------------------------------------- */
namespace
{
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
namespace
overloaded
{
double
sqrt
(
double
a
)
{
return
::
sqrt
(
a
);
}
float
sqrt
(
float
a
)
{
return
::
sqrtf
(
a
);
}
double
sin
(
double
a
)
{
return
::
sin
(
a
);
}
float
sin
(
float
a
)
{
return
::
sinf
(
a
);
}
double
cos
(
double
a
)
{
return
::
cos
(
a
);
}
float
cos
(
float
a
)
{
return
::
cosf
(
a
);
}
double
exp
(
double
a
)
{
return
::
exp
(
a
);
}
float
exp
(
float
a
)
{
return
::
expf
(
a
);
}
double
pow
(
double
a
,
double
b
)
{
return
::
pow
(
a
,
b
);
}
float
pow
(
float
a
,
float
b
)
{
return
::
powf
(
a
,
b
);
}
}
/* ----------------------------------------------------------------------
Scalar AIREBO implementation, standalone, with massive code reuse
compared to original code.
---------------------------------------------------------------------- */
#define M_PI 3.14159265358979323846
/* pi */
#define CARBON 0
#define HYDROGEN 1
#define TOL 1.0e-9
template
<
typename
T
>
inline
T
fmin_nonan
(
T
a
,
T
b
)
{
return
a
<
b
?
a
:
b
;
}
template
<
typename
T
>
inline
T
fmax_nonan
(
T
a
,
T
b
)
{
return
a
>
b
?
a
:
b
;
}
template
<
typename
flt_t
>
inline
flt_t
Sp
(
flt_t
r
,
flt_t
lo
,
flt_t
hi
,
flt_t
*
del
)
{
flt_t
t
=
(
r
-
lo
)
/
(
hi
-
lo
);
if
(
t
<=
0
)
{
if
(
del
)
*
del
=
0
;
return
1
;
}
else
if
(
t
>=
1
)
{
if
(
del
)
*
del
=
0
;
return
0
;
}
else
{
t
*=
static_cast
<
flt_t
>
(
M_PI
);
if
(
del
)
*
del
=
static_cast
<
flt_t
>
(
-
0.5
*
M_PI
)
*
overloaded
::
sin
(
t
)
/
(
hi
-
lo
);
return
static_cast
<
flt_t
>
(
0.5
)
*
(
1
+
overloaded
::
cos
(
t
));
}
}
template
<
typename
flt_t
>
inline
flt_t
Sp2
(
flt_t
r
,
flt_t
lo
,
flt_t
hi
,
flt_t
*
del
)
{
flt_t
t
=
(
r
-
lo
)
/
(
hi
-
lo
);
if
(
t
<=
0
)
{
if
(
del
)
*
del
=
0
;
return
1
;
}
else
if
(
t
>=
1
)
{
if
(
del
)
*
del
=
0
;
return
0
;
}
else
{
if
(
del
)
*
del
=
6
*
(
t
*
t
-
t
)
/
(
hi
-
lo
);
return
1
-
t
*
t
*
(
3
-
2
*
t
);
}
}
template
<
typename
flt_t
>
inline
flt_t
eval_poly_lin
(
int
n
,
flt_t
*
coeffs
,
flt_t
x
,
flt_t
*
deriv
)
{
flt_t
result
=
coeffs
[
n
-
1
];
*
deriv
=
coeffs
[
n
-
1
]
*
(
n
-
1
);
for
(
int
i
=
n
-
2
;
i
>
0
;
i
--
)
{
result
=
coeffs
[
i
]
+
x
*
result
;
*
deriv
=
coeffs
[
i
]
*
i
+
x
*
(
*
deriv
);
}
result
=
coeffs
[
0
]
+
x
*
result
;
return
result
;
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
gSpline
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
flt_t
cos
,
flt_t
N
,
flt_t
*
dgdc
,
flt_t
*
dgdN
)
{
flt_t
NCmin
=
ka
->
params
.
NCmin
;
flt_t
NCmax
=
ka
->
params
.
NCmax
;
int
index
=
0
;
flt_t
*
gDom
=
NULL
;
int
nDom
=
0
;
int
offs
=
0
;
if
(
itype
==
0
)
{
nDom
=
4
;
gDom
=
&
ka
->
params
.
gCdom
[
0
];
if
(
N
>
NCmin
)
offs
=
4
*
6
;
}
else
{
nDom
=
3
;
gDom
=
&
ka
->
params
.
gHdom
[
0
];
offs
=
8
*
6
;
}
cos
=
fmax_nonan
(
gDom
[
0
],
fmin_nonan
(
gDom
[
nDom
],
cos
));
int
i
;
for
(
i
=
0
;
i
<
nDom
;
i
++
)
{
if
(
cos
>=
gDom
[
i
]
&&
cos
<=
gDom
[
i
+
1
])
{
index
=
i
;
}
}
flt_t
g
=
eval_poly_lin
(
6
,
&
ka
->
params
.
gVal
[
offs
+
index
*
6
],
cos
,
dgdc
);
*
dgdN
=
0
;
if
(
itype
==
0
&&
N
>
NCmin
&&
N
<
NCmax
)
{
flt_t
dg1
;
flt_t
g1
=
eval_poly_lin
(
6
,
&
ka
->
params
.
gVal
[
index
*
6
],
cos
,
&
dg1
);
flt_t
dS
;
flt_t
cut
=
Sp
(
N
,
NCmin
,
NCmax
,
&
dS
);
*
dgdN
=
dS
*
(
g1
-
g
);
g
=
g
+
cut
*
(
g1
-
g
);
*
dgdc
=
*
dgdc
+
cut
*
(
dg1
-
*
dgdc
);
}
return
g
;
}
template
<
typename
flt_t
>
inline
flt_t
eval_poly_bi
(
int
n
,
flt_t
*
coeffs
,
flt_t
x
,
flt_t
y
,
flt_t
*
deriv
)
{
flt_t
dy
;
flt_t
vy
=
eval_poly_lin
(
n
,
&
coeffs
[
n
*
(
n
-
1
)],
y
,
&
dy
);
flt_t
result
=
vy
;
deriv
[
0
]
=
vy
*
(
n
-
1
);
deriv
[
1
]
=
dy
;
for
(
int
i
=
n
-
2
;
i
>
0
;
i
--
)
{
vy
=
eval_poly_lin
(
n
,
&
coeffs
[
n
*
i
],
y
,
&
dy
);
result
=
vy
+
x
*
result
;
deriv
[
0
]
=
vy
*
i
+
x
*
deriv
[
0
];
deriv
[
1
]
=
dy
+
x
*
deriv
[
1
];
}
result
=
eval_poly_lin
(
n
,
&
coeffs
[
0
],
y
,
&
dy
)
+
x
*
result
;
deriv
[
1
]
=
dy
+
x
*
deriv
[
1
];
return
result
;
}
template
<
typename
flt_t
>
inline
flt_t
eval_poly_tri
(
int
n
,
flt_t
*
coeffs
,
flt_t
x
,
flt_t
y
,
flt_t
z
,
flt_t
*
deriv
)
{
flt_t
dyz
[
2
];
flt_t
vyz
=
eval_poly_bi
(
n
,
&
coeffs
[
n
*
n
*
(
n
-
1
)],
y
,
z
,
&
dyz
[
0
]);
flt_t
result
=
vyz
;
deriv
[
0
]
=
vyz
*
(
n
-
1
);
deriv
[
1
]
=
dyz
[
0
];
deriv
[
2
]
=
dyz
[
1
];
for
(
int
i
=
n
-
2
;
i
>
0
;
i
--
)
{
vyz
=
eval_poly_bi
(
n
,
&
coeffs
[
n
*
n
*
i
],
y
,
z
,
&
dyz
[
0
]);
result
=
vyz
+
x
*
result
;
deriv
[
0
]
=
vyz
*
i
+
x
*
deriv
[
0
];
deriv
[
1
]
=
dyz
[
0
]
+
x
*
deriv
[
1
];
deriv
[
2
]
=
dyz
[
1
]
+
x
*
deriv
[
2
];
}
result
=
eval_poly_bi
(
n
,
&
coeffs
[
0
],
y
,
z
,
&
dyz
[
0
])
+
x
*
result
;
deriv
[
1
]
=
dyz
[
0
]
+
x
*
deriv
[
1
];
deriv
[
2
]
=
dyz
[
1
]
+
x
*
deriv
[
2
];
return
result
;
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
PijSpline
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
flt_t
NC
,
flt_t
NH
,
flt_t
*
dN
)
{
dN
[
0
]
=
0.0
;
dN
[
1
]
=
0.0
;
if
(
itype
==
HYDROGEN
)
return
0
;
flt_t
*
pCJdom
=
jtype
==
CARBON
?
&
ka
->
params
.
pCCdom
[
0
][
0
]
:
&
ka
->
params
.
pCHdom
[
0
][
0
];
NC
=
fmax_nonan
(
pCJdom
[
0
],
fmin_nonan
(
pCJdom
[
1
],
NC
));
NH
=
fmax_nonan
(
pCJdom
[
2
],
fmin_nonan
(
pCJdom
[
3
],
NH
));
int
nC
=
floor
(
NC
);
int
nH
=
floor
(
NH
);
#define PijSelect(a, b) (jtype == CARBON ? ka->params.a : ka->params.b)
if
(
fabs
(
NC
-
nC
)
<
TOL
&&
fabs
(
NH
-
nH
)
<
TOL
)
{
dN
[
0
]
=
PijSelect
(
PCCdfdx
,
PCHdfdx
)[
nC
][
nH
];
dN
[
1
]
=
PijSelect
(
PCCdfdy
,
PCHdfdy
)[
nC
][
nH
];
return
PijSelect
(
PCCf
,
PCHf
)[
nC
][
nH
];
}
if
(
NC
==
pCJdom
[
1
])
nC
-=
1
;
if
(
NH
==
pCJdom
[
3
])
nH
-=
1
;
return
eval_poly_bi
(
4
,
&
PijSelect
(
pCC
,
pCH
)[
nC
][
nH
][
0
],
NC
,
NH
,
dN
);
#undef PijSelect
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
TijSpline
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
flt_t
Nij
,
flt_t
Nji
,
flt_t
Nijconj
,
acc_t
*
dN3
)
{
flt_t
*
Tijdom
=
&
ka
->
params
.
Tijdom
[
0
][
0
];
Nij
=
fmax_nonan
(
Tijdom
[
0
],
fmin_nonan
(
Tijdom
[
1
],
Nij
));
Nji
=
fmax_nonan
(
Tijdom
[
2
],
fmin_nonan
(
Tijdom
[
3
],
Nji
));
Nijconj
=
fmax_nonan
(
Tijdom
[
4
],
fmin_nonan
(
Tijdom
[
5
],
Nijconj
));
int
nij
=
floor
(
Nij
);
int
nji
=
floor
(
Nji
);
int
nijconj
=
floor
(
Nijconj
);
if
(
fabs
(
Nij
-
nij
)
<
TOL
&&
fabs
(
Nji
-
nji
)
<
TOL
&&
fabs
(
Nijconj
-
nijconj
)
<
TOL
)
{
dN3
[
0
]
=
ka
->
params
.
Tdfdx
[
nij
][
nji
][
nijconj
];
dN3
[
1
]
=
ka
->
params
.
Tdfdy
[
nij
][
nji
][
nijconj
];
dN3
[
2
]
=
ka
->
params
.
Tdfdz
[
nij
][
nji
][
nijconj
];
return
ka
->
params
.
Tf
[
nij
][
nji
][
nijconj
];
}
if
(
Nij
==
Tijdom
[
1
])
nij
-=
1
;
if
(
Nji
==
Tijdom
[
3
])
nji
-=
1
;
if
(
Nijconj
==
Tijdom
[
5
])
nijconj
-=
1
;
return
eval_poly_tri
<
acc_t
>
(
4
,
&
ka
->
params
.
Tijc
[
nij
][
nji
][
nijconj
][
0
],
Nij
,
Nji
,
Nijconj
,
dN3
);
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
piRCSpline
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
flt_t
Nij
,
flt_t
Nji
,
flt_t
Nijconj
,
acc_t
*
dN3
)
{
const
int
HH
=
2
;
const
int
CH
=
1
;
/* const int CC = 0; */
int
select
=
itype
+
jtype
;
#define piRCSelect(a, b, c) (select == HH ? ka->params.a : select == CH ? \
ka->params.b : ka->params.c)
flt_t
*
piIJdom
=
&
piRCSelect
(
piHHdom
,
piCHdom
,
piCCdom
)[
0
][
0
];
if
(
select
==
HH
)
{
if
(
Nij
<
piIJdom
[
0
]
||
Nij
>
piIJdom
[
1
]
||
Nji
<
piIJdom
[
2
]
||
Nji
>
piIJdom
[
3
]
||
Nijconj
<
piIJdom
[
4
]
||
Nijconj
>
piIJdom
[
5
])
{
Nij
=
0
;
Nji
=
0
;
Nijconj
=
0
;
}
}
Nij
=
fmax_nonan
(
piIJdom
[
0
],
fmin_nonan
(
piIJdom
[
1
],
Nij
));
Nji
=
fmax_nonan
(
piIJdom
[
2
],
fmin_nonan
(
piIJdom
[
3
],
Nji
));
Nijconj
=
fmax_nonan
(
piIJdom
[
4
],
fmin_nonan
(
piIJdom
[
5
],
Nijconj
));
int
nij
=
floor
(
Nij
);
int
nji
=
floor
(
Nji
);
int
nijconj
=
floor
(
Nijconj
);
if
(
fabs
(
Nij
-
nij
)
<
TOL
&&
fabs
(
Nji
-
nji
)
<
TOL
&&
fabs
(
Nijconj
-
nijconj
)
<
TOL
)
{
dN3
[
0
]
=
piRCSelect
(
piHHdfdx
,
piCHdfdx
,
piCCdfdx
)[
nij
][
nji
][
nijconj
];
dN3
[
1
]
=
piRCSelect
(
piHHdfdy
,
piCHdfdy
,
piCCdfdy
)[
nij
][
nji
][
nijconj
];
dN3
[
2
]
=
piRCSelect
(
piHHdfdz
,
piCHdfdz
,
piCCdfdz
)[
nij
][
nji
][
nijconj
];
return
piRCSelect
(
piHHf
,
piCHf
,
piCCf
)[
nij
][
nji
][
nijconj
];
}
if
(
Nij
==
piIJdom
[
1
])
nij
-=
1
;
if
(
Nji
==
piIJdom
[
3
])
nji
-=
1
;
if
(
Nijconj
==
piIJdom
[
5
])
nijconj
-=
1
;
return
eval_poly_tri
<
acc_t
>
(
4
,
&
piRCSelect
(
piHH
,
piCH
,
piCC
)[
nij
][
nji
][
nijconj
][
0
],
Nij
,
Nji
,
Nijconj
,
dN3
);
#undef piRCSelect
}
/*
* Implements the p_ij term in airebo, which occurs on 4 different occasions
* in the original lammps code.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
frebo_pij
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
,
flt_t
rijx
,
flt_t
rijy
,
flt_t
rijz
,
flt_t
rijmag
,
flt_t
wij
,
flt_t
VA
,
flt_t
*
sum_N
,
acc_t
fij
[
3
])
{
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
flt_t
*
nC
=
ka
->
nC
;
flt_t
*
nH
=
ka
->
nH
;
flt_t
x_i
=
x
[
i
].
x
;
flt_t
y_i
=
x
[
i
].
y
;
flt_t
z_i
=
x
[
i
].
z
;
int
itype
=
map
[
x
[
i
].
w
];
int
jtype
=
map
[
x
[
j
].
w
];
flt_t
invrijm
=
1
/
rijmag
;
flt_t
invrijm2
=
invrijm
*
invrijm
;
flt_t
rcminij
=
ka
->
params
.
rcmin
[
itype
][
jtype
];
flt_t
rcmaxij
=
ka
->
params
.
rcmax
[
itype
][
jtype
];
flt_t
Nmin
=
ka
->
params
.
Nmin
;
flt_t
Nmax
=
ka
->
params
.
Nmax
;
flt_t
Nij
=
nC
[
i
]
+
nH
[
i
]
-
wij
;
flt_t
NijC
=
nC
[
i
]
-
wij
*
(
1
-
jtype
);
flt_t
NijH
=
nH
[
i
]
-
wij
*
jtype
;
flt_t
sum_pij
=
0
;
flt_t
sum_dpij_dN
=
0
;
flt_t
dN2
[
2
]
=
{
0
};
flt_t
pij
=
0
;
*
sum_N
=
0
;
int
*
neighs
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
i
];
int
pass
;
for
(
pass
=
0
;
pass
<
2
;
pass
++
)
{
int
kk
;
int
knum
=
ka
->
neigh_rebo
.
num
[
i
];
for
(
kk
=
0
;
kk
<
knum
;
kk
++
)
{
int
k
=
neighs
[
kk
];
if
(
k
==
j
)
continue
;
flt_t
rikx
=
x_i
-
x
[
k
].
x
;
flt_t
riky
=
y_i
-
x
[
k
].
y
;
flt_t
rikz
=
z_i
-
x
[
k
].
z
;
int
ktype
=
map
[
x
[
k
].
w
];
flt_t
rikmag
=
overloaded
::
sqrt
(
rikx
*
rikx
+
riky
*
riky
+
rikz
*
rikz
);
flt_t
rho_k
=
ka
->
params
.
rho
[
ktype
][
1
];
flt_t
rho_j
=
ka
->
params
.
rho
[
jtype
][
1
];
flt_t
lamdajik
=
4
*
itype
*
((
rho_k
-
rikmag
)
-
(
rho_j
-
rijmag
));
flt_t
ex_lam
=
exp
(
lamdajik
);
flt_t
rcminik
=
ka
->
params
.
rcmin
[
itype
][
ktype
];
flt_t
rcmaxik
=
ka
->
params
.
rcmax
[
itype
][
ktype
];
flt_t
dwik
;
flt_t
wik
=
Sp
(
rikmag
,
rcminik
,
rcmaxik
,
&
dwik
);
flt_t
Nki
=
nC
[
k
]
+
nH
[
k
]
-
wik
;
flt_t
cosjik
=
(
rijx
*
rikx
+
rijy
*
riky
+
rijz
*
rikz
)
/
(
rijmag
*
rikmag
);
cosjik
=
fmin_nonan
<
flt_t
>
(
1
,
fmax_nonan
<
flt_t
>
(
-
1
,
cosjik
));
flt_t
dgdc
,
dgdN
;
flt_t
g
=
gSpline
(
ka
,
itype
,
cosjik
,
Nij
,
&
dgdc
,
&
dgdN
);
if
(
pass
==
0
)
{
sum_pij
+=
wik
*
g
*
ex_lam
;
sum_dpij_dN
+=
wik
*
dgdN
*
ex_lam
;
flt_t
cutN
=
Sp
<
flt_t
>
(
Nki
,
Nmin
,
Nmax
,
NULL
);
*
sum_N
+=
(
1
-
ktype
)
*
wik
*
cutN
;
}
else
{
flt_t
tmp
=
-
0.5
*
pij
*
pij
*
pij
;
flt_t
invrikm
=
1
/
rikmag
;
flt_t
rjkx
=
rikx
-
rijx
;
flt_t
rjky
=
riky
-
rijy
;
flt_t
rjkz
=
rikz
-
rijz
;
flt_t
rjkmag
=
sqrt
(
rjkx
*
rjkx
+
rjky
*
rjky
+
rjkz
*
rjkz
);
flt_t
rijrik
=
2
*
rijmag
*
rikmag
;
flt_t
rr
=
rijmag
*
rijmag
-
rikmag
*
rikmag
;
flt_t
dctdjk
=
-
2
/
rijrik
;
flt_t
dctdik
=
(
-
rr
+
rjkmag
*
rjkmag
)
/
(
rijrik
*
rikmag
*
rikmag
);
flt_t
dctdij
=
(
rr
+
rjkmag
*
rjkmag
)
/
(
rijrik
*
rijmag
*
rijmag
);
acc_t
fi
[
3
],
fj
[
3
],
fk
[
3
];
flt_t
pref
=
0.5
*
VA
*
tmp
;
flt_t
tmp20
=
pref
*
wik
*
dgdc
*
ex_lam
;
fj
[
0
]
=
fj
[
1
]
=
fj
[
2
]
=
0
;
fi
[
0
]
=
-
tmp20
*
dctdik
*
rikx
;
fi
[
1
]
=
-
tmp20
*
dctdik
*
riky
;
fi
[
2
]
=
-
tmp20
*
dctdik
*
rikz
;
fk
[
0
]
=
tmp20
*
dctdik
*
rikx
;
fk
[
1
]
=
tmp20
*
dctdik
*
riky
;
fk
[
2
]
=
tmp20
*
dctdik
*
rikz
;
fij
[
0
]
+=
-
tmp20
*
dctdij
*
rijx
;
fij
[
1
]
+=
-
tmp20
*
dctdij
*
rijy
;
fij
[
2
]
+=
-
tmp20
*
dctdij
*
rijz
;
fi
[
0
]
+=
-
tmp20
*
dctdjk
*
rjkx
;
fi
[
1
]
+=
-
tmp20
*
dctdjk
*
rjky
;
fi
[
2
]
+=
-
tmp20
*
dctdjk
*
rjkz
;
fk
[
0
]
+=
tmp20
*
dctdjk
*
rjkx
;
fk
[
1
]
+=
tmp20
*
dctdjk
*
rjky
;
fk
[
2
]
+=
tmp20
*
dctdjk
*
rjkz
;
fij
[
0
]
-=
-
tmp20
*
dctdjk
*
rjkx
;
fij
[
1
]
-=
-
tmp20
*
dctdjk
*
rjky
;
fij
[
2
]
-=
-
tmp20
*
dctdjk
*
rjkz
;
flt_t
tmp21
=
pref
*
(
wik
*
g
*
ex_lam
*
4
*
itype
);
fij
[
0
]
-=
1
*
tmp21
*
rijx
*
invrijm
;
fij
[
1
]
-=
1
*
tmp21
*
rijy
*
invrijm
;
fij
[
2
]
-=
1
*
tmp21
*
rijz
*
invrijm
;
fi
[
0
]
-=
tmp21
*
(
-
rikx
*
invrikm
);
fi
[
1
]
-=
tmp21
*
(
-
riky
*
invrikm
);
fi
[
2
]
-=
tmp21
*
(
-
rikz
*
invrikm
);
fk
[
0
]
-=
tmp21
*
(
rikx
*
invrikm
);
fk
[
1
]
-=
tmp21
*
(
riky
*
invrikm
);
fk
[
2
]
-=
tmp21
*
(
rikz
*
invrikm
);
// coordination forces
// dwik forces
flt_t
tmp22
=
pref
*
dwik
*
g
*
ex_lam
*
invrikm
;
fi
[
0
]
-=
tmp22
*
rikx
;
fi
[
1
]
-=
tmp22
*
riky
;
fi
[
2
]
-=
tmp22
*
rikz
;
fk
[
0
]
+=
tmp22
*
rikx
;
fk
[
1
]
+=
tmp22
*
riky
;
fk
[
2
]
+=
tmp22
*
rikz
;
// PIJ forces
flt_t
tmp23
=
pref
*
dN2
[
ktype
]
*
dwik
*
invrikm
;
fi
[
0
]
-=
tmp23
*
rikx
;
fi
[
1
]
-=
tmp23
*
riky
;
fi
[
2
]
-=
tmp23
*
rikz
;
fk
[
0
]
+=
tmp23
*
rikx
;
fk
[
1
]
+=
tmp23
*
riky
;
fk
[
2
]
+=
tmp23
*
rikz
;
// dgdN forces
flt_t
tmp24
=
pref
*
sum_dpij_dN
*
dwik
*
invrikm
;
fi
[
0
]
-=
tmp24
*
rikx
;
fi
[
1
]
-=
tmp24
*
riky
;
fi
[
2
]
-=
tmp24
*
rikz
;
fk
[
0
]
+=
tmp24
*
rikx
;
fk
[
1
]
+=
tmp24
*
riky
;
fk
[
2
]
+=
tmp24
*
rikz
;
result_f
[
i
].
x
+=
fi
[
0
];
result_f
[
i
].
y
+=
fi
[
1
];
result_f
[
i
].
z
+=
fi
[
2
];
result_f
[
j
].
x
+=
fj
[
0
];
result_f
[
j
].
y
+=
fj
[
1
];
result_f
[
j
].
z
+=
fj
[
2
];
result_f
[
k
].
x
+=
fk
[
0
];
result_f
[
k
].
y
+=
fk
[
1
];
result_f
[
k
].
z
+=
fk
[
2
];
}
}
if
(
pass
==
0
)
{
flt_t
PijS
=
PijSpline
(
ka
,
itype
,
jtype
,
NijC
,
NijH
,
dN2
);
pij
=
1
/
overloaded
::
sqrt
(
1
+
sum_pij
+
PijS
);
}
}
return
pij
;
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
frebo_pi_rc
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
flt_t
Nij
,
flt_t
Nji
,
flt_t
Nijconj
,
flt_t
*
dN3
)
{
acc_t
dN3tmp
[
3
]
=
{
0
};
flt_t
ret
=
piRCSpline
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3tmp
);
dN3
[
0
]
=
dN3tmp
[
0
];
dN3
[
1
]
=
dN3tmp
[
1
];
dN3
[
2
]
=
dN3tmp
[
2
];
return
ret
;
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
frebo_Tij
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
flt_t
Nij
,
flt_t
Nji
,
flt_t
Nijconj
,
flt_t
*
dN3
)
{
dN3
[
0
]
=
0
;
dN3
[
1
]
=
0
;
dN3
[
2
]
=
0
;
if
(
itype
==
HYDROGEN
||
jtype
==
HYDROGEN
)
return
0
;
acc_t
dN3tmp
[
3
]
=
{
0
};
flt_t
ret
=
TijSpline
(
ka
,
Nij
,
Nji
,
Nijconj
,
dN3tmp
);
dN3
[
0
]
=
dN3tmp
[
0
];
dN3
[
1
]
=
dN3tmp
[
1
];
dN3
[
2
]
=
dN3tmp
[
2
];
return
ret
;
}
/*
* Implements a scalar version of the sum cos^1(omega) term used in pi^dh_ij.
* Occurs in both bondorder and bondorderLJ.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
frebo_sum_omega
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
,
flt_t
r23x
,
flt_t
r23y
,
flt_t
r23z
,
flt_t
r23mag
,
flt_t
VA
,
acc_t
fij
[
3
])
{
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
acc_t
sum_omega
=
0
;
int
a2
=
i
;
int
a3
=
j
;
flt_t
r32x
=
-
r23x
;
flt_t
r32y
=
-
r23y
;
flt_t
r32z
=
-
r23z
;
int
*
map
=
ka
->
map
;
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
flt_t
thmin
=
ka
->
params
.
thmin
;
flt_t
thmax
=
ka
->
params
.
thmax
;
int
itype
=
map
[
x
[
i
].
w
];
int
jtype
=
map
[
x
[
j
].
w
];
int
*
neighs_i
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
i
];
int
*
neighs_j
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
j
];
int
num_i
=
ka
->
neigh_rebo
.
num
[
i
];
int
num_j
=
ka
->
neigh_rebo
.
num
[
j
];
int
kk
;
for
(
kk
=
0
;
kk
<
num_i
;
kk
++
)
{
int
k
=
neighs_i
[
kk
];
if
(
k
==
j
)
continue
;
int
a1
=
k
;
int
ktype
=
map
[
x
[
k
].
w
];
flt_t
r21x
=
x
[
a2
].
x
-
x
[
a1
].
x
;
flt_t
r21y
=
x
[
a2
].
y
-
x
[
a1
].
y
;
flt_t
r21z
=
x
[
a2
].
z
-
x
[
a1
].
z
;
flt_t
r21mag
=
overloaded
::
sqrt
(
r21x
*
r21x
+
r21y
*
r21y
+
r21z
*
r21z
);
flt_t
cos321
=
(
r23x
*
r21x
+
r23y
*
r21y
+
r23z
*
r21z
)
/
(
r23mag
*
r21mag
);
cos321
=
fmin_nonan
<
flt_t
>
(
1
,
fmax_nonan
<
flt_t
>
(
-
1
,
cos321
));
flt_t
sin321
=
overloaded
::
sqrt
(
1
-
cos321
*
cos321
);
if
(
sin321
==
0
)
continue
;
flt_t
sink2i
=
1
/
(
sin321
*
sin321
);
flt_t
rik2i
=
1
/
(
r21mag
*
r21mag
);
flt_t
rr
=
r23mag
*
r23mag
-
r21mag
*
r21mag
;
flt_t
r31x
=
r21x
-
r23x
;
flt_t
r31y
=
r21y
-
r23y
;
flt_t
r31z
=
r21z
-
r23z
;
flt_t
r31mag2
=
r31x
*
r31x
+
r31y
*
r31y
+
r31z
*
r31z
;
flt_t
rijrik
=
2
*
r23mag
*
r21mag
;
flt_t
r21mag2
=
r21mag
*
r21mag
;
flt_t
dctik
=
(
-
rr
+
r31mag2
)
/
(
rijrik
*
r21mag2
);
flt_t
dctij
=
(
rr
+
r31mag2
)
/
(
rijrik
*
r23mag
*
r23mag
);
flt_t
dctjk
=
-
2
/
rijrik
;
flt_t
rcmin21
=
ka
->
params
.
rcmin
[
itype
][
ktype
];
flt_t
rcmaxp21
=
ka
->
params
.
rcmaxp
[
itype
][
ktype
];
flt_t
dw21
;
flt_t
w21
=
Sp
(
r21mag
,
rcmin21
,
rcmaxp21
,
&
dw21
);
// why does this additional cutoff in the cosine exist?
// the original code by stuart answers this:
// it avoid issues when bonds in the dihedral are linear
// by switching the dihedral off beforehand.
// This is the reason for both the sin == 0 checks and the
// tspjik = Sp2(..) calls.
// Unfortunately, this is not exactly stated in the original paper.
// It might be similar in purpose to the H(sin - s^min) term that
// appears in that paper, but can not be found in original REBO papers.
flt_t
dtsjik
;
flt_t
tspjik
=
Sp2
(
cos321
,
thmin
,
thmax
,
&
dtsjik
);
dtsjik
=
-
dtsjik
;
int
ll
;
for
(
ll
=
0
;
ll
<
num_j
;
ll
++
)
{
int
l
=
neighs_j
[
ll
];
if
(
l
==
i
||
l
==
k
)
continue
;
int
ltype
=
map
[
x
[
l
].
w
];
int
a4
=
l
;
flt_t
r34x
=
x
[
a3
].
x
-
x
[
a4
].
x
;
flt_t
r34y
=
x
[
a3
].
y
-
x
[
a4
].
y
;
flt_t
r34z
=
x
[
a3
].
z
-
x
[
a4
].
z
;
flt_t
r34mag
=
overloaded
::
sqrt
(
r34x
*
r34x
+
r34y
*
r34y
+
r34z
*
r34z
);
flt_t
cos234
=
(
r32x
*
r34x
+
r32y
*
r34y
+
r32z
*
r34z
)
/
(
r23mag
*
r34mag
);
cos234
=
fmin_nonan
<
flt_t
>
(
1
,
fmax_nonan
<
flt_t
>
(
-
1
,
cos234
));
flt_t
sin234
=
overloaded
::
sqrt
(
1
-
cos234
*
cos234
);
if
(
sin234
==
0
)
continue
;
flt_t
sinl2i
=
1
/
(
sin234
*
sin234
);
flt_t
rjl2i
=
1
/
(
r34mag
*
r34mag
);
flt_t
rcminjl
=
ka
->
params
.
rcmin
[
jtype
][
ltype
];
flt_t
rcmaxpjl
=
ka
->
params
.
rcmaxp
[
jtype
][
ltype
];
flt_t
dw34
;
flt_t
w34
=
Sp
(
r34mag
,
rcminjl
,
rcmaxpjl
,
&
dw34
);
flt_t
rr
=
(
r23mag
*
r23mag
)
-
(
r34mag
*
r34mag
);
flt_t
r24x
=
r23x
+
r34x
;
flt_t
r24y
=
r23y
+
r34y
;
flt_t
r24z
=
r23z
+
r34z
;
flt_t
r242
=
(
r24x
*
r24x
)
+
(
r24y
*
r24y
)
+
(
r24z
*
r24z
);
flt_t
rijrjl
=
2
*
r23mag
*
r34mag
;
flt_t
rjl2
=
r34mag
*
r34mag
;
flt_t
dctjl
=
(
-
rr
+
r242
)
/
(
rijrjl
*
rjl2
);
flt_t
dctji
=
(
rr
+
r242
)
/
(
rijrjl
*
r23mag
*
r23mag
);
flt_t
dctil
=
-
2
/
rijrjl
;
flt_t
dtsijl
;
flt_t
tspijl
=
Sp2
(
cos234
,
thmin
,
thmax
,
&
dtsijl
);
dtsijl
=
-
dtsijl
;
// need minus sign
flt_t
prefactor
=
VA
;
flt_t
cross321x
=
(
r32y
*
r21z
)
-
(
r32z
*
r21y
);
flt_t
cross321y
=
(
r32z
*
r21x
)
-
(
r32x
*
r21z
);
flt_t
cross321z
=
(
r32x
*
r21y
)
-
(
r32y
*
r21x
);
flt_t
cross234x
=
(
r23y
*
r34z
)
-
(
r23z
*
r34y
);
flt_t
cross234y
=
(
r23z
*
r34x
)
-
(
r23x
*
r34z
);
flt_t
cross234z
=
(
r23x
*
r34y
)
-
(
r23y
*
r34x
);
flt_t
cwnum
=
(
cross321x
*
cross234x
)
+
(
cross321y
*
cross234y
)
+
(
cross321z
*
cross234z
);
flt_t
cwnom
=
r21mag
*
r34mag
*
r23mag
*
r23mag
*
sin321
*
sin234
;
flt_t
om1234
=
cwnum
/
cwnom
;
flt_t
cw
=
om1234
;
sum_omega
+=
((
1
-
(
om1234
*
om1234
))
*
w21
*
w34
)
*
(
1
-
tspjik
)
*
(
1
-
tspijl
);
if
(
VA
==
static_cast
<
flt_t
>
(
0.0
))
continue
;
flt_t
dt1dik
=
(
rik2i
)
-
(
dctik
*
sink2i
*
cos321
);
flt_t
dt1djk
=
(
-
dctjk
*
sink2i
*
cos321
);
flt_t
dt1djl
=
(
rjl2i
)
-
(
dctjl
*
sinl2i
*
cos234
);
flt_t
dt1dil
=
(
-
dctil
*
sinl2i
*
cos234
);
flt_t
dt1dij
=
(
2
/
(
r23mag
*
r23mag
))
-
(
dctij
*
sink2i
*
cos321
)
-
(
dctji
*
sinl2i
*
cos234
);
flt_t
dt2dikx
=
(
-
r23z
*
cross234y
)
+
(
r23y
*
cross234z
);
flt_t
dt2diky
=
(
-
r23x
*
cross234z
)
+
(
r23z
*
cross234x
);
flt_t
dt2dikz
=
(
-
r23y
*
cross234x
)
+
(
r23x
*
cross234y
);
flt_t
dt2djlx
=
(
-
r23y
*
cross321z
)
+
(
r23z
*
cross321y
);
flt_t
dt2djly
=
(
-
r23z
*
cross321x
)
+
(
r23x
*
cross321z
);
flt_t
dt2djlz
=
(
-
r23x
*
cross321y
)
+
(
r23y
*
cross321x
);
flt_t
dt2dijx
=
(
r21z
*
cross234y
)
-
(
r34z
*
cross321y
)
-
flt_t
(
r21y
*
cross234z
)
+
(
r34y
*
cross321z
);
flt_t
dt2dijy
=
(
r21x
*
cross234z
)
-
(
r34x
*
cross321z
)
-
flt_t
(
r21z
*
cross234x
)
+
(
r34z
*
cross321x
);
flt_t
dt2dijz
=
(
r21y
*
cross234x
)
-
(
r34y
*
cross321x
)
-
flt_t
(
r21x
*
cross234y
)
+
(
r34x
*
cross321y
);
flt_t
aa
=
(
prefactor
*
2
*
cw
/
cwnom
)
*
w21
*
w34
*
(
1
-
tspjik
)
*
(
1
-
tspijl
);
flt_t
aaa1
=
-
prefactor
*
(
1
-
(
om1234
*
om1234
))
*
(
1
-
tspjik
)
*
(
1
-
tspijl
);
flt_t
aaa2
=
-
prefactor
*
(
1
-
(
om1234
*
om1234
))
*
w21
*
w34
;
flt_t
at2
=
aa
*
cwnum
;
flt_t
fcijpc
=
(
-
dt1dij
*
at2
)
+
(
aaa2
*
dtsjik
*
dctij
*
(
1
-
tspijl
))
+
(
aaa2
*
dtsijl
*
dctji
*
(
1
-
tspjik
));
flt_t
fcikpc
=
(
-
dt1dik
*
at2
)
+
(
aaa2
*
dtsjik
*
dctik
*
(
1
-
tspijl
));
flt_t
fcjlpc
=
(
-
dt1djl
*
at2
)
+
(
aaa2
*
dtsijl
*
dctjl
*
(
1
-
tspjik
));
flt_t
fcjkpc
=
(
-
dt1djk
*
at2
)
+
(
aaa2
*
dtsjik
*
dctjk
*
(
1
-
tspijl
));
flt_t
fcilpc
=
(
-
dt1dil
*
at2
)
+
(
aaa2
*
dtsijl
*
dctil
*
(
1
-
tspjik
));
flt_t
F23x
=
(
fcijpc
*
r23x
)
+
(
aa
*
dt2dijx
);
flt_t
F23y
=
(
fcijpc
*
r23y
)
+
(
aa
*
dt2dijy
);
flt_t
F23z
=
(
fcijpc
*
r23z
)
+
(
aa
*
dt2dijz
);
flt_t
F12x
=
(
fcikpc
*
r21x
)
+
(
aa
*
dt2dikx
);
flt_t
F12y
=
(
fcikpc
*
r21y
)
+
(
aa
*
dt2diky
);
flt_t
F12z
=
(
fcikpc
*
r21z
)
+
(
aa
*
dt2dikz
);
flt_t
F34x
=
(
fcjlpc
*
r34x
)
+
(
aa
*
dt2djlx
);
flt_t
F34y
=
(
fcjlpc
*
r34y
)
+
(
aa
*
dt2djly
);
flt_t
F34z
=
(
fcjlpc
*
r34z
)
+
(
aa
*
dt2djlz
);
flt_t
F31x
=
(
fcjkpc
*
r31x
);
flt_t
F31y
=
(
fcjkpc
*
r31y
);
flt_t
F31z
=
(
fcjkpc
*
r31z
);
flt_t
F24x
=
(
fcilpc
*
r24x
);
flt_t
F24y
=
(
fcilpc
*
r24y
);
flt_t
F24z
=
(
fcilpc
*
r24z
);
flt_t
f1x
=
-
F12x
-
F31x
;
flt_t
f1y
=
-
F12y
-
F31y
;
flt_t
f1z
=
-
F12z
-
F31z
;
flt_t
f2x
=
F12x
+
F31x
;
flt_t
f2y
=
F12y
+
F31y
;
flt_t
f2z
=
F12z
+
F31z
;
flt_t
f3x
=
F34x
+
F24x
;
flt_t
f3y
=
F34y
+
F24y
;
flt_t
f3z
=
F34z
+
F24z
;
flt_t
f4x
=
-
F34x
-
F24x
;
flt_t
f4y
=
-
F34y
-
F24y
;
flt_t
f4z
=
-
F34z
-
F24z
;
fij
[
0
]
+=
F23x
+
F24x
-
F31x
;
fij
[
1
]
+=
F23y
+
F24y
-
F31y
;
fij
[
2
]
+=
F23z
+
F24z
-
F31z
;
// coordination forces
flt_t
tmp20
=
VA
*
((
1
-
(
om1234
*
om1234
)))
*
(
1
-
tspjik
)
*
(
1
-
tspijl
)
*
dw21
*
w34
/
r21mag
;
f2x
-=
tmp20
*
r21x
;
f2y
-=
tmp20
*
r21y
;
f2z
-=
tmp20
*
r21z
;
f1x
+=
tmp20
*
r21x
;
f1y
+=
tmp20
*
r21y
;
f1z
+=
tmp20
*
r21z
;
flt_t
tmp21
=
VA
*
((
1
-
(
om1234
*
om1234
)))
*
(
1
-
tspjik
)
*
(
1
-
tspijl
)
*
w21
*
dw34
/
r34mag
;
f3x
-=
tmp21
*
r34x
;
f3y
-=
tmp21
*
r34y
;
f3z
-=
tmp21
*
r34z
;
f4x
+=
tmp21
*
r34x
;
f4y
+=
tmp21
*
r34y
;
f4z
+=
tmp21
*
r34z
;
result_f
[
a1
].
x
+=
f1x
;
result_f
[
a1
].
y
+=
f1y
;
result_f
[
a1
].
z
+=
f1z
;
result_f
[
a2
].
x
+=
f2x
;
result_f
[
a2
].
y
+=
f2y
;
result_f
[
a2
].
z
+=
f2z
;
result_f
[
a3
].
x
+=
f3x
;
result_f
[
a3
].
y
+=
f3y
;
result_f
[
a3
].
z
+=
f3z
;
result_f
[
a4
].
x
+=
f4x
;
result_f
[
a4
].
y
+=
f4y
;
result_f
[
a4
].
z
+=
f4z
;
}
}
return
sum_omega
;
}
/*
* Implements a scalar implementation the force update due to splines.
* It is used for both pi^rc_ij and T_ij.
* Occurs four times in each bondorder and bondorderLJ.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
void
frebo_N_spline_force
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
,
flt_t
VA
,
flt_t
dN
,
flt_t
dNconj
,
flt_t
Nconj
)
{
int
*
map
=
ka
->
map
;
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
flt_t
*
nC
=
ka
->
nC
;
flt_t
*
nH
=
ka
->
nH
;
flt_t
Nmin
=
ka
->
params
.
Nmin
;
flt_t
Nmax
=
ka
->
params
.
Nmax
;
int
itype
=
map
[
x
[
i
].
w
];
int
*
neighs
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
i
];
int
knum
=
ka
->
neigh_rebo
.
num
[
i
];
int
kk
;
for
(
kk
=
0
;
kk
<
knum
;
kk
++
)
{
int
k
=
neighs
[
kk
];
if
(
k
==
j
)
continue
;
flt_t
rikx
=
x
[
i
].
x
-
x
[
k
].
x
;
flt_t
riky
=
x
[
i
].
y
-
x
[
k
].
y
;
flt_t
rikz
=
x
[
i
].
z
-
x
[
k
].
z
;
flt_t
rikmag
=
overloaded
::
sqrt
(
rikx
*
rikx
+
riky
*
riky
+
rikz
*
rikz
);
int
ktype
=
map
[
x
[
k
].
w
];
flt_t
rcminik
=
ka
->
params
.
rcmin
[
itype
][
ktype
];
flt_t
rcmaxik
=
ka
->
params
.
rcmax
[
itype
][
ktype
];
flt_t
dwik
;
flt_t
wik
=
Sp
(
rikmag
,
rcminik
,
rcmaxik
,
&
dwik
);
flt_t
Nki
=
nC
[
k
]
+
nH
[
k
]
-
wik
;
flt_t
dNki
;
flt_t
SpN
=
Sp
(
Nki
,
Nmin
,
Nmax
,
&
dNki
);
flt_t
fdN
=
VA
*
dN
*
dwik
/
rikmag
;
flt_t
fdNconj
=
VA
*
dNconj
*
2
*
Nconj
*
dwik
*
SpN
/
rikmag
;
flt_t
ffactor
=
fdN
;
if
(
ktype
==
0
)
ffactor
+=
fdNconj
;
flt_t
fkx
=
ffactor
*
rikx
;
flt_t
fky
=
ffactor
*
riky
;
flt_t
fkz
=
ffactor
*
rikz
;
result_f
[
i
].
x
-=
fkx
;
result_f
[
i
].
y
-=
fky
;
result_f
[
i
].
z
-=
fkz
;
result_f
[
k
].
x
+=
fkx
;
result_f
[
k
].
y
+=
fky
;
result_f
[
k
].
z
+=
fkz
;
if
(
ktype
!=
0
||
fabs
(
dNki
)
<=
TOL
)
continue
;
int
*
neighs_k
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
k
];
int
nnum
=
ka
->
neigh_rebo
.
num
[
k
];
int
nn
;
for
(
nn
=
0
;
nn
<
nnum
;
nn
++
)
{
int
n
=
neighs_k
[
nn
];
if
(
n
==
i
)
continue
;
flt_t
rknx
=
x
[
k
].
x
-
x
[
n
].
x
;
flt_t
rkny
=
x
[
k
].
y
-
x
[
n
].
y
;
flt_t
rknz
=
x
[
k
].
z
-
x
[
n
].
z
;
flt_t
rknmag
=
overloaded
::
sqrt
(
rknx
*
rknx
+
rkny
*
rkny
+
rknz
*
rknz
);
int
ntype
=
map
[
x
[
n
].
w
];
flt_t
rcminkn
=
ka
->
params
.
rcmin
[
ktype
][
ntype
];
flt_t
rcmaxkn
=
ka
->
params
.
rcmax
[
ktype
][
ntype
];
flt_t
dwkn
;
Sp
(
rknmag
,
rcminkn
,
rcmaxkn
,
&
dwkn
);
flt_t
ffactor
=
VA
*
dNconj
*
2
*
Nconj
*
wik
*
dNki
*
dwkn
/
rknmag
;
result_f
[
k
].
x
-=
ffactor
*
rknx
;
result_f
[
k
].
y
-=
ffactor
*
rkny
;
result_f
[
k
].
z
-=
ffactor
*
rknz
;
result_f
[
n
].
x
+=
ffactor
*
rknx
;
result_f
[
n
].
y
+=
ffactor
*
rkny
;
result_f
[
n
].
z
+=
ffactor
*
rknz
;
}
}
}
/*
* This data-structure contains the result of a search through neighbor-lists.
* It is used to calculate C_ij and the corresponding force updates.
*/
template
<
typename
flt_t
>
struct
LennardJonesPathAIREBOT
{
AtomAIREBOT
<
flt_t
>
del
[
3
];
int
num
;
flt_t
w
[
3
];
flt_t
dw
[
3
];
flt_t
r
[
3
];
int
idx
[
4
];
};
/*
* Checks a candidate path stored in idxs whether it is better than *path
* and updates *path accordingly.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
ref_lennard_jones_test_path_single
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
flt_t
best
,
int
num
,
int
*
idxs
,
LennardJonesPathAIREBOT
<
flt_t
>
*
path
)
{
LennardJonesPathAIREBOT
<
flt_t
>
result
;
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
result
.
num
=
num
;
flt_t
combined
=
1
;
for
(
int
i
=
num
-
2
;
i
>=
0
;
i
--
)
{
int
a0
=
idxs
[
i
+
0
];
int
a1
=
idxs
[
i
+
1
];
flt_t
delx
=
x
[
a1
].
x
-
x
[
a0
].
x
;
flt_t
dely
=
x
[
a1
].
y
-
x
[
a0
].
y
;
flt_t
delz
=
x
[
a1
].
z
-
x
[
a0
].
z
;
flt_t
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
int
type0
=
map
[
x
[
a0
].
w
];
int
type1
=
map
[
x
[
a1
].
w
];
if
(
rsq
>=
ka
->
params
.
rcmaxsq
[
type0
][
type1
])
return
best
;
flt_t
r
=
overloaded
::
sqrt
(
rsq
);
flt_t
dw
,
w
=
Sp
<
flt_t
>
(
r
,
ka
->
params
.
rcmin
[
type0
][
type1
],
ka
->
params
.
rcmax
[
type0
][
type1
],
&
dw
);
if
(
w
==
0
)
return
best
;
combined
*=
w
;
if
(
combined
<=
best
)
return
best
;
result
.
idx
[
i
]
=
a0
;
result
.
del
[
i
].
x
=
delx
;
result
.
del
[
i
].
y
=
dely
;
result
.
del
[
i
].
z
=
delz
;
result
.
r
[
i
]
=
r
;
result
.
w
[
i
]
=
w
;
result
.
dw
[
i
]
=
dw
;
}
result
.
idx
[
num
-
1
]
=
idxs
[
num
-
1
];
*
path
=
result
;
return
combined
;
}
/*
* Test through all paths surrounding i and j to find the corresponding
* best path. Uses the same iteration ordering as FLJ() does.
* Note that an optimization would use the j neighlist instead in the inner
* loop.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
ref_lennard_jones_test_path
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
,
flt_t
rij
,
flt_t
rcmax
,
LennardJonesPathAIREBOT
<
flt_t
>
*
path
)
{
int
idxs
[
4
];
idxs
[
0
]
=
i
;
idxs
[
1
]
=
j
;
flt_t
best
=
0
;
if
(
rij
<=
rcmax
)
{
best
=
ref_lennard_jones_test_path_single
(
ka
,
best
,
2
,
idxs
,
path
);
if
(
best
==
static_cast
<
flt_t
>
(
1.0
))
return
0
;
}
for
(
int
kk
=
0
;
kk
<
ka
->
neigh_rebo
.
num
[
i
];
kk
++
)
{
int
k
=
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
i
]
+
kk
];
if
(
k
==
j
)
continue
;
idxs
[
1
]
=
k
;
idxs
[
2
]
=
j
;
best
=
ref_lennard_jones_test_path_single
(
ka
,
best
,
3
,
idxs
,
path
);
if
(
best
==
static_cast
<
flt_t
>
(
1.0
))
return
0
;
for
(
int
mm
=
0
;
mm
<
ka
->
neigh_rebo
.
num
[
k
];
mm
++
)
{
int
m
=
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
k
]
+
mm
];
if
(
m
==
i
||
m
==
j
)
continue
;
idxs
[
2
]
=
m
;
idxs
[
3
]
=
j
;
best
=
ref_lennard_jones_test_path_single
(
ka
,
best
,
4
,
idxs
,
path
);
if
(
best
==
static_cast
<
flt_t
>
(
1.0
))
return
0
;
}
}
return
1
-
best
;
}
/*
* Conducts the force update due to C_ij, given the active path.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
void
ref_lennard_jones_force_path
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
flt_t
dC
,
LennardJonesPathAIREBOT
<
flt_t
>
*
path
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
for
(
int
i
=
0
;
i
<
path
->
num
-
1
;
i
++
)
{
flt_t
fpair
=
dC
*
path
->
dw
[
i
]
/
path
->
r
[
i
];
for
(
int
j
=
0
;
j
<
path
->
num
-
1
;
j
++
)
{
if
(
i
!=
j
)
fpair
*=
path
->
w
[
j
];
}
result_f
[
path
->
idx
[
i
+
0
]].
x
-=
fpair
*
path
->
del
[
i
].
x
;
result_f
[
path
->
idx
[
i
+
0
]].
y
-=
fpair
*
path
->
del
[
i
].
y
;
result_f
[
path
->
idx
[
i
+
0
]].
z
-=
fpair
*
path
->
del
[
i
].
z
;
result_f
[
path
->
idx
[
i
+
1
]].
x
+=
fpair
*
path
->
del
[
i
].
x
;
result_f
[
path
->
idx
[
i
+
1
]].
y
+=
fpair
*
path
->
del
[
i
].
y
;
result_f
[
path
->
idx
[
i
+
1
]].
z
+=
fpair
*
path
->
del
[
i
].
z
;
}
}
/*
* Calculate the bondorderLJ term.
*/
template
<
typename
flt_t
,
typename
acc_t
>
inline
flt_t
ref_lennard_jones_bondorder
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
,
flt_t
VA
,
acc_t
fij
[
3
])
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
int
itype
=
map
[
x
[
i
].
w
];
int
jtype
=
map
[
x
[
j
].
w
];
flt_t
delx
=
x
[
i
].
x
-
x
[
j
].
x
;
flt_t
dely
=
x
[
i
].
y
-
x
[
j
].
y
;
flt_t
delz
=
x
[
i
].
z
-
x
[
j
].
z
;
flt_t
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
flt_t
rij
=
overloaded
::
sqrt
(
rsq
);
flt_t
rcminij
=
ka
->
params
.
rcmin
[
itype
][
jtype
];
flt_t
rcmaxij
=
ka
->
params
.
rcmax
[
itype
][
jtype
];
flt_t
dwij
;
flt_t
wij
=
Sp
(
rij
,
rcminij
,
rcmaxij
,
&
dwij
);
flt_t
the_r
=
ka
->
params
.
rcmin
[
itype
][
jtype
];
flt_t
scale
=
the_r
/
rij
;
flt_t
Nij
=
ka
->
nH
[
i
]
+
ka
->
nC
[
i
]
-
wij
;
flt_t
Nji
=
ka
->
nH
[
j
]
+
ka
->
nC
[
j
]
-
wij
;
flt_t
NconjtmpI
;
acc_t
fijc
[
3
]
=
{
0
},
fjic
[
3
]
=
{
0
};
flt_t
pij
=
frebo_pij
<
flt_t
,
acc_t
>
(
ka
,
i
,
j
,
delx
*
scale
,
dely
*
scale
,
delz
*
scale
,
the_r
,
wij
,
0.0
,
&
NconjtmpI
,
fijc
);
flt_t
NconjtmpJ
;
flt_t
pji
=
frebo_pij
<
flt_t
,
acc_t
>
(
ka
,
j
,
i
,
-
delx
*
scale
,
-
dely
*
scale
,
-
delz
*
scale
,
the_r
,
wij
,
0.0
,
&
NconjtmpJ
,
fjic
);
flt_t
Nijconj
=
1.0
+
(
NconjtmpI
*
NconjtmpI
)
+
(
NconjtmpJ
*
NconjtmpJ
);
flt_t
dN3_pi_rc
[
3
];
flt_t
pi_rc
=
frebo_pi_rc
<
flt_t
,
acc_t
>
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3_pi_rc
);
flt_t
dN3_Tij
[
3
];
flt_t
Tij
=
frebo_Tij
<
flt_t
,
acc_t
>
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3_Tij
);
flt_t
sum_omega
=
0
;
if
(
fabs
(
Tij
)
>
TOL
)
{
sum_omega
=
frebo_sum_omega
<
flt_t
,
acc_t
>
(
ka
,
i
,
j
,
delx
*
scale
,
dely
*
scale
,
delz
*
scale
,
the_r
,
0.0
,
fijc
);
}
flt_t
pi_dh
=
Tij
*
sum_omega
;
flt_t
bij
=
0.5
*
(
pij
+
pji
)
+
pi_rc
+
pi_dh
;
flt_t
dStb
;
flt_t
Stb
=
Sp2
<
flt_t
>
(
bij
,
ka
->
params
.
bLJmin
[
itype
][
jtype
],
ka
->
params
.
bLJmax
[
itype
][
jtype
],
&
dStb
);
if
(
dStb
!=
0
)
{
flt_t
pij_reverse
=
frebo_pij
<
flt_t
,
acc_t
>
(
ka
,
i
,
j
,
delx
*
scale
,
dely
*
scale
,
delz
*
scale
,
the_r
,
wij
,
VA
*
dStb
,
&
NconjtmpI
,
fijc
);
flt_t
pji_reverse
=
frebo_pij
<
flt_t
,
acc_t
>
(
ka
,
j
,
i
,
-
delx
*
scale
,
-
dely
*
scale
,
-
delz
*
scale
,
the_r
,
wij
,
VA
*
dStb
,
&
NconjtmpJ
,
fjic
);
fijc
[
0
]
-=
fjic
[
0
];
fijc
[
1
]
-=
fjic
[
1
];
fijc
[
2
]
-=
fjic
[
2
];
frebo_N_spline_force
<
flt_t
,
acc_t
>
(
ka
,
i
,
j
,
VA
*
dStb
,
dN3_pi_rc
[
0
],
dN3_pi_rc
[
2
],
NconjtmpI
);
frebo_N_spline_force
<
flt_t
,
acc_t
>
(
ka
,
j
,
i
,
VA
*
dStb
,
dN3_pi_rc
[
1
],
dN3_pi_rc
[
2
],
NconjtmpJ
);
if
(
fabs
(
Tij
)
>
TOL
)
{
flt_t
sum_omega_reverse
=
frebo_sum_omega
<
flt_t
,
acc_t
>
(
ka
,
i
,
j
,
delx
*
scale
,
dely
*
scale
,
delz
*
scale
,
the_r
,
VA
*
dStb
*
Tij
,
fijc
);
frebo_N_spline_force
(
ka
,
i
,
j
,
VA
*
dStb
*
sum_omega
,
dN3_Tij
[
0
],
dN3_Tij
[
2
],
NconjtmpI
);
frebo_N_spline_force
(
ka
,
j
,
i
,
VA
*
dStb
*
sum_omega
,
dN3_Tij
[
1
],
dN3_Tij
[
2
],
NconjtmpJ
);
}
assert
(
fij
[
0
]
==
0
);
assert
(
fij
[
1
]
==
0
);
assert
(
fij
[
2
]
==
0
);
fij
[
0
]
=
scale
*
(
fijc
[
0
]
-
(
delx
*
delx
*
fijc
[
0
]
+
dely
*
delx
*
fijc
[
1
]
+
delz
*
delx
*
fijc
[
2
])
/
rsq
);
fij
[
1
]
=
scale
*
(
fijc
[
1
]
-
(
delx
*
dely
*
fijc
[
0
]
+
dely
*
dely
*
fijc
[
1
]
+
delz
*
dely
*
fijc
[
2
])
/
rsq
);
fij
[
2
]
=
scale
*
(
fijc
[
2
]
-
(
delx
*
delz
*
fijc
[
0
]
+
dely
*
delz
*
fijc
[
1
]
+
delz
*
delz
*
fijc
[
2
])
/
rsq
);
}
return
Stb
;
}
/*
* Scalar reference implementation of neighbor routine.
*/
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_rebo_neigh
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
)
{
int
offset
=
ka
->
neigh_from_atom
*
ka
->
num_neighs_per_atom
;
for
(
int
i
=
ka
->
neigh_from_atom
;
i
<
ka
->
neigh_to_atom
;
i
++
)
{
ka
->
neigh_rebo
.
offset
[
i
]
=
offset
;
int
itype
=
ka
->
map
[
ka
->
x
[
i
].
w
];
int
n
=
0
;
ka
->
nC
[
i
]
=
0
;
ka
->
nH
[
i
]
=
0
;
for
(
int
j
=
0
;
j
<
ka
->
neigh_lmp
.
num
[
i
];
j
++
)
{
int
ji
=
ka
->
neigh_lmp
.
entries
[
ka
->
neigh_lmp
.
offset
[
i
]
+
j
];
flt_t
delx
=
ka
->
x
[
i
].
x
-
ka
->
x
[
ji
].
x
;
flt_t
dely
=
ka
->
x
[
i
].
y
-
ka
->
x
[
ji
].
y
;
flt_t
delz
=
ka
->
x
[
i
].
z
-
ka
->
x
[
ji
].
z
;
flt_t
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
int
jtype
=
ka
->
map
[
ka
->
x
[
ji
].
w
];
if
(
rsq
<
ka
->
params
.
rcmaxsq
[
itype
][
jtype
])
{
ka
->
neigh_rebo
.
entries
[
offset
+
n
++
]
=
ji
;
flt_t
rcmin
=
ka
->
params
.
rcmin
[
itype
][
jtype
];
flt_t
rcmax
=
ka
->
params
.
rcmax
[
itype
][
jtype
];
if
(
jtype
==
CARBON
)
ka
->
nC
[
i
]
+=
Sp
<
flt_t
>
(
overloaded
::
sqrt
(
rsq
),
rcmin
,
rcmax
,
NULL
);
else
ka
->
nH
[
i
]
+=
Sp
<
flt_t
>
(
overloaded
::
sqrt
(
rsq
),
rcmin
,
rcmax
,
NULL
);
}
}
ka
->
neigh_rebo
.
num
[
i
]
=
n
;
offset
+=
n
;
}
}
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_torsion_single_interaction
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
ResultForceT
<
acc_t
>
*
f
=
ka
->
result_f
;
flt_t
(
*
rcmin
)[
2
]
=
ka
->
params
.
rcmin
;
flt_t
(
*
rcmax
)[
2
]
=
ka
->
params
.
rcmax
;
flt_t
(
*
epsilonT
)[
2
]
=
ka
->
params
.
epsilonT
;
flt_t
thmin
=
ka
->
params
.
thmin
;
flt_t
thmax
=
ka
->
params
.
thmax
;
int
itype
=
map
[
x
[
i
].
w
];
flt_t
xtmp
=
x
[
i
].
x
;
flt_t
ytmp
=
x
[
i
].
y
;
flt_t
ztmp
=
x
[
i
].
z
;
int
*
REBO_neighs_i
=
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
i
]];
int
jnum
=
ka
->
neigh_rebo
.
num
[
i
];
int
jtype
=
map
[
x
[
j
].
w
];
flt_t
del32x
=
x
[
j
].
x
-
x
[
i
].
x
;
flt_t
del32y
=
x
[
j
].
y
-
x
[
i
].
y
;
flt_t
del32z
=
x
[
j
].
z
-
x
[
i
].
z
;
flt_t
rsq
=
del32x
*
del32x
+
del32y
*
del32y
+
del32z
*
del32z
;
flt_t
r32
=
overloaded
::
sqrt
(
rsq
);
flt_t
del23x
=
-
del32x
;
flt_t
del23y
=
-
del32y
;
flt_t
del23z
=
-
del32z
;
flt_t
r23
=
r32
;
flt_t
dw23
,
w23
=
Sp
<
flt_t
>
(
r23
,
rcmin
[
itype
][
jtype
],
rcmax
[
itype
][
jtype
],
&
dw23
);
assert
(
itype
==
0
);
assert
(
jtype
==
0
);
for
(
int
kk
=
0
;
kk
<
jnum
;
kk
++
)
{
int
k
=
REBO_neighs_i
[
kk
];
int
ktype
=
map
[
x
[
k
].
w
];
if
(
k
==
j
)
continue
;
flt_t
del21x
=
x
[
i
].
x
-
x
[
k
].
x
;
flt_t
del21y
=
x
[
i
].
y
-
x
[
k
].
y
;
flt_t
del21z
=
x
[
i
].
z
-
x
[
k
].
z
;
flt_t
rsq
=
del21x
*
del21x
+
del21y
*
del21y
+
del21z
*
del21z
;
flt_t
r21
=
overloaded
::
sqrt
(
rsq
);
flt_t
cos321
=
-
((
del21x
*
del32x
)
+
(
del21y
*
del32y
)
+
(
del21z
*
del32z
))
/
(
r21
*
r32
);
cos321
=
fmin
(
cos321
,
1
);
cos321
=
fmax
(
cos321
,
-
1
);
flt_t
sin321
=
overloaded
::
sqrt
(
1
-
cos321
*
cos321
);
if
(
sin321
<
TOL
)
continue
;
flt_t
deljkx
=
del21x
-
del23x
;
flt_t
deljky
=
del21y
-
del23y
;
flt_t
deljkz
=
del21z
-
del23z
;
flt_t
rjk2
=
deljkx
*
deljkx
+
deljky
*
deljky
+
deljkz
*
deljkz
;
flt_t
rjk
=
overloaded
::
sqrt
(
rjk2
);
flt_t
rik2
=
r21
*
r21
;
flt_t
dw21
,
w21
=
Sp
<
flt_t
>
(
r21
,
rcmin
[
itype
][
ktype
],
rcmax
[
itype
][
ktype
],
&
dw21
);
flt_t
rij
=
r32
;
flt_t
rik
=
r21
;
flt_t
rij2
=
r32
*
r32
;
flt_t
costmp
=
static_cast
<
flt_t
>
(
0.5
)
*
(
rij2
+
rik2
-
rjk2
)
/
rij
/
rik
;
flt_t
dtsjik
,
tspjik
=
Sp2
<
flt_t
>
(
costmp
,
thmin
,
thmax
,
&
dtsjik
);
dtsjik
=
-
dtsjik
;
int
*
REBO_neighs_j
=
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
j
]];
int
lnum
=
ka
->
neigh_rebo
.
num
[
j
];
for
(
int
ll
=
0
;
ll
<
lnum
;
ll
++
)
{
int
l
=
REBO_neighs_j
[
ll
];
int
ltype
=
map
[
x
[
l
].
w
];
if
(
l
==
i
||
l
==
k
)
continue
;
flt_t
del34x
=
x
[
j
].
x
-
x
[
l
].
x
;
flt_t
del34y
=
x
[
j
].
y
-
x
[
l
].
y
;
flt_t
del34z
=
x
[
j
].
z
-
x
[
l
].
z
;
flt_t
rsq
=
del34x
*
del34x
+
del34y
*
del34y
+
del34z
*
del34z
;
flt_t
r34
=
overloaded
::
sqrt
(
rsq
);
flt_t
cos234
=
(
del32x
*
del34x
+
del32y
*
del34y
+
del32z
*
del34z
)
/
(
r32
*
r34
);
cos234
=
fmin
(
cos234
,
1
);
cos234
=
fmax
(
cos234
,
-
1
);
flt_t
sin234
=
overloaded
::
sqrt
(
1
-
cos234
*
cos234
);
if
(
sin234
<
TOL
)
continue
;
flt_t
dw34
,
w34
=
Sp
<
flt_t
>
(
r34
,
rcmin
[
jtype
][
ltype
],
rcmax
[
jtype
][
ltype
],
&
dw34
);
flt_t
delilx
=
del23x
+
del34x
;
flt_t
delily
=
del23y
+
del34y
;
flt_t
delilz
=
del23z
+
del34z
;
flt_t
ril2
=
delilx
*
delilx
+
delily
*
delily
+
delilz
*
delilz
;
flt_t
ril
=
overloaded
::
sqrt
(
ril2
);
flt_t
rjl2
=
r34
*
r34
;
flt_t
rjl
=
r34
;
flt_t
costmp
=
static_cast
<
flt_t
>
(
0.5
)
*
(
rij2
+
rjl2
-
ril2
)
/
rij
/
rjl
;
flt_t
dtsijl
,
tspijl
=
Sp2
<
flt_t
>
(
costmp
,
thmin
,
thmax
,
&
dtsijl
);
dtsijl
=
-
dtsijl
;
//need minus sign
flt_t
cross321x
=
(
del32y
*
del21z
)
-
(
del32z
*
del21y
);
flt_t
cross321y
=
(
del32z
*
del21x
)
-
(
del32x
*
del21z
);
flt_t
cross321z
=
(
del32x
*
del21y
)
-
(
del32y
*
del21x
);
flt_t
cross321mag
=
overloaded
::
sqrt
(
cross321x
*
cross321x
+
cross321y
*
cross321y
+
cross321z
*
cross321z
);
flt_t
cross234x
=
(
del23y
*
del34z
)
-
(
del23z
*
del34y
);
flt_t
cross234y
=
(
del23z
*
del34x
)
-
(
del23x
*
del34z
);
flt_t
cross234z
=
(
del23x
*
del34y
)
-
(
del23y
*
del34x
);
flt_t
cross234mag
=
overloaded
::
sqrt
(
cross234x
*
cross234x
+
cross234y
*
cross234y
+
cross234z
*
cross234z
);
flt_t
cwnum
=
(
cross321x
*
cross234x
)
+
(
cross321y
*
cross234y
)
+
(
cross321z
*
cross234z
);
flt_t
cwnom
=
r21
*
r34
*
r32
*
r32
*
sin321
*
sin234
;
flt_t
cw
=
cwnum
/
cwnom
;
flt_t
cw2
=
(
static_cast
<
flt_t
>
(
.5
)
*
(
1
-
cw
));
flt_t
ekijl
=
epsilonT
[
ktype
][
ltype
];
flt_t
Ec
=
256
*
ekijl
/
405
;
flt_t
Vtors
=
(
Ec
*
(
overloaded
::
pow
(
cw2
,
5
)))
-
(
ekijl
/
10
);
ka
->
result_eng
+=
Vtors
*
w21
*
w23
*
w34
*
(
1
-
tspjik
)
*
(
1
-
tspijl
);
flt_t
dndijx
=
(
cross234y
*
del21z
)
-
(
cross234z
*
del21y
);
flt_t
dndijy
=
(
cross234z
*
del21x
)
-
(
cross234x
*
del21z
);
flt_t
dndijz
=
(
cross234x
*
del21y
)
-
(
cross234y
*
del21x
);
flt_t
tmpvecx
=
(
del34y
*
cross321z
)
-
(
del34z
*
cross321y
);
flt_t
tmpvecy
=
(
del34z
*
cross321x
)
-
(
del34x
*
cross321z
);
flt_t
tmpvecz
=
(
del34x
*
cross321y
)
-
(
del34y
*
cross321x
);
dndijx
=
dndijx
+
tmpvecx
;
dndijy
=
dndijy
+
tmpvecy
;
dndijz
=
dndijz
+
tmpvecz
;
flt_t
dndikx
=
(
del23y
*
cross234z
)
-
(
del23z
*
cross234y
);
flt_t
dndiky
=
(
del23z
*
cross234x
)
-
(
del23x
*
cross234z
);
flt_t
dndikz
=
(
del23x
*
cross234y
)
-
(
del23y
*
cross234x
);
flt_t
dndjlx
=
(
cross321y
*
del23z
)
-
(
cross321z
*
del23y
);
flt_t
dndjly
=
(
cross321z
*
del23x
)
-
(
cross321x
*
del23z
);
flt_t
dndjlz
=
(
cross321x
*
del23y
)
-
(
cross321y
*
del23x
);
flt_t
dcidij
=
((
r23
*
r23
)
-
(
r21
*
r21
)
+
(
rjk
*
rjk
))
/
(
2
*
r23
*
r23
*
r21
);
flt_t
dcidik
=
((
r21
*
r21
)
-
(
r23
*
r23
)
+
(
rjk
*
rjk
))
/
(
2
*
r23
*
r21
*
r21
);
flt_t
dcidjk
=
(
-
rjk
)
/
(
r23
*
r21
);
flt_t
dcjdji
=
((
r23
*
r23
)
-
(
r34
*
r34
)
+
(
ril
*
ril
))
/
(
2
*
r23
*
r23
*
r34
);
flt_t
dcjdjl
=
((
r34
*
r34
)
-
(
r23
*
r23
)
+
(
ril
*
ril
))
/
(
2
*
r23
*
r34
*
r34
);
flt_t
dcjdil
=
(
-
ril
)
/
(
r23
*
r34
);
flt_t
dsidij
=
(
-
cos321
/
sin321
)
*
dcidij
;
flt_t
dsidik
=
(
-
cos321
/
sin321
)
*
dcidik
;
flt_t
dsidjk
=
(
-
cos321
/
sin321
)
*
dcidjk
;
flt_t
dsjdji
=
(
-
cos234
/
sin234
)
*
dcjdji
;
flt_t
dsjdjl
=
(
-
cos234
/
sin234
)
*
dcjdjl
;
flt_t
dsjdil
=
(
-
cos234
/
sin234
)
*
dcjdil
;
flt_t
dxidij
=
(
r21
*
sin321
)
+
(
r23
*
r21
*
dsidij
);
flt_t
dxidik
=
(
r23
*
sin321
)
+
(
r23
*
r21
*
dsidik
);
flt_t
dxidjk
=
(
r23
*
r21
*
dsidjk
);
flt_t
dxjdji
=
(
r34
*
sin234
)
+
(
r23
*
r34
*
dsjdji
);
flt_t
dxjdjl
=
(
r23
*
sin234
)
+
(
r23
*
r34
*
dsjdjl
);
flt_t
dxjdil
=
(
r23
*
r34
*
dsjdil
);
flt_t
ddndij
=
(
dxidij
*
cross234mag
)
+
(
cross321mag
*
dxjdji
);
flt_t
ddndik
=
dxidik
*
cross234mag
;
flt_t
ddndjk
=
dxidjk
*
cross234mag
;
flt_t
ddndjl
=
cross321mag
*
dxjdjl
;
flt_t
ddndil
=
cross321mag
*
dxjdil
;
flt_t
dcwddn
=
-
cwnum
/
(
cwnom
*
cwnom
);
flt_t
dcwdn
=
1
/
cwnom
;
flt_t
dvpdcw
=
(
-
1
)
*
Ec
*
static_cast
<
flt_t
>
(
-
0.5
)
*
5
*
overloaded
::
pow
(
cw2
,
4
)
*
w23
*
w21
*
w34
*
(
1
-
tspjik
)
*
(
1
-
tspijl
);
flt_t
Ftmpx
=
dvpdcw
*
((
dcwdn
*
dndijx
)
+
(
dcwddn
*
ddndij
*
del23x
/
r23
));
flt_t
Ftmpy
=
dvpdcw
*
((
dcwdn
*
dndijy
)
+
(
dcwddn
*
ddndij
*
del23y
/
r23
));
flt_t
Ftmpz
=
dvpdcw
*
((
dcwdn
*
dndijz
)
+
(
dcwddn
*
ddndij
*
del23z
/
r23
));
flt_t
fix
=
Ftmpx
;
flt_t
fiy
=
Ftmpy
;
flt_t
fiz
=
Ftmpz
;
flt_t
fjx
=
-
Ftmpx
;
flt_t
fjy
=
-
Ftmpy
;
flt_t
fjz
=
-
Ftmpz
;
Ftmpx
=
dvpdcw
*
((
dcwdn
*
dndikx
)
+
(
dcwddn
*
ddndik
*
del21x
/
r21
));
Ftmpy
=
dvpdcw
*
((
dcwdn
*
dndiky
)
+
(
dcwddn
*
ddndik
*
del21y
/
r21
));
Ftmpz
=
dvpdcw
*
((
dcwdn
*
dndikz
)
+
(
dcwddn
*
ddndik
*
del21z
/
r21
));
fix
+=
Ftmpx
;
fiy
+=
Ftmpy
;
fiz
+=
Ftmpz
;
flt_t
fkx
=
-
Ftmpx
;
flt_t
fky
=
-
Ftmpy
;
flt_t
fkz
=
-
Ftmpz
;
Ftmpx
=
(
dvpdcw
*
dcwddn
*
ddndjk
*
deljkx
)
/
rjk
;
Ftmpy
=
(
dvpdcw
*
dcwddn
*
ddndjk
*
deljky
)
/
rjk
;
Ftmpz
=
(
dvpdcw
*
dcwddn
*
ddndjk
*
deljkz
)
/
rjk
;
fjx
+=
Ftmpx
;
fjy
+=
Ftmpy
;
fjz
+=
Ftmpz
;
fkx
-=
Ftmpx
;
fky
-=
Ftmpy
;
fkz
-=
Ftmpz
;
Ftmpx
=
dvpdcw
*
((
dcwdn
*
dndjlx
)
+
(
dcwddn
*
ddndjl
*
del34x
/
r34
));
Ftmpy
=
dvpdcw
*
((
dcwdn
*
dndjly
)
+
(
dcwddn
*
ddndjl
*
del34y
/
r34
));
Ftmpz
=
dvpdcw
*
((
dcwdn
*
dndjlz
)
+
(
dcwddn
*
ddndjl
*
del34z
/
r34
));
fjx
+=
Ftmpx
;
fjy
+=
Ftmpy
;
fjz
+=
Ftmpz
;
flt_t
flx
=
-
Ftmpx
;
flt_t
fly
=
-
Ftmpy
;
flt_t
flz
=
-
Ftmpz
;
Ftmpx
=
(
dvpdcw
*
dcwddn
*
ddndil
*
delilx
)
/
ril
;
Ftmpy
=
(
dvpdcw
*
dcwddn
*
ddndil
*
delily
)
/
ril
;
Ftmpz
=
(
dvpdcw
*
dcwddn
*
ddndil
*
delilz
)
/
ril
;
fix
+=
Ftmpx
;
fiy
+=
Ftmpy
;
fiz
+=
Ftmpz
;
flx
-=
Ftmpx
;
fly
-=
Ftmpy
;
flz
-=
Ftmpz
;
// coordination forces
flt_t
fpair
=
Vtors
*
dw21
*
w23
*
w34
*
(
1
-
tspjik
)
*
(
1
-
tspijl
)
/
r21
;
fix
-=
del21x
*
fpair
;
fiy
-=
del21y
*
fpair
;
fiz
-=
del21z
*
fpair
;
fkx
+=
del21x
*
fpair
;
fky
+=
del21y
*
fpair
;
fkz
+=
del21z
*
fpair
;
fpair
=
Vtors
*
w21
*
dw23
*
w34
*
(
1
-
tspjik
)
*
(
1
-
tspijl
)
/
r23
;
fix
-=
del23x
*
fpair
;
fiy
-=
del23y
*
fpair
;
fiz
-=
del23z
*
fpair
;
fjx
+=
del23x
*
fpair
;
fjy
+=
del23y
*
fpair
;
fjz
+=
del23z
*
fpair
;
fpair
=
Vtors
*
w21
*
w23
*
dw34
*
(
1
-
tspjik
)
*
(
1
-
tspijl
)
/
r34
;
fjx
-=
del34x
*
fpair
;
fjy
-=
del34y
*
fpair
;
fjz
-=
del34z
*
fpair
;
flx
+=
del34x
*
fpair
;
fly
+=
del34y
*
fpair
;
flz
+=
del34z
*
fpair
;
// additional cut off function forces
flt_t
fcpc
=
-
Vtors
*
w21
*
w23
*
w34
*
dtsjik
*
(
1
-
tspijl
);
fpair
=
fcpc
*
dcidij
/
rij
;
fix
+=
fpair
*
del23x
;
fiy
+=
fpair
*
del23y
;
fiz
+=
fpair
*
del23z
;
fjx
-=
fpair
*
del23x
;
fjy
-=
fpair
*
del23y
;
fjz
-=
fpair
*
del23z
;
fpair
=
fcpc
*
dcidik
/
rik
;
fix
+=
fpair
*
del21x
;
fiy
+=
fpair
*
del21y
;
fiz
+=
fpair
*
del21z
;
fkx
-=
fpair
*
del21x
;
fky
-=
fpair
*
del21y
;
fkz
-=
fpair
*
del21z
;
fpair
=
fcpc
*
dcidjk
/
rjk
;
fjx
+=
fpair
*
deljkx
;
fjy
+=
fpair
*
deljky
;
fjz
+=
fpair
*
deljkz
;
fkx
-=
fpair
*
deljkx
;
fky
-=
fpair
*
deljky
;
fkz
-=
fpair
*
deljkz
;
fcpc
=
-
Vtors
*
w21
*
w23
*
w34
*
(
1
-
tspjik
)
*
dtsijl
;
fpair
=
fcpc
*
dcjdji
/
rij
;
fix
+=
fpair
*
del23x
;
fiy
+=
fpair
*
del23y
;
fiz
+=
fpair
*
del23z
;
fjx
-=
fpair
*
del23x
;
fjy
-=
fpair
*
del23y
;
fjz
-=
fpair
*
del23z
;
fpair
=
fcpc
*
dcjdjl
/
rjl
;
fjx
+=
fpair
*
del34x
;
fjy
+=
fpair
*
del34y
;
fjz
+=
fpair
*
del34z
;
flx
-=
fpair
*
del34x
;
fly
-=
fpair
*
del34y
;
flz
-=
fpair
*
del34z
;
fpair
=
fcpc
*
dcjdil
/
ril
;
fix
+=
fpair
*
delilx
;
fiy
+=
fpair
*
delily
;
fiz
+=
fpair
*
delilz
;
flx
-=
fpair
*
delilx
;
fly
-=
fpair
*
delily
;
flz
-=
fpair
*
delilz
;
// sum per-atom forces into atom force array
f
[
i
].
x
+=
fix
;
f
[
i
].
y
+=
fiy
;
f
[
i
].
z
+=
fiz
;
f
[
j
].
x
+=
fjx
;
f
[
j
].
y
+=
fjy
;
f
[
j
].
z
+=
fjz
;
f
[
k
].
x
+=
fkx
;
f
[
k
].
y
+=
fky
;
f
[
k
].
z
+=
fkz
;
f
[
l
].
x
+=
flx
;
f
[
l
].
y
+=
fly
;
f
[
l
].
z
+=
flz
;
}
}
}
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_torsion
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
int
*
tag
=
ka
->
tag
;
for
(
int
ii
=
ka
->
frebo_from_atom
;
ii
<
ka
->
frebo_to_atom
;
ii
++
)
{
int
i
=
ii
;
int
itag
=
tag
[
i
];
int
itype
=
map
[
x
[
i
].
w
];
if
(
itype
!=
0
)
continue
;
flt_t
xtmp
=
x
[
i
].
x
;
flt_t
ytmp
=
x
[
i
].
y
;
flt_t
ztmp
=
x
[
i
].
z
;
int
*
REBO_neighs_i
=
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
i
]];
int
jnum
=
ka
->
neigh_rebo
.
num
[
i
];
for
(
int
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
int
j
=
REBO_neighs_i
[
jj
];
int
jtag
=
tag
[
j
];
if
(
itag
>
jtag
)
{
if
(((
itag
+
jtag
)
&
1
)
==
0
)
continue
;
}
else
if
(
itag
<
jtag
)
{
if
(((
itag
+
jtag
)
&
1
)
==
1
)
continue
;
}
else
{
if
(
x
[
j
].
z
<
ztmp
)
continue
;
if
(
x
[
j
].
z
==
ztmp
&&
x
[
j
].
y
<
ytmp
)
continue
;
if
(
x
[
j
].
z
==
ztmp
&&
x
[
j
].
y
==
ytmp
&&
x
[
j
].
x
<
xtmp
)
continue
;
}
int
jtype
=
map
[
x
[
j
].
w
];
if
(
jtype
!=
0
)
continue
;
ref_torsion_single_interaction
(
ka
,
i
,
j
);
}
}
}
/*
* Calculate single REBO interaction.
* Corresponds to FREBO method. Note that the bondorder() function is
* inlined.
*/
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_frebo_single_interaction
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
int
jj
;
int
itype
=
map
[
x
[
i
].
w
];
flt_t
x_i
=
x
[
i
].
x
;
flt_t
y_i
=
x
[
i
].
y
;
flt_t
z_i
=
x
[
i
].
z
;
int
jtype
=
map
[
x
[
j
].
w
];
flt_t
delx
=
x
[
i
].
x
-
x
[
j
].
x
;
flt_t
dely
=
x
[
i
].
y
-
x
[
j
].
y
;
flt_t
delz
=
x
[
i
].
z
-
x
[
j
].
z
;
flt_t
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
flt_t
rij
=
overloaded
::
sqrt
(
rsq
);
flt_t
rcminij
=
ka
->
params
.
rcmin
[
itype
][
jtype
];
flt_t
rcmaxij
=
ka
->
params
.
rcmax
[
itype
][
jtype
];
flt_t
dwij
;
flt_t
wij
=
Sp
(
rij
,
rcminij
,
rcmaxij
,
&
dwij
);
if
(
wij
<=
TOL
)
return
;
flt_t
Qij
=
ka
->
params
.
Q
[
itype
][
jtype
];
flt_t
Aij
=
ka
->
params
.
A
[
itype
][
jtype
];
flt_t
alphaij
=
ka
->
params
.
alpha
[
itype
][
jtype
];
flt_t
exp_alphar
=
exp
(
-
alphaij
*
rij
);
flt_t
VR_by_wij
=
(
1.0
+
(
Qij
/
rij
))
*
Aij
*
exp_alphar
;
flt_t
VR
=
wij
*
VR_by_wij
;
flt_t
pre
=
wij
*
Aij
*
exp_alphar
;
flt_t
dVRdi
=
pre
*
((
-
alphaij
)
-
(
Qij
/
rsq
)
-
(
Qij
*
alphaij
/
rij
));
dVRdi
+=
VR_by_wij
*
dwij
;
flt_t
VA_by_wij
=
0
,
dVA
=
0
;
for
(
int
k
=
0
;
k
<
3
;
k
++
)
{
flt_t
BIJc
=
ka
->
params
.
BIJc
[
itype
][
jtype
][
k
];
flt_t
Betaij
=
ka
->
params
.
Beta
[
itype
][
jtype
][
k
];
flt_t
term
=
-
BIJc
*
overloaded
::
exp
(
-
Betaij
*
rij
);
VA_by_wij
+=
term
;
dVA
+=
-
Betaij
*
wij
*
term
;
}
dVA
+=
VA_by_wij
*
dwij
;
flt_t
VA
=
VA_by_wij
*
wij
;
acc_t
fij
[
3
]
=
{
0
};
flt_t
Nij
=
ka
->
nH
[
i
]
+
ka
->
nC
[
i
]
-
wij
;
flt_t
Nji
=
ka
->
nH
[
j
]
+
ka
->
nC
[
j
]
-
wij
;
flt_t
NconjtmpI
;
flt_t
pij
=
frebo_pij
(
ka
,
i
,
j
,
delx
,
dely
,
delz
,
rij
,
wij
,
VA
,
&
NconjtmpI
,
fij
);
flt_t
NconjtmpJ
;
acc_t
fji
[
3
]
=
{
0
};
flt_t
pji
=
frebo_pij
(
ka
,
j
,
i
,
-
delx
,
-
dely
,
-
delz
,
rij
,
wij
,
VA
,
&
NconjtmpJ
,
fji
);
fij
[
0
]
-=
fji
[
0
];
fij
[
1
]
-=
fji
[
1
];
fij
[
2
]
-=
fji
[
2
];
flt_t
Nijconj
=
1.0
+
(
NconjtmpI
*
NconjtmpI
)
+
(
NconjtmpJ
*
NconjtmpJ
);
flt_t
dN3
[
3
];
flt_t
pi_rc
=
frebo_pi_rc
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3
);
frebo_N_spline_force
(
ka
,
i
,
j
,
VA
,
dN3
[
0
],
dN3
[
2
],
NconjtmpI
);
frebo_N_spline_force
(
ka
,
j
,
i
,
VA
,
dN3
[
1
],
dN3
[
2
],
NconjtmpJ
);
flt_t
Tij
=
frebo_Tij
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3
);
flt_t
sum_omega
=
0.0
;
if
(
fabs
(
Tij
)
>
TOL
)
{
sum_omega
=
frebo_sum_omega
(
ka
,
i
,
j
,
delx
,
dely
,
delz
,
rij
,
VA
*
Tij
,
fij
);
frebo_N_spline_force
(
ka
,
i
,
j
,
VA
*
sum_omega
,
dN3
[
0
],
dN3
[
2
],
NconjtmpI
);
frebo_N_spline_force
(
ka
,
j
,
i
,
VA
*
sum_omega
,
dN3
[
1
],
dN3
[
2
],
NconjtmpJ
);
}
flt_t
pi_dh
=
Tij
*
sum_omega
;
flt_t
bij
=
static_cast
<
flt_t
>
(
0.5
)
*
(
pij
+
pji
)
+
pi_rc
+
pi_dh
;
flt_t
dVAdi
=
bij
*
dVA
;
flt_t
fpair
=
-
(
dVRdi
+
dVAdi
)
/
rij
;
result_f
[
i
].
x
+=
fpair
*
delx
+
fij
[
0
];
result_f
[
i
].
y
+=
fpair
*
dely
+
fij
[
1
];
result_f
[
i
].
z
+=
fpair
*
delz
+
fij
[
2
];
result_f
[
j
].
x
-=
fpair
*
delx
+
fij
[
0
];
result_f
[
j
].
y
-=
fpair
*
dely
+
fij
[
1
];
result_f
[
j
].
z
-=
fpair
*
delz
+
fij
[
2
];
flt_t
evdwl
=
VR
+
bij
*
VA
;
ka
->
result_eng
+=
evdwl
;
result_f
[
i
].
w
+=
0.5
*
evdwl
;
result_f
[
j
].
w
+=
0.5
*
evdwl
;
}
template
<
typename
flt_t
,
typename
acc_t
>
inline
void
ref_frebo_single_atom
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
tag
=
ka
->
tag
;
int
jj
;
int
itag
=
tag
[
i
];
flt_t
x_i
=
x
[
i
].
x
;
flt_t
y_i
=
x
[
i
].
y
;
flt_t
z_i
=
x
[
i
].
z
;
int
*
neighs
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
i
];
int
jnum
=
ka
->
neigh_rebo
.
num
[
i
];
for
(
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
int
j
=
neighs
[
jj
];
int
jtag
=
tag
[
j
];
if
(
itag
>
jtag
)
{
if
(((
itag
+
jtag
)
&
1
)
==
0
)
continue
;
}
else
if
(
itag
<
jtag
)
{
if
(((
itag
+
jtag
)
&
1
)
==
1
)
continue
;
}
else
{
if
(
x
[
j
].
z
<
z_i
)
continue
;
if
(
x
[
j
].
z
==
z_i
&&
x
[
j
].
y
<
y_i
)
continue
;
if
(
x
[
j
].
z
==
z_i
&&
x
[
j
].
y
==
y_i
&&
x
[
j
].
x
<
x_i
)
continue
;
}
ref_frebo_single_interaction
(
ka
,
i
,
j
);
}
}
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_frebo
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
torflag
)
{
for
(
int
i
=
ka
->
frebo_from_atom
;
i
<
ka
->
frebo_to_atom
;
i
++
)
{
ref_frebo_single_atom
(
ka
,
i
);
}
if
(
torflag
)
ref_torsion
(
ka
);
}
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_lennard_jones_single_interaction
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
j
,
int
morseflag
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
int
itype
=
map
[
x
[
i
].
w
];
int
jtype
=
map
[
x
[
j
].
w
];
flt_t
delx
=
x
[
i
].
x
-
x
[
j
].
x
;
flt_t
dely
=
x
[
i
].
y
-
x
[
j
].
y
;
flt_t
delz
=
x
[
i
].
z
-
x
[
j
].
z
;
flt_t
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
if
(
rsq
>=
ka
->
params
.
cutljsq
[
itype
][
jtype
])
{
return
;
}
flt_t
rij
=
overloaded
::
sqrt
(
rsq
);
LennardJonesPathAIREBOT
<
flt_t
>
testpath
;
flt_t
cij
=
1.0
;
if
(
rij
<
ka
->
params
.
cut3rebo
)
{
#pragma noinline
cij
=
ref_lennard_jones_test_path
<
flt_t
,
acc_t
>
(
ka
,
i
,
j
,
rij
,
ka
->
params
.
rcmax
[
itype
][
jtype
],
&
testpath
);
}
if
(
cij
==
0
)
{
return
;
}
flt_t
sigcut
=
ka
->
params
.
sigcut
;
flt_t
sigmin
=
ka
->
params
.
sigmin
;
flt_t
sigma
=
ka
->
params
.
sigma
[
itype
][
jtype
];
flt_t
rljmax
=
sigcut
*
sigma
;
flt_t
rljmin
=
sigmin
*
sigma
;
flt_t
dslw
,
slw
=
Sp2
(
rij
,
rljmin
,
rljmax
,
&
dslw
);
flt_t
vdw
,
dvdw
;
if
(
morseflag
)
{
const
flt_t
exr
=
exp
(
-
rij
*
ka
->
params
.
lj4
[
itype
][
jtype
]);
vdw
=
ka
->
params
.
lj1
[
itype
][
jtype
]
*
exr
*
(
ka
->
params
.
lj2
[
itype
][
jtype
]
*
exr
-
2
);
dvdw
=
ka
->
params
.
lj3
[
itype
][
jtype
]
*
exr
*
(
1
-
ka
->
params
.
lj2
[
itype
][
jtype
]
*
exr
);
}
else
{
flt_t
r2inv
=
1
/
rsq
;
flt_t
r6inv
=
r2inv
*
r2inv
*
r2inv
;
vdw
=
r6inv
*
(
ka
->
params
.
lj3
[
itype
][
jtype
]
*
r6inv
-
ka
->
params
.
lj4
[
itype
][
jtype
]);
dvdw
=
-
r6inv
*
(
ka
->
params
.
lj1
[
itype
][
jtype
]
*
r6inv
-
ka
->
params
.
lj2
[
itype
][
jtype
])
/
rij
;
}
flt_t
VLJ
=
vdw
*
slw
;
flt_t
dVLJ
=
dvdw
*
slw
+
vdw
*
dslw
;
flt_t
dStr
,
Str
=
Sp2
<
flt_t
>
(
rij
,
ka
->
params
.
rcLJmin
[
itype
][
jtype
],
ka
->
params
.
rcLJmax
[
itype
][
jtype
],
&
dStr
);
flt_t
VA
=
Str
*
cij
*
VLJ
;
flt_t
Stb
=
0
;
acc_t
fij
[
3
]
=
{
0
};
if
(
Str
>
0
)
{
#pragma noinline
Stb
=
ref_lennard_jones_bondorder
(
ka
,
i
,
j
,
VA
,
fij
);
}
flt_t
fpair
=
-
(
dStr
*
(
Stb
*
cij
*
VLJ
-
cij
*
VLJ
)
+
dVLJ
*
(
Str
*
Stb
*
cij
+
cij
-
Str
*
cij
))
/
rij
;
flt_t
evdwl
=
VA
*
Stb
+
(
1
-
Str
)
*
cij
*
VLJ
;
result_f
[
i
].
x
+=
fpair
*
delx
+
fij
[
0
];
result_f
[
i
].
y
+=
fpair
*
dely
+
fij
[
1
];
result_f
[
i
].
z
+=
fpair
*
delz
+
fij
[
2
];
result_f
[
j
].
x
-=
fpair
*
delx
+
fij
[
0
];
result_f
[
j
].
y
-=
fpair
*
dely
+
fij
[
1
];
result_f
[
j
].
z
-=
fpair
*
delz
+
fij
[
2
];
ka
->
result_eng
+=
evdwl
;
if
(
cij
<
1
)
{
#pragma noinline
ref_lennard_jones_force_path
(
ka
,
Str
*
Stb
*
VLJ
+
(
1
-
Str
)
*
VLJ
,
&
testpath
);
}
}
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_lennard_jones_single_atom
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
int
morseflag
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
tag
=
ka
->
tag
;
int
jj
;
int
itag
=
tag
[
i
];
int
*
neighs
=
ka
->
neigh_lmp
.
entries
+
ka
->
neigh_lmp
.
offset
[
i
];
int
jnum
=
ka
->
neigh_lmp
.
num_half
[
i
];
for
(
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
int
j
=
neighs
[
jj
];
ref_lennard_jones_single_interaction
(
ka
,
i
,
j
,
morseflag
);
}
}
template
<
typename
flt_t
,
typename
acc_t
>
void
ref_lennard_jones
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
morseflag
)
{
for
(
int
i
=
ka
->
frebo_from_atom
;
i
<
ka
->
frebo_to_atom
;
i
++
)
{
#pragma noinline
ref_lennard_jones_single_atom
(
ka
,
i
,
morseflag
);
}
}
/* ----------------------------------------------------------------------
Vectorized AIREBO implementation, standalone, using caching to reduce
memory access.
---------------------------------------------------------------------- */
template
<
typename
flt_t
,
typename
acc_t
>
struct
aut_wrap
{
typedef
typename
intr_types
<
flt_t
,
acc_t
>::
fvec
fvec
;
typedef
typename
intr_types
<
flt_t
,
acc_t
>::
avec
avec
;
typedef
typename
intr_types
<
flt_t
,
acc_t
>::
ivec
ivec
;
typedef
typename
intr_types
<
flt_t
,
acc_t
>::
bvec
bvec
;
VEC_INLINE
inline
static
void
aut_loadatoms_vec
(
AtomAIREBOT
<
flt_t
>
*
atoms
,
ivec
j_vec
,
fvec
*
x
,
fvec
*
y
,
fvec
*
z
,
bvec
*
type_mask
,
int
*
map
,
ivec
map_i
,
ivec
c_1
)
{
const
ivec
c_4
=
ivec
::
set1
(
4
);
ivec
j_vec_4
=
ivec
::
mullo
(
c_4
,
j_vec
);
fvec
w
;
fvec
::
gather_4_adjacent
(
j_vec_4
,
&
atoms
[
0
].
x
,
sizeof
(
flt_t
),
x
,
y
,
z
,
&
w
);
ivec
jtype
=
fvec
::
unpackloepi32
(
w
);
jtype
=
ivec
::
srlv
(
map_i
,
jtype
);
//_mm512_castpd_si512(w));
jtype
=
ivec
::
the_and
(
c_1
,
jtype
);
bvec
jtype_mask
=
ivec
::
cmpneq
(
jtype
,
ivec
::
setzero
());
*
type_mask
=
jtype_mask
;
}
VEC_INLINE
inline
static
void
aut_loadatoms_vec_notype
(
AtomAIREBOT
<
flt_t
>
*
atoms
,
ivec
j_vec
,
fvec
*
x
,
fvec
*
y
,
fvec
*
z
)
{
const
ivec
c_4
=
ivec
::
set1
(
4
);
ivec
j_vec_4
=
ivec
::
mullo
(
c_4
,
j_vec
);
fvec
::
gather_3_adjacent
(
j_vec_4
,
&
atoms
[
0
].
x
,
sizeof
(
flt_t
),
x
,
y
,
z
);
}
static
fvec
aut_Sp2_deriv
(
fvec
r
,
fvec
lo
,
fvec
hi
,
fvec
*
d
)
{
fvec
c_1
=
fvec
::
set1
(
1
);
fvec
c_2
=
fvec
::
set1
(
2
);
fvec
c_3
=
fvec
::
set1
(
3
);
fvec
c_6
=
fvec
::
set1
(
6
);
bvec
m_lo
=
fvec
::
cmple
(
r
,
lo
);
bvec
m_hi
=
fvec
::
cmpnlt
(
r
,
hi
);
// nlt == ge
bvec
m_tr
=
bvec
::
kandn
(
m_lo
,
~
m_hi
);
fvec
ret
=
c_1
;
ret
=
fvec
::
mask_blend
(
m_hi
,
ret
,
fvec
::
setzero
());
fvec
der
=
fvec
::
setzero
();
if
(
bvec
::
test_any_set
(
m_tr
))
{
fvec
diff
=
hi
-
lo
;
fvec
rcp
=
fvec
::
recip
(
diff
);
fvec
t
=
(
r
-
lo
)
*
rcp
;
fvec
v
=
c_1
-
t
*
t
*
(
c_3
-
c_2
*
t
);
ret
=
fvec
::
mask_blend
(
m_tr
,
ret
,
v
);
fvec
dv
=
c_6
*
rcp
*
(
t
*
t
-
t
);
der
=
fvec
::
mask_blend
(
m_tr
,
der
,
dv
);
}
*
d
=
der
;
return
ret
;
}
static
fvec
aut_Sp_deriv
(
fvec
r
,
fvec
lo
,
fvec
hi
,
fvec
*
d
)
{
fvec
c_1
=
fvec
::
set1
(
1
);
fvec
c_0_5
=
fvec
::
set1
(
0.5
);
fvec
c_m0_5
=
fvec
::
set1
(
-
0.5
);
fvec
c_PI
=
fvec
::
set1
(
M_PI
);
bvec
m_lo
=
fvec
::
cmple
(
r
,
lo
);
bvec
m_hi
=
fvec
::
cmpnlt
(
r
,
hi
);
// nlt == ge
bvec
m_tr
=
bvec
::
kandn
(
m_lo
,
~
m_hi
);
fvec
ret
=
c_1
;
ret
=
fvec
::
mask_blend
(
m_hi
,
ret
,
fvec
::
setzero
());
fvec
der
=
fvec
::
setzero
();
if
(
bvec
::
test_any_set
(
m_tr
))
{
fvec
diff
=
hi
-
lo
;
fvec
rcp
=
fvec
::
mask_recip
(
c_1
,
m_tr
,
diff
);
fvec
t
=
(
r
-
lo
)
/
diff
;
fvec
sinval
,
cosval
;
sinval
=
fvec
::
mask_sincos
(
&
cosval
,
fvec
::
setzero
(),
c_1
,
m_tr
,
c_PI
*
t
);
fvec
v
=
c_0_5
*
(
c_1
+
cosval
);
ret
=
fvec
::
mask_blend
(
m_tr
,
ret
,
v
);
fvec
dv
=
c_PI
*
c_m0_5
*
rcp
*
sinval
;
der
=
fvec
::
mask_blend
(
m_tr
,
der
,
dv
);
}
*
d
=
der
;
return
ret
;
}
static
fvec
aut_mask_Sp
(
bvec
mask
,
fvec
r
,
fvec
lo
,
fvec
hi
)
{
fvec
c_1
=
fvec
::
set1
(
1
);
fvec
c_0_5
=
fvec
::
set1
(
0.5
);
fvec
c_PI
=
fvec
::
set1
(
M_PI
);
bvec
m_lo
=
fvec
::
mask_cmple
(
mask
,
r
,
lo
);
bvec
m_hi
=
fvec
::
mask_cmpnlt
(
mask
,
r
,
hi
);
// nlt == ge
bvec
m_tr
=
bvec
::
kandn
(
m_lo
,
bvec
::
kandn
(
m_hi
,
mask
));
fvec
ret
=
c_1
;
ret
=
fvec
::
mask_blend
(
m_hi
,
ret
,
fvec
::
setzero
());
if
(
bvec
::
test_any_set
(
m_tr
))
{
fvec
rcp
=
fvec
::
mask_recip
(
c_1
,
m_tr
,
hi
-
lo
);
fvec
t
=
(
r
-
lo
)
*
rcp
;
fvec
v
=
c_0_5
*
(
c_1
+
fvec
::
mask_cos
(
c_1
,
m_tr
,
c_PI
*
t
));
ret
=
fvec
::
mask_blend
(
m_tr
,
ret
,
v
);
}
return
ret
;
}
static
void
aut_rebo_neigh
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
)
{
int
offset
=
ka
->
neigh_from_atom
*
ka
->
num_neighs_per_atom
;
ivec
c_CARBON
=
ivec
::
setzero
();
int
map_i
=
0
;
int
i
;
for
(
i
=
1
;
i
<
ka
->
num_types
;
i
++
)
{
if
(
ka
->
map
[
i
])
map_i
|=
(
1
<<
i
);
}
ivec
c_i1
=
ivec
::
set1
(
1
);
ivec
c_im
=
ivec
::
set1
(
map_i
);
AtomAIREBOT
<
flt_t
>
*
_noalias
x
=
ka
->
x
;
for
(
i
=
ka
->
neigh_from_atom
;
i
<
ka
->
neigh_to_atom
;
i
++
)
{
fvec
x_i
=
fvec
::
set1
(
x
[
i
].
x
);
fvec
y_i
=
fvec
::
set1
(
x
[
i
].
y
);
fvec
z_i
=
fvec
::
set1
(
x
[
i
].
z
);
int
itype
=
ka
->
map
[
ka
->
x
[
i
].
w
];
fvec
rcmaxsq0
=
fvec
::
set1
(
ka
->
params
.
rcmaxsq
[
itype
][
0
]);
fvec
rcmaxsq1
=
fvec
::
set1
(
ka
->
params
.
rcmaxsq
[
itype
][
1
]);
fvec
rcmax0
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
0
]);
fvec
rcmax1
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
1
]);
fvec
rcmin0
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
0
]);
fvec
rcmin1
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
1
]);
fvec
rcmaxskinsq0
=
fvec
::
set1
(
(
ka
->
params
.
rcmax
[
itype
][
0
]
+
ka
->
skin
)
*
(
ka
->
params
.
rcmax
[
itype
][
0
]
+
ka
->
skin
));
fvec
rcmaxskinsq1
=
fvec
::
set1
(
(
ka
->
params
.
rcmax
[
itype
][
1
]
+
ka
->
skin
)
*
(
ka
->
params
.
rcmax
[
itype
][
1
]
+
ka
->
skin
));
fvec
nC
=
fvec
::
setzero
();
fvec
nH
=
fvec
::
setzero
();
ka
->
neigh_rebo
.
offset
[
i
]
=
offset
;
int
jnum
=
ka
->
rebuild_flag
?
ka
->
neigh_lmp
.
num
[
i
]
:
ka
->
neigh_rebo
.
num_half
[
i
];
int
*
neighs
=
ka
->
rebuild_flag
?
&
ka
->
neigh_lmp
.
entries
[
ka
->
neigh_lmp
.
offset
[
i
]]
:
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
i
]
+
jnum
];
int
*
skin_target
=
&
ka
->
neigh_rebo
.
entries
[
offset
+
ka
->
num_neighs_per_atom
];
int
n
=
0
;
int
n_skin
=
0
;
int
lowest_idx
;
#pragma unroll(4)
for
(
lowest_idx
=
0
;
lowest_idx
<
jnum
;
lowest_idx
+=
fvec
::
VL
)
{
bvec
j_mask
=
bvec
::
full
();
if
(
lowest_idx
+
fvec
::
VL
>
jnum
)
j_mask
=
bvec
::
only
(
jnum
-
lowest_idx
);
int
*
_noalias
neighs_l
=
neighs
+
lowest_idx
;
fvec
x_j
,
y_j
,
z_j
;
bvec
jtype_mask
;
ivec
ji
=
ivec
::
maskz_loadu
(
j_mask
,
neighs_l
);
aut_loadatoms_vec
(
x
,
ji
,
&
x_j
,
&
y_j
,
&
z_j
,
&
jtype_mask
,
ka
->
map
,
c_im
,
c_i1
);
fvec
delx
=
x_i
-
x_j
;
fvec
dely
=
y_i
-
y_j
;
fvec
delz
=
z_i
-
z_j
;
fvec
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
if
(
ka
->
rebuild_flag
)
{
fvec
rcmaxskinsq
=
fvec
::
mask_blend
(
jtype_mask
,
rcmaxskinsq0
,
rcmaxskinsq1
);
bvec
c_mask
=
fvec
::
mask_cmplt
(
j_mask
,
rsq
,
rcmaxskinsq
);
ivec
::
mask_compressstore
(
c_mask
,
&
skin_target
[
n_skin
],
ji
);
n_skin
+=
bvec
::
popcnt
(
c_mask
);
}
fvec
rcmaxsq
=
fvec
::
mask_blend
(
jtype_mask
,
rcmaxsq0
,
rcmaxsq1
);
bvec
c_mask
=
fvec
::
mask_cmplt
(
j_mask
,
rsq
,
rcmaxsq
);
if
(
bvec
::
test_all_unset
(
c_mask
))
continue
;
ivec
::
mask_compressstore
(
c_mask
,
&
ka
->
neigh_rebo
.
entries
[
offset
+
n
],
ji
);
n
+=
bvec
::
popcnt
(
c_mask
);
fvec
rcmax
=
fvec
::
mask_blend
(
jtype_mask
,
rcmax0
,
rcmax1
);
fvec
rcmin
=
fvec
::
mask_blend
(
jtype_mask
,
rcmin0
,
rcmin1
);
fvec
sp
=
aut_mask_Sp
(
c_mask
,
fvec
::
sqrt
(
rsq
),
rcmin
,
rcmax
);
nC
=
fvec
::
mask_add
(
nC
,
bvec
::
kandn
(
jtype_mask
,
c_mask
),
nC
,
sp
);
nH
=
fvec
::
mask_add
(
nH
,
bvec
::
kand
(
jtype_mask
,
c_mask
),
nH
,
sp
);
}
ka
->
neigh_rebo
.
num
[
i
]
=
n
;
if
(
ka
->
rebuild_flag
)
{
for
(
int
i
=
0
;
i
<
n_skin
;
i
++
)
{
ka
->
neigh_rebo
.
entries
[
offset
+
n_skin
+
i
]
=
skin_target
[
i
];
}
}
if
(
ka
->
rebuild_flag
)
{
assert
(
n
<=
n_skin
);
offset
+=
2
*
n_skin
;
ka
->
neigh_rebo
.
num_half
[
i
]
=
n_skin
;
}
else
{
assert
(
n
<=
jnum
);
offset
+=
2
*
jnum
;
}
ka
->
nC
[
i
]
=
fvec
::
reduce_add
(
nC
);
ka
->
nH
[
i
]
=
fvec
::
reduce_add
(
nH
);
}
}
static
fvec
aut_eval_poly_lin_pd_2
(
int
n
,
flt_t
*
vals
,
ivec
idx
,
fvec
x
,
fvec
*
deriv
)
{
fvec
c_1
=
fvec
::
set1
(
1
);
fvec
x_i
=
c_1
;
fvec
x_im1
=
fvec
::
setzero
();
fvec
result
=
fvec
::
setzero
();
fvec
i_v
=
fvec
::
setzero
();
*
deriv
=
fvec
::
setzero
();
int
i
;
for
(
i
=
0
;
i
<
n
;
i
++
)
{
fvec
coeff
=
fvec
::
gather
(
idx
,
vals
+
i
,
sizeof
(
flt_t
));
result
=
result
+
coeff
*
x_i
;
*
deriv
=
*
deriv
+
coeff
*
x_im1
*
i_v
;
x_im1
=
x_i
;
x_i
=
x_i
*
x
;
i_v
=
i_v
+
c_1
;
}
return
result
;
}
static
fvec
aut_mask_gSpline_pd_2
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
bvec
active_mask
,
int
itype
,
fvec
cosjik
,
fvec
Nij
,
fvec
*
dgdc
,
fvec
*
dgdN
)
{
int
i
;
flt_t
*
gDom
=
NULL
;
int
nDom
=
0
;
ivec
offs
=
ivec
::
setzero
();
fvec
NCmin
=
fvec
::
set1
(
ka
->
params
.
NCmin
);
bvec
Ngt
=
fvec
::
cmpnle
(
Nij
,
NCmin
);
//gt
if
(
itype
==
0
)
{
nDom
=
4
;
gDom
=
&
ka
->
params
.
gCdom
[
0
];
offs
=
ivec
::
mask_blend
(
Ngt
,
offs
,
ivec
::
set1
(
4
*
6
));
}
else
{
nDom
=
3
;
gDom
=
&
ka
->
params
.
gHdom
[
0
];
offs
=
ivec
::
set1
(
8
*
6
);
}
cosjik
=
fvec
::
max
(
fvec
::
set1
(
gDom
[
0
]),
fvec
::
min
(
fvec
::
set1
(
gDom
[
nDom
]),
cosjik
));
ivec
index6
=
ivec
::
setzero
();
for
(
i
=
0
;
i
<
nDom
;
i
++
)
{
bvec
cosge
=
fvec
::
cmpnlt
(
cosjik
,
fvec
::
set1
(
gDom
[
i
]));
//ge
bvec
cosle
=
fvec
::
cmple
(
cosjik
,
fvec
::
set1
(
gDom
[
i
+
1
]));
index6
=
ivec
::
mask_blend
(
cosge
&
cosle
,
index6
,
ivec
::
set1
(
6
*
i
));
}
fvec
g
=
aut_eval_poly_lin_pd_2
(
6
,
&
ka
->
params
.
gVal
[
0
],
offs
+
index6
,
cosjik
,
dgdc
);
*
dgdN
=
fvec
::
setzero
();
if
(
itype
==
0
)
{
fvec
NCmax
=
fvec
::
set1
(
ka
->
params
.
NCmax
);
bvec
Nlt
=
fvec
::
cmplt
(
Nij
,
NCmax
);
//gt
bvec
Nmask
=
Ngt
&
Nlt
;
if
(
bvec
::
test_any_set
(
Nmask
))
{
fvec
dg1
;
fvec
g1
=
aut_eval_poly_lin_pd_2
(
6
,
&
ka
->
params
.
gVal
[
0
],
index6
,
cosjik
,
&
dg1
);
fvec
dS
;
fvec
cut
=
aut_Sp_deriv
(
Nij
,
NCmin
,
NCmax
,
&
dS
);
*
dgdN
=
fvec
::
mask_mul
(
*
dgdN
,
Nmask
,
dS
,
g1
-
g
);
g
=
fvec
::
mask_add
(
g
,
Nmask
,
g
,
cut
*
(
g1
-
g
));
*
dgdc
=
fvec
::
mask_add
(
*
dgdc
,
Nmask
,
*
dgdc
,
cut
*
(
dg1
-
*
dgdc
));
}
}
return
g
;
}
static
fvec
aut_PijSpline
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
fvec
NijC
,
fvec
NijH
,
fvec
*
dN2
)
{
flt_t
ret
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
dN20
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
dN21
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
NijC_
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
NijH_
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
tmp_dN2
[
2
];
fvec
::
store
(
NijC_
,
NijC
);
fvec
::
store
(
NijH_
,
NijH
);
int
i
;
for
(
i
=
0
;
i
<
fvec
::
VL
;
i
++
)
{
ret
[
i
]
=
PijSpline
(
ka
,
itype
,
jtype
,
NijC_
[
i
],
NijH_
[
i
],
tmp_dN2
);
dN20
[
i
]
=
tmp_dN2
[
0
];
dN21
[
i
]
=
tmp_dN2
[
1
];
}
dN2
[
0
]
=
fvec
::
load
(
dN20
);
dN2
[
1
]
=
fvec
::
load
(
dN21
);
return
fvec
::
load
(
ret
);
}
/*
* aut_frebo_data stores all the short-ranged coordinations
* and intermediate values that get reused frequently during
* bondorder calculations.
* BUF_CAP should rarely exceed 4, so 8 is a very conservative
* value.
*/
static
const
int
BUF_CAP
=
8
;
struct
aut_frebo_data
{
fvec
rikx_buf
[
BUF_CAP
];
fvec
riky_buf
[
BUF_CAP
];
fvec
rikz_buf
[
BUF_CAP
];
fvec
rikmag_buf
[
BUF_CAP
];
fvec
cosjik_buf
[
BUF_CAP
];
ivec
k_buf
[
BUF_CAP
];
fvec
g_buf
[
BUF_CAP
];
fvec
dgdc_buf
[
BUF_CAP
];
fvec
ex_lam_buf
[
BUF_CAP
];
fvec
wik_buf
[
BUF_CAP
];
fvec
dwik_buf
[
BUF_CAP
];
fvec
cutN_buf
[
BUF_CAP
];
fvec
dcutN_buf
[
BUF_CAP
];
bvec
ktype_buf
[
BUF_CAP
];
bvec
mask_buf
[
BUF_CAP
];
fvec
force_k_x_buf
[
BUF_CAP
];
fvec
force_k_y_buf
[
BUF_CAP
];
fvec
force_k_z_buf
[
BUF_CAP
];
int
buf_len
;
fvec
x_i
;
fvec
y_i
;
fvec
z_i
;
fvec
x_j
;
fvec
y_j
;
fvec
z_j
;
fvec
nCi
;
fvec
nHi
;
fvec
force_i_x
;
fvec
force_i_y
;
fvec
force_i_z
;
fvec
force_j_x
;
fvec
force_j_y
;
fvec
force_j_z
;
};
/*
* Initialize values in aut_frebo_data and perform the calculations
* for p_ij.
*/
static
fvec
aut_frebo_pij_pd_2
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
_noalias
ka
,
struct
aut_frebo_data
*
_noalias
data
,
int
itype
,
int
jtype
,
ivec
vi
,
ivec
vj
,
fvec
rijx
,
fvec
rijy
,
fvec
rijz
,
fvec
rijmag
,
fvec
wij
,
fvec
VA
,
fvec
*
sum_N
,
fvec
fij
[
3
]
)
{
AtomAIREBOT
<
flt_t
>
*
_noalias
x
=
ka
->
x
;
int
*
_noalias
map
=
ka
->
map
;
flt_t
*
_noalias
nC
=
ka
->
nC
;
flt_t
*
_noalias
nH
=
ka
->
nH
;
fvec
x_i
,
y_i
,
z_i
;
fvec
x_j
,
y_j
,
z_j
;
x_i
=
data
->
x_i
;
y_i
=
data
->
y_i
;
z_i
=
data
->
z_i
;
x_j
=
data
->
x_j
;
y_j
=
data
->
y_j
;
z_j
=
data
->
z_j
;
fvec
invrijm
=
fvec
::
recip
(
rijmag
);
fvec
invrijm2
=
invrijm
*
invrijm
;
fvec
rcminij
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
jtype
]);
fvec
rcmaxij
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
jtype
]);
fvec
Nmin
=
fvec
::
set1
(
ka
->
params
.
Nmin
);
fvec
Nmax
=
fvec
::
set1
(
ka
->
params
.
Nmax
);
int
map_i_scalar
=
0
;
{
int
i
;
for
(
i
=
1
;
i
<
ka
->
num_types
;
i
++
)
{
if
(
ka
->
map
[
i
])
map_i_scalar
|=
(
1
<<
i
);
}
}
ivec
map_i
=
ivec
::
set1
(
map_i_scalar
);
fvec
nCi
=
data
->
nCi
;
fvec
nHi
=
data
->
nHi
;
fvec
Nij
=
nHi
+
nCi
-
wij
;
fvec
factor_jtype
,
factor_not_jtype
;
if
(
jtype
)
{
factor_jtype
=
fvec
::
set1
(
1
);
factor_not_jtype
=
fvec
::
set1
(
0
);
}
else
{
factor_jtype
=
fvec
::
set1
(
0
);
factor_not_jtype
=
fvec
::
set1
(
1
);
}
fvec
NijC
=
nCi
-
wij
*
factor_not_jtype
;
fvec
NijH
=
nHi
-
wij
*
factor_jtype
;
fvec
sum_pij
=
fvec
::
setzero
();
fvec
sum_dpij_dN
=
fvec
::
setzero
();
fvec
dN2
[
2
];
ivec
offseti
=
ivec
::
mask_gather
(
ivec
::
setzero
(),
bvec
::
full
(),
vi
,
ka
->
neigh_rebo
.
offset
,
sizeof
(
int
));
int
buf_len
=
0
;
ivec
knum
=
ivec
::
mask_gather
(
ivec
::
setzero
(),
bvec
::
full
(),
vi
,
ka
->
neigh_rebo
.
num
,
sizeof
(
int
));
ivec
kk
=
ivec
::
setzero
();
bvec
active_mask
=
ivec
::
cmplt
(
kk
,
knum
);
ivec
c_i1
=
ivec
::
set1
(
1
);
fvec
rho_j
=
fvec
::
set1
(
ka
->
params
.
rho
[
jtype
][
1
]);
fvec
rho_k0
=
fvec
::
set1
(
ka
->
params
.
rho
[
0
][
1
]);
fvec
rho_k1
=
fvec
::
set1
(
ka
->
params
.
rho
[
1
][
1
]);
fvec
c_4
=
fvec
::
set1
(
4
);
fvec
c_2_0
=
fvec
::
set1
(
2.0
);
fvec
c_m2_0
=
fvec
::
set1
(
-
2.0
);
fvec
c_4_0
=
fvec
::
set1
(
4.0
);
fvec
c_0_5
=
fvec
::
set1
(
0.5
);
fvec
c_m0_5
=
fvec
::
set1
(
-
0.5
);
fvec
c_1
=
fvec
::
set1
(
1
);
fvec
c_m1
=
fvec
::
set1
(
-
1
);
fvec
factor_itype
=
itype
?
c_1
:
fvec
::
setzero
();
fvec
rcmax0
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
0
]);
fvec
rcmax1
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
1
]);
fvec
rcmin0
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
0
]);
fvec
rcmin1
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
1
]);
fvec
result_f_i_x
=
fvec
::
setzero
();
fvec
result_f_i_y
=
fvec
::
setzero
();
fvec
result_f_i_z
=
fvec
::
setzero
();
fvec
result_f_j_x
=
fvec
::
setzero
();
fvec
result_f_j_y
=
fvec
::
setzero
();
fvec
result_f_j_z
=
fvec
::
setzero
();
*
sum_N
=
fvec
::
setzero
();
{
while
(
bvec
::
test_any_set
(
active_mask
))
{
ivec
k
=
ivec
::
mask_gather
(
ivec
::
setzero
(),
active_mask
,
kk
+
offseti
,
ka
->
neigh_rebo
.
entries
,
sizeof
(
int
));
bvec
excluded_mask
=
ivec
::
cmpeq
(
k
,
vj
)
&
active_mask
;
if
(
bvec
::
test_any_set
(
excluded_mask
))
{
kk
=
ivec
::
mask_add
(
kk
,
excluded_mask
,
kk
,
c_i1
);
active_mask
=
ivec
::
cmplt
(
kk
,
knum
);
continue
;
}
fvec
x_k
,
y_k
,
z_k
;
bvec
ktype_mask
;
aut_loadatoms_vec
(
x
,
k
,
&
x_k
,
&
y_k
,
&
z_k
,
&
ktype_mask
,
ka
->
map
,
map_i
,
c_i1
);
fvec
rikx
=
x_i
-
x_k
;
fvec
riky
=
y_i
-
y_k
;
fvec
rikz
=
z_i
-
z_k
;
fvec
rikmag
=
fvec
::
sqrt
(
rikx
*
rikx
+
riky
*
riky
+
rikz
*
rikz
);
fvec
rho_k
=
fvec
::
mask_blend
(
ktype_mask
,
rho_k0
,
rho_k1
);
fvec
lamdajik
=
c_4
*
factor_itype
*
(
rho_k
-
rikmag
-
(
rho_j
-
rijmag
));
fvec
ex_lam
=
fvec
::
exp
(
lamdajik
);
fvec
rcmax
=
fvec
::
mask_blend
(
ktype_mask
,
rcmax0
,
rcmax1
);
fvec
rcmin
=
fvec
::
mask_blend
(
ktype_mask
,
rcmin0
,
rcmin1
);
fvec
dwik
;
fvec
wik
=
aut_Sp_deriv
(
rikmag
,
rcmin
,
rcmax
,
&
dwik
);
fvec
Nki
=
fvec
::
gather
(
k
,
nC
,
sizeof
(
flt_t
))
+
fvec
::
gather
(
k
,
nH
,
sizeof
(
flt_t
))
-
wik
;
fvec
cosjik
=
(
rijx
*
rikx
+
rijy
*
riky
+
rijz
*
rikz
)
/
(
rijmag
*
rikmag
);
cosjik
=
fvec
::
min
(
c_1
,
fvec
::
max
(
c_m1
,
cosjik
));
fvec
dgdc
,
dgdN
;
fvec
g
=
aut_mask_gSpline_pd_2
(
ka
,
active_mask
,
itype
,
cosjik
,
Nij
,
&
dgdc
,
&
dgdN
);
sum_pij
=
fvec
::
mask_add
(
sum_pij
,
active_mask
,
sum_pij
,
wik
*
g
*
ex_lam
);
sum_dpij_dN
=
fvec
::
mask_add
(
sum_dpij_dN
,
active_mask
,
sum_dpij_dN
,
wik
*
ex_lam
*
dgdN
);
fvec
dcutN
;
fvec
cutN
=
aut_Sp_deriv
(
Nki
,
Nmin
,
Nmax
,
&
dcutN
);
*
sum_N
=
fvec
::
mask_add
(
*
sum_N
,
active_mask
,
*
sum_N
,
fvec
::
mask_blend
(
ktype_mask
,
c_1
,
fvec
::
setzero
())
*
wik
*
cutN
);
if
(
buf_len
==
BUF_CAP
)
goto
exceed_buffer
;
data
->
rikx_buf
[
buf_len
]
=
rikx
;
data
->
riky_buf
[
buf_len
]
=
riky
;
data
->
rikz_buf
[
buf_len
]
=
rikz
;
data
->
rikmag_buf
[
buf_len
]
=
rikmag
;
data
->
cosjik_buf
[
buf_len
]
=
cosjik
;
data
->
ktype_buf
[
buf_len
]
=
ktype_mask
;
data
->
k_buf
[
buf_len
]
=
k
;
data
->
g_buf
[
buf_len
]
=
g
;
data
->
dgdc_buf
[
buf_len
]
=
dgdc
;
data
->
ex_lam_buf
[
buf_len
]
=
ex_lam
;
data
->
wik_buf
[
buf_len
]
=
wik
;
data
->
dwik_buf
[
buf_len
]
=
dwik
;
data
->
mask_buf
[
buf_len
]
=
active_mask
;
data
->
cutN_buf
[
buf_len
]
=
cutN
;
data
->
dcutN_buf
[
buf_len
]
=
dcutN
;
buf_len
+=
1
;
kk
=
ivec
::
mask_add
(
kk
,
active_mask
,
kk
,
c_i1
);
active_mask
=
ivec
::
cmplt
(
kk
,
knum
);
}
data
->
buf_len
=
buf_len
;
fvec
PijS
=
aut_PijSpline
(
ka
,
itype
,
jtype
,
NijC
,
NijH
,
&
dN2
[
0
]);
fvec
pij
=
fvec
::
invsqrt
(
c_1
+
sum_pij
+
PijS
);
fvec
tmp
=
c_m0_5
*
pij
*
pij
*
pij
;
int
buf_idx
;
for
(
buf_idx
=
0
;
buf_idx
<
buf_len
;
buf_idx
++
)
{
fvec
rikx
=
data
->
rikx_buf
[
buf_idx
];
fvec
riky
=
data
->
riky_buf
[
buf_idx
];
fvec
rikz
=
data
->
rikz_buf
[
buf_idx
];
fvec
rikmag
=
data
->
rikmag_buf
[
buf_idx
];
fvec
cosjik
=
data
->
cosjik_buf
[
buf_idx
];
bvec
ktype_mask
=
data
->
ktype_buf
[
buf_idx
];
ivec
k
=
data
->
k_buf
[
buf_idx
];
fvec
g
=
data
->
g_buf
[
buf_idx
];
fvec
dgdc
=
data
->
dgdc_buf
[
buf_idx
];
fvec
ex_lam
=
data
->
ex_lam_buf
[
buf_idx
];
fvec
wik
=
data
->
wik_buf
[
buf_idx
];
fvec
dwik
=
data
->
dwik_buf
[
buf_idx
];
bvec
mask
=
data
->
mask_buf
[
buf_idx
];
fvec
invrikm
=
fvec
::
recip
(
rikmag
);
fvec
rjkx
=
rikx
-
rijx
;
fvec
rjky
=
riky
-
rijy
;
fvec
rjkz
=
rikz
-
rijz
;
fvec
rjkmag
=
fvec
::
sqrt
(
rjkx
*
rjkx
+
rjky
*
rjky
+
rjkz
*
rjkz
);
fvec
rijrik
=
c_2_0
*
rijmag
*
rikmag
;
fvec
rr
=
rijmag
*
rijmag
-
rikmag
*
rikmag
;
fvec
dctdjk
=
c_m2_0
/
rijrik
;
fvec
dctdik
=
(
rjkmag
*
rjkmag
-
rr
)
/
(
rijrik
*
rikmag
*
rikmag
);
fvec
dctdij
=
(
rjkmag
*
rjkmag
+
rr
)
/
(
rijrik
*
rijmag
*
rijmag
);
fvec
fi
[
3
],
fj
[
3
],
fk
[
3
];
fvec
pref
=
c_0_5
*
VA
*
tmp
;
fvec
tmp20
=
pref
*
wik
*
dgdc
*
ex_lam
;
fj
[
0
]
=
fj
[
1
]
=
fj
[
2
]
=
fvec
::
setzero
();
fvec
tmpdik
=
tmp20
*
dctdik
;
fi
[
0
]
=
fvec
::
setzero
()
-
tmpdik
*
rikx
;
fi
[
1
]
=
fvec
::
setzero
()
-
tmpdik
*
riky
;
fi
[
2
]
=
fvec
::
setzero
()
-
tmpdik
*
rikz
;
fk
[
0
]
=
tmpdik
*
rikx
;
fk
[
1
]
=
tmpdik
*
riky
;
fk
[
2
]
=
tmpdik
*
rikz
;
fvec
tmpdij
=
tmp20
*
dctdij
;
fij
[
0
]
=
fvec
::
mask_sub
(
fij
[
0
],
mask
,
fij
[
0
],
tmpdij
*
rijx
);
fij
[
1
]
=
fvec
::
mask_sub
(
fij
[
1
],
mask
,
fij
[
1
],
tmpdij
*
rijy
);
fij
[
2
]
=
fvec
::
mask_sub
(
fij
[
2
],
mask
,
fij
[
2
],
tmpdij
*
rijz
);
fvec
tmpdjk
=
tmp20
*
dctdjk
;
fi
[
0
]
=
fi
[
0
]
-
tmpdjk
*
rjkx
;
fi
[
1
]
=
fi
[
1
]
-
tmpdjk
*
rjky
;
fi
[
2
]
=
fi
[
2
]
-
tmpdjk
*
rjkz
;
fk
[
0
]
=
fk
[
0
]
+
tmpdjk
*
rjkx
;
fk
[
1
]
=
fk
[
1
]
+
tmpdjk
*
rjky
;
fk
[
2
]
=
fk
[
2
]
+
tmpdjk
*
rjkz
;
fij
[
0
]
=
fvec
::
mask_add
(
fij
[
0
],
mask
,
fij
[
0
],
tmpdjk
*
rjkx
);
fij
[
1
]
=
fvec
::
mask_add
(
fij
[
1
],
mask
,
fij
[
1
],
tmpdjk
*
rjky
);
fij
[
2
]
=
fvec
::
mask_add
(
fij
[
2
],
mask
,
fij
[
2
],
tmpdjk
*
rjkz
);
if
(
itype
)
{
fvec
tmp21
=
pref
*
wik
*
g
*
ex_lam
*
c_4_0
;
fvec
tmp21ij
=
tmp21
*
invrijm
;
fij
[
0
]
=
fvec
::
mask_sub
(
fij
[
0
],
mask
,
fij
[
0
],
tmp21ij
*
rijx
);
fij
[
1
]
=
fvec
::
mask_sub
(
fij
[
1
],
mask
,
fij
[
1
],
tmp21ij
*
rijy
);
fij
[
2
]
=
fvec
::
mask_sub
(
fij
[
2
],
mask
,
fij
[
2
],
tmp21ij
*
rijz
);
fvec
tmp21ik
=
tmp21
*
invrikm
;
fi
[
0
]
=
fi
[
0
]
+
tmp21ik
*
rikx
;
fi
[
1
]
=
fi
[
1
]
+
tmp21ik
*
riky
;
fi
[
2
]
=
fi
[
2
]
+
tmp21ik
*
rikz
;
fk
[
0
]
=
fk
[
0
]
-
tmp21ik
*
rikx
;
fk
[
1
]
=
fk
[
1
]
-
tmp21ik
*
riky
;
fk
[
2
]
=
fk
[
2
]
-
tmp21ik
*
rikz
;
}
// coordination forces
// dwik forces
fvec
tmp22
=
pref
*
dwik
*
g
*
ex_lam
*
invrikm
;
fi
[
0
]
=
fi
[
0
]
-
tmp22
*
rikx
;
fi
[
1
]
=
fi
[
1
]
-
tmp22
*
riky
;
fi
[
2
]
=
fi
[
2
]
-
tmp22
*
rikz
;
fk
[
0
]
=
fk
[
0
]
+
tmp22
*
rikx
;
fk
[
1
]
=
fk
[
1
]
+
tmp22
*
riky
;
fk
[
2
]
=
fk
[
2
]
+
tmp22
*
rikz
;
// PIJ forces
fvec
dN2ktype
=
fvec
::
mask_blend
(
ktype_mask
,
dN2
[
0
],
dN2
[
1
]);
fvec
tmp23
=
pref
*
dN2ktype
*
dwik
*
invrikm
;
fi
[
0
]
=
fi
[
0
]
-
tmp23
*
rikx
;
fi
[
1
]
=
fi
[
1
]
-
tmp23
*
riky
;
fi
[
2
]
=
fi
[
2
]
-
tmp23
*
rikz
;
fk
[
0
]
=
fk
[
0
]
+
tmp23
*
rikx
;
fk
[
1
]
=
fk
[
1
]
+
tmp23
*
riky
;
fk
[
2
]
=
fk
[
2
]
+
tmp23
*
rikz
;
// dgdN forces
fvec
tmp24
=
pref
*
sum_dpij_dN
*
dwik
*
invrikm
;
fi
[
0
]
=
fi
[
0
]
-
tmp24
*
rikx
;
fi
[
1
]
=
fi
[
1
]
-
tmp24
*
riky
;
fi
[
2
]
=
fi
[
2
]
-
tmp24
*
rikz
;
fk
[
0
]
=
fk
[
0
]
+
tmp24
*
rikx
;
fk
[
1
]
=
fk
[
1
]
+
tmp24
*
riky
;
fk
[
2
]
=
fk
[
2
]
+
tmp24
*
rikz
;
result_f_i_x
=
fvec
::
mask_add
(
result_f_i_x
,
mask
,
result_f_i_x
,
fi
[
0
]);
result_f_i_y
=
fvec
::
mask_add
(
result_f_i_y
,
mask
,
result_f_i_y
,
fi
[
1
]);
result_f_i_z
=
fvec
::
mask_add
(
result_f_i_z
,
mask
,
result_f_i_z
,
fi
[
2
]);
result_f_j_x
=
fvec
::
mask_add
(
result_f_j_x
,
mask
,
result_f_j_x
,
fj
[
0
]);
result_f_j_y
=
fvec
::
mask_add
(
result_f_j_y
,
mask
,
result_f_j_y
,
fj
[
1
]);
result_f_j_z
=
fvec
::
mask_add
(
result_f_j_z
,
mask
,
result_f_j_z
,
fj
[
2
]);
data
->
force_k_x_buf
[
buf_idx
]
=
fk
[
0
];
data
->
force_k_y_buf
[
buf_idx
]
=
fk
[
1
];
data
->
force_k_z_buf
[
buf_idx
]
=
fk
[
2
];
}
data
->
force_i_x
=
result_f_i_x
;
data
->
force_i_y
=
result_f_i_y
;
data
->
force_i_z
=
result_f_i_z
;
data
->
force_j_x
=
result_f_j_x
;
data
->
force_j_y
=
result_f_j_y
;
data
->
force_j_z
=
result_f_j_z
;
return
pij
;
}
exceed_buffer:
data
->
buf_len
=
-
1
;
return
fvec
::
setzero
();
}
/*
* Apply the force values stored iin aut_frebo_data to
* the respective neighbors.
*/
static
void
aut_frebo_data_writeback
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
_noalias
ka
,
struct
aut_frebo_data
*
_noalias
data
)
{
ResultForceT
<
acc_t
>
*
_noalias
result_f
=
ka
->
result_f
;
flt_t
fk_x_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fk_y_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fk_z_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
fk_k_buf
[
ivec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
buf_idx
;
for
(
buf_idx
=
0
;
buf_idx
<
data
->
buf_len
;
buf_idx
++
)
{
ivec
k
=
data
->
k_buf
[
buf_idx
];
bvec
active_mask
=
data
->
mask_buf
[
buf_idx
];
fvec
::
store
(
fk_x_buf
,
data
->
force_k_x_buf
[
buf_idx
]);
fvec
::
store
(
fk_y_buf
,
data
->
force_k_y_buf
[
buf_idx
]);
fvec
::
store
(
fk_z_buf
,
data
->
force_k_z_buf
[
buf_idx
]);
ivec
::
store
(
fk_k_buf
,
k
);
int
lane
;
for
(
lane
=
0
;
lane
<
fvec
::
VL
;
lane
++
)
{
if
(
bvec
::
test_at
(
active_mask
,
lane
))
{}
else
continue
;
int
kk
=
fk_k_buf
[
lane
];
result_f
[
kk
].
x
+=
fk_x_buf
[
lane
];
result_f
[
kk
].
y
+=
fk_y_buf
[
lane
];
result_f
[
kk
].
z
+=
fk_z_buf
[
lane
];
}
}
}
static
void
aut_frebo_N_spline_force
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
_noalias
ka
,
struct
aut_frebo_data
*
_noalias
data
,
int
itype
,
int
jtype
,
ivec
vi
,
ivec
vj
,
fvec
VA
,
fvec
dN
,
fvec
dNconj
,
fvec
Nconj
)
{
ivec
c_i1
=
ivec
::
set1
(
1
);
fvec
c_2
=
fvec
::
set1
(
2
);
fvec
c_TOL
=
fvec
::
set1
(
TOL
);
ResultForceT
<
acc_t
>
*
_noalias
result_f
=
ka
->
result_f
;
AtomAIREBOT
<
flt_t
>
*
_noalias
x
=
ka
->
x
;
int
*
_noalias
map
=
ka
->
map
;
flt_t
*
_noalias
nC
=
ka
->
nC
;
flt_t
*
_noalias
nH
=
ka
->
nH
;
fvec
x_i
,
y_i
,
z_i
;
x_i
=
data
->
x_i
;
y_i
=
data
->
y_i
;
z_i
=
data
->
z_i
;
fvec
Nmin
=
fvec
::
set1
(
ka
->
params
.
Nmin
);
fvec
Nmax
=
fvec
::
set1
(
ka
->
params
.
Nmax
);
int
map_i_scalar
=
0
;
{
int
i
;
for
(
i
=
1
;
i
<
ka
->
num_types
;
i
++
)
{
if
(
ka
->
map
[
i
])
map_i_scalar
|=
(
1
<<
i
);
}
}
ivec
map_i
=
ivec
::
set1
(
map_i_scalar
);
fvec
dN2
[
2
];
ivec
kk
=
ivec
::
setzero
();
fvec
rcmax0
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
0
]);
fvec
rcmax1
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
1
]);
fvec
rcmin0
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
0
]);
fvec
rcmin1
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
1
]);
fvec
result_f_i_x
=
fvec
::
setzero
();
fvec
result_f_i_y
=
fvec
::
setzero
();
fvec
result_f_i_z
=
fvec
::
setzero
();
int
buf_idx
;
for
(
buf_idx
=
0
;
buf_idx
<
data
->
buf_len
;
buf_idx
++
)
{
ivec
k
=
data
->
k_buf
[
buf_idx
];
bvec
active_mask
=
data
->
mask_buf
[
buf_idx
];
fvec
rikx
=
data
->
rikx_buf
[
buf_idx
];
fvec
riky
=
data
->
riky_buf
[
buf_idx
];
fvec
rikz
=
data
->
rikz_buf
[
buf_idx
];
fvec
rikmag
=
data
->
rikmag_buf
[
buf_idx
];
bvec
ktype_mask
=
data
->
ktype_buf
[
buf_idx
];
fvec
dwik
=
data
->
dwik_buf
[
buf_idx
];
fvec
wik
=
data
->
wik_buf
[
buf_idx
];
fvec
dNki
=
data
->
dcutN_buf
[
buf_idx
];
fvec
SpN
=
data
->
cutN_buf
[
buf_idx
];
fvec
invrikmag
=
fvec
::
recip
(
rikmag
);
fvec
pref
=
VA
*
dwik
*
invrikmag
;
fvec
fdN
=
dN
*
pref
;
fvec
fdNconj
=
pref
*
SpN
*
c_2
*
dNconj
*
Nconj
;
fvec
ffactor
=
fdN
;
bvec
ktype_is_C
=
~
ktype_mask
;
ffactor
=
fvec
::
mask_add
(
ffactor
,
ktype_is_C
,
ffactor
,
fdNconj
);
fvec
fkx
=
ffactor
*
rikx
;
fvec
fky
=
ffactor
*
riky
;
fvec
fkz
=
ffactor
*
rikz
;
data
->
force_k_x_buf
[
buf_idx
]
=
data
->
force_k_x_buf
[
buf_idx
]
+
fkx
;
data
->
force_k_y_buf
[
buf_idx
]
=
data
->
force_k_y_buf
[
buf_idx
]
+
fky
;
data
->
force_k_z_buf
[
buf_idx
]
=
data
->
force_k_z_buf
[
buf_idx
]
+
fkz
;
result_f_i_x
=
fvec
::
mask_sub
(
result_f_i_x
,
active_mask
,
result_f_i_x
,
fkx
);
result_f_i_y
=
fvec
::
mask_sub
(
result_f_i_y
,
active_mask
,
result_f_i_y
,
fky
);
result_f_i_z
=
fvec
::
mask_sub
(
result_f_i_z
,
active_mask
,
result_f_i_z
,
fkz
);
bvec
need_k_neighs
=
fvec
::
mask_cmpnle
(
active_mask
,
fvec
::
abs
(
dNki
),
c_TOL
)
&
ktype_is_C
;
if
(
bvec
::
test_any_set
(
need_k_neighs
))
{
int
lane
;
for
(
lane
=
0
;
lane
<
fvec
::
VL
;
lane
++
)
{
if
(
!
bvec
::
test_at
(
need_k_neighs
,
lane
))
continue
;
int
kk
=
ivec
::
at
(
k
,
lane
);
int
k
=
kk
;
int
ktype
=
map
[
x
[
k
].
w
];
int
i
=
ivec
::
at
(
vi
,
lane
);
fvec
oldVA
=
VA
;
double
VA
=
fvec
::
at
(
oldVA
,
lane
);
fvec
oldwik
=
wik
;
double
wik
=
fvec
::
at
(
oldwik
,
lane
);
fvec
olddNconj
=
dNconj
;
double
dNconj
=
fvec
::
at
(
olddNconj
,
lane
);
fvec
oldNconj
=
Nconj
;
double
Nconj
=
fvec
::
at
(
oldNconj
,
lane
);
fvec
olddNki
=
dNki
;
double
dNki
=
fvec
::
at
(
olddNki
,
lane
);
int
*
neighs_k
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
k
];
int
nnum
=
ka
->
neigh_rebo
.
num
[
k
];
int
nn
;
for
(
nn
=
0
;
nn
<
nnum
;
nn
++
)
{
int
n
=
neighs_k
[
nn
];
if
(
n
==
i
)
continue
;
double
rknx
=
x
[
k
].
x
-
x
[
n
].
x
;
double
rkny
=
x
[
k
].
y
-
x
[
n
].
y
;
double
rknz
=
x
[
k
].
z
-
x
[
n
].
z
;
double
rknmag
=
sqrt
(
rknx
*
rknx
+
rkny
*
rkny
+
rknz
*
rknz
);
int
ntype
=
map
[
x
[
n
].
w
];
double
rcminkn
=
ka
->
params
.
rcmin
[
ktype
][
ntype
];
double
rcmaxkn
=
ka
->
params
.
rcmax
[
ktype
][
ntype
];
double
dwkn
;
Sp
(
rknmag
,
rcminkn
,
rcmaxkn
,
&
dwkn
);
double
ffactor
=
VA
*
dNconj
*
2
*
Nconj
*
wik
*
dNki
*
dwkn
/
rknmag
;
result_f
[
k
].
x
-=
ffactor
*
rknx
;
result_f
[
k
].
y
-=
ffactor
*
rkny
;
result_f
[
k
].
z
-=
ffactor
*
rknz
;
result_f
[
n
].
x
+=
ffactor
*
rknx
;
result_f
[
n
].
y
+=
ffactor
*
rkny
;
result_f
[
n
].
z
+=
ffactor
*
rknz
;
}
}
}
}
data
->
force_i_x
=
data
->
force_i_x
+
result_f_i_x
;
data
->
force_i_y
=
data
->
force_i_y
+
result_f_i_y
;
data
->
force_i_z
=
data
->
force_i_z
+
result_f_i_z
;
}
static
fvec
aut_frebo_pi_rc_pd
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
fvec
Nij
,
fvec
Nji
,
fvec
Nijconj
,
fvec
*
dN3
)
{
flt_t
ret
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
dN3ret
[
3
][
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
i
;
for
(
i
=
0
;
i
<
fvec
::
VL
;
i
++
)
{
flt_t
dN3tmp
[
3
];
ret
[
i
]
=
frebo_pi_rc
(
ka
,
itype
,
jtype
,
fvec
::
at
(
Nij
,
i
),
fvec
::
at
(
Nji
,
i
),
fvec
::
at
(
Nijconj
,
i
),
&
dN3tmp
[
0
]);
dN3ret
[
0
][
i
]
=
dN3tmp
[
0
];
dN3ret
[
1
][
i
]
=
dN3tmp
[
1
];
dN3ret
[
2
][
i
]
=
dN3tmp
[
2
];
}
dN3
[
0
]
=
fvec
::
load
(
&
dN3ret
[
0
][
0
]);
dN3
[
1
]
=
fvec
::
load
(
&
dN3ret
[
1
][
0
]);
dN3
[
2
]
=
fvec
::
load
(
&
dN3ret
[
2
][
0
]);
return
fvec
::
load
(
&
ret
[
0
]);
}
static
fvec
aut_frebo_Tij
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
fvec
Nij
,
fvec
Nji
,
fvec
Nijconj
,
fvec
*
dN3
)
{
flt_t
ret
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
dN3ret
[
3
][
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
i
;
for
(
i
=
0
;
i
<
fvec
::
VL
;
i
++
)
{
flt_t
dN3tmp
[
3
];
ret
[
i
]
=
frebo_Tij
(
ka
,
itype
,
jtype
,
fvec
::
at
(
Nij
,
i
),
fvec
::
at
(
Nji
,
i
),
fvec
::
at
(
Nijconj
,
i
),
&
dN3tmp
[
0
]);
dN3ret
[
0
][
i
]
=
dN3tmp
[
0
];
dN3ret
[
1
][
i
]
=
dN3tmp
[
1
];
dN3ret
[
2
][
i
]
=
dN3tmp
[
2
];
}
dN3
[
0
]
=
fvec
::
load
(
&
dN3ret
[
0
][
0
]);
dN3
[
1
]
=
fvec
::
load
(
&
dN3ret
[
1
][
0
]);
dN3
[
2
]
=
fvec
::
load
(
&
dN3ret
[
2
][
0
]);
return
fvec
::
load
(
&
ret
[
0
]);
}
static
fvec
aut_frebo_sum_omega
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
_noalias
ka
,
struct
aut_frebo_data
*
_noalias
i_data
,
struct
aut_frebo_data
*
_noalias
j_data
,
int
itype
,
int
jtype
,
ivec
vi
,
ivec
vj
,
fvec
r23x
,
fvec
r23y
,
fvec
r23z
,
fvec
r23mag
,
fvec
VA
,
fvec
fij
[
3
]
)
{
fvec
c_1
=
fvec
::
set1
(
1
);
fvec
c_m1
=
fvec
::
set1
(
-
1
);
fvec
c_2
=
fvec
::
set1
(
2
);
fvec
c_m2
=
fvec
::
set1
(
-
2
);
fvec
sum_omega
=
fvec
::
setzero
();
fvec
thmin
=
fvec
::
set1
(
ka
->
params
.
thmin
);
fvec
thmax
=
fvec
::
set1
(
ka
->
params
.
thmax
);
// 2 == i, 3 == j
fvec
r32x
=
fvec
::
setzero
()
-
r23x
;
fvec
r32y
=
fvec
::
setzero
()
-
r23y
;
fvec
r32z
=
fvec
::
setzero
()
-
r23z
;
int
buf_idx_i
,
buf_idx_j
;
for
(
buf_idx_i
=
0
;
buf_idx_i
<
i_data
->
buf_len
;
buf_idx_i
++
)
{
// a1 == k == buf_idx_i
bvec
mask_start
=
i_data
->
mask_buf
[
buf_idx_i
];
fvec
r21x
=
i_data
->
rikx_buf
[
buf_idx_i
];
// a2 - a1 -> i - k
fvec
r21y
=
i_data
->
riky_buf
[
buf_idx_i
];
// a2 - a1 -> i - k
fvec
r21z
=
i_data
->
rikz_buf
[
buf_idx_i
];
// a2 - a1 -> i - k
fvec
r21mag
=
i_data
->
rikmag_buf
[
buf_idx_i
];
// TODO use buffered cosjik
fvec
cos321
=
(
r23x
*
r21x
+
r23y
*
r21y
+
r23z
*
r21z
)
/
(
r23mag
*
r21mag
);
cos321
=
fvec
::
min
(
c_1
,
fvec
::
max
(
c_m1
,
cos321
));
fvec
sin321
=
fvec
::
sqrt
(
c_1
-
cos321
*
cos321
);
bvec
mask_outer
=
fvec
::
cmpneq
(
fvec
::
setzero
(),
sin321
)
&
mask_start
;
// add "continue"
fvec
sink2i
=
fvec
::
mask_recip
(
fvec
::
undefined
(),
mask_outer
,
sin321
*
sin321
);
fvec
rik2i
=
fvec
::
mask_recip
(
fvec
::
undefined
(),
mask_outer
,
r21mag
*
r21mag
);
fvec
rr
=
r23mag
*
r23mag
-
r21mag
*
r21mag
;
fvec
r31x
=
r21x
-
r23x
;
fvec
r31y
=
r21y
-
r23y
;
fvec
r31z
=
r21z
-
r23z
;
fvec
r31mag2
=
r31x
*
r31x
+
r31y
*
r31y
+
r31z
*
r31z
;
fvec
rijrik
=
c_2
*
r23mag
*
r21mag
;
fvec
r21mag2
=
r21mag
*
r21mag
;
fvec
dctik
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_outer
,
r31mag2
-
rr
,
rijrik
*
r21mag2
);
fvec
dctij
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_outer
,
r31mag2
+
rr
,
rijrik
*
r23mag
*
r23mag
);
fvec
dctjk
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_outer
,
c_m2
,
rijrik
);
fvec
dw21
=
i_data
->
dwik_buf
[
buf_idx_i
];
fvec
w21
=
i_data
->
wik_buf
[
buf_idx_i
];
fvec
dtsjik
;
fvec
tspjik
=
aut_Sp2_deriv
(
cos321
,
thmin
,
thmax
,
&
dtsjik
);
dtsjik
=
fvec
::
setzero
()
-
dtsjik
;
// todo replace by appropriate xor.
ivec
k
=
i_data
->
k_buf
[
buf_idx_i
];
for
(
buf_idx_j
=
0
;
buf_idx_j
<
j_data
->
buf_len
;
buf_idx_j
++
)
{
// check l == k in second loop.
// l == a4 == buf_idx_j
ivec
l
=
j_data
->
k_buf
[
buf_idx_j
];
bvec
mask_inner_0
=
ivec
::
mask_cmpneq
(
mask_outer
,
k
,
l
)
&
j_data
->
mask_buf
[
buf_idx_j
];
// add "continue"
fvec
r34x
=
j_data
->
rikx_buf
[
buf_idx_j
];
fvec
r34y
=
j_data
->
riky_buf
[
buf_idx_j
];
fvec
r34z
=
j_data
->
rikz_buf
[
buf_idx_j
];
fvec
r34mag
=
j_data
->
rikmag_buf
[
buf_idx_j
];
fvec
cos234
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_0
,
r32x
*
r34x
+
r32y
*
r34y
+
r32z
*
r34z
,
r23mag
*
r34mag
);
cos234
=
fvec
::
min
(
c_1
,
fvec
::
max
(
c_m1
,
cos234
));
fvec
sin234
=
fvec
::
mask_sqrt
(
fvec
::
undefined
(),
mask_inner_0
,
c_1
-
cos234
*
cos234
);
bvec
mask_inner_1
=
fvec
::
mask_cmpneq
(
mask_inner_0
,
sin234
,
fvec
::
setzero
());
// add "continue"
fvec
sinl2i
=
fvec
::
mask_recip
(
fvec
::
undefined
(),
mask_inner_1
,
sin234
*
sin234
);
fvec
rjl2i
=
fvec
::
mask_recip
(
fvec
::
undefined
(),
mask_inner_1
,
r34mag
*
r34mag
);
fvec
dw34
=
j_data
->
dwik_buf
[
buf_idx_j
];
fvec
w34
=
j_data
->
wik_buf
[
buf_idx_j
];
fvec
rr
=
r23mag
*
r23mag
-
r34mag
*
r34mag
;
fvec
r24x
=
r23x
+
r34x
;
fvec
r24y
=
r23y
+
r34y
;
fvec
r24z
=
r23z
+
r34z
;
fvec
r242
=
r24x
*
r24x
+
r24y
*
r24y
+
r24z
*
r24z
;
fvec
rijrjl
=
c_2
*
r23mag
*
r34mag
;
fvec
rjl2
=
r34mag
*
r34mag
;
fvec
dctjl
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_1
,
r242
-
rr
,
rijrjl
*
rjl2
);
fvec
dctji
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_1
,
r242
+
rr
,
rijrjl
*
r23mag
*
r23mag
);
fvec
dctil
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_1
,
c_m2
,
rijrjl
);
fvec
dtsijl
;
fvec
tspijl
=
aut_Sp2_deriv
(
cos234
,
thmin
,
thmax
,
&
dtsijl
);
dtsijl
=
fvec
::
setzero
()
-
dtsijl
;
fvec
prefactor
=
VA
;
fvec
cross321x
=
r32y
*
r21z
-
r32z
*
r21y
;
fvec
cross321y
=
r32z
*
r21x
-
r32x
*
r21z
;
fvec
cross321z
=
r32x
*
r21y
-
r32y
*
r21x
;
fvec
cross234x
=
r23y
*
r34z
-
r23z
*
r34y
;
fvec
cross234y
=
r23z
*
r34x
-
r23x
*
r34z
;
fvec
cross234z
=
r23x
*
r34y
-
r23y
*
r34x
;
fvec
cwnum
=
cross321x
*
cross234x
+
cross321y
*
cross234y
+
cross321z
*
cross234z
;
fvec
cwnom
=
r21mag
*
r34mag
*
r23mag
*
r23mag
*
sin321
*
sin234
;
fvec
om1234
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_1
,
cwnum
,
cwnom
);
fvec
cw
=
om1234
;
fvec
sum_omega_contrib
=
(
c_1
-
om1234
*
om1234
)
*
w21
*
w34
*
(
c_1
-
tspjik
)
*
(
c_1
-
tspijl
);
sum_omega
=
fvec
::
mask_add
(
sum_omega
,
mask_inner_1
,
sum_omega
,
sum_omega_contrib
);
fvec
dt1dik
=
rik2i
-
dctik
*
sink2i
*
cos321
;
fvec
dt1djk
=
fvec
::
setzero
()
-
dctjk
*
sink2i
*
cos321
;
fvec
dt1djl
=
rjl2i
-
dctjl
*
sinl2i
*
cos234
;
fvec
dt1dil
=
fvec
::
setzero
()
-
dctil
*
sinl2i
*
cos234
;
fvec
dt1dij
=
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_1
,
c_2
,
r23mag
*
r23mag
)
-
dctij
*
sink2i
*
cos321
-
dctji
*
sinl2i
*
cos234
;
fvec
dt2dikx
=
r23y
*
cross234z
-
r23z
*
cross234y
;
fvec
dt2diky
=
r23z
*
cross234x
-
r23x
*
cross234z
;
fvec
dt2dikz
=
r23x
*
cross234y
-
r23y
*
cross234x
;
fvec
dt2djlx
=
r23z
*
cross321y
-
r23y
*
cross321z
;
fvec
dt2djly
=
r23x
*
cross321z
-
r23z
*
cross321x
;
fvec
dt2djlz
=
r23y
*
cross321x
-
r23x
*
cross321y
;
fvec
dt2dijx
=
r21z
*
cross234y
+
r34y
*
cross321z
-
(
r34z
*
cross321y
+
r21y
*
cross234z
);
fvec
dt2dijy
=
r21x
*
cross234z
+
r34z
*
cross321x
-
(
r34x
*
cross321z
+
r21z
*
cross234x
);
fvec
dt2dijz
=
r21y
*
cross234x
+
r34x
*
cross321y
-
(
r34y
*
cross321x
+
r21x
*
cross234y
);
fvec
aa
=
prefactor
*
c_2
*
fvec
::
mask_div
(
fvec
::
undefined
(),
mask_inner_1
,
cw
,
cwnom
)
*
w21
*
w34
*
(
c_1
-
tspjik
)
*
(
c_1
-
tspijl
);
fvec
aaa1
=
(
fvec
::
setzero
()
-
prefactor
)
*
(
c_1
-
om1234
*
om1234
)
*
(
c_1
-
tspjik
)
*
(
c_1
-
tspijl
);
fvec
aaa2
=
(
fvec
::
setzero
()
-
prefactor
)
*
(
c_1
-
om1234
*
om1234
)
*
w21
*
w34
;
fvec
at2
=
aa
*
cwnum
;
fvec
fcijpc
=
aaa2
*
dtsjik
*
dctij
*
(
c_1
-
tspijl
)
+
aaa2
*
dtsijl
*
dctji
*
(
c_1
-
tspjik
)
-
dt1dij
*
at2
;
fvec
fcikpc
=
aaa2
*
dtsjik
*
dctik
*
(
c_1
-
tspijl
)
-
dt1dik
*
at2
;
fvec
fcjlpc
=
aaa2
*
dtsijl
*
dctjl
*
(
c_1
-
tspjik
)
-
dt1djl
*
at2
;
fvec
fcjkpc
=
aaa2
*
dtsjik
*
dctjk
*
(
c_1
-
tspijl
)
-
dt1djk
*
at2
;
fvec
fcilpc
=
aaa2
*
dtsijl
*
dctil
*
(
c_1
-
tspjik
)
-
dt1dil
*
at2
;
fvec
F23x
=
fcijpc
*
r23x
+
aa
*
dt2dijx
;
fvec
F23y
=
fcijpc
*
r23y
+
aa
*
dt2dijy
;
fvec
F23z
=
fcijpc
*
r23z
+
aa
*
dt2dijz
;
fvec
F12x
=
fcikpc
*
r21x
+
aa
*
dt2dikx
;
fvec
F12y
=
fcikpc
*
r21y
+
aa
*
dt2diky
;
fvec
F12z
=
fcikpc
*
r21z
+
aa
*
dt2dikz
;
fvec
F34x
=
fcjlpc
*
r34x
+
aa
*
dt2djlx
;
fvec
F34y
=
fcjlpc
*
r34y
+
aa
*
dt2djly
;
fvec
F34z
=
fcjlpc
*
r34z
+
aa
*
dt2djlz
;
fvec
F31x
=
fcjkpc
*
r31x
;
fvec
F31y
=
fcjkpc
*
r31y
;
fvec
F31z
=
fcjkpc
*
r31z
;
fvec
F24x
=
fcilpc
*
r24x
;
fvec
F24y
=
fcilpc
*
r24y
;
fvec
F24z
=
fcilpc
*
r24z
;
fvec
f1x
=
fvec
::
setzero
()
-
(
F12x
+
F31x
);
fvec
f1y
=
fvec
::
setzero
()
-
(
F12y
+
F31y
);
fvec
f1z
=
fvec
::
setzero
()
-
(
F12z
+
F31z
);
fvec
f2x
=
F12x
+
F31x
;
fvec
f2y
=
F12y
+
F31y
;
fvec
f2z
=
F12z
+
F31z
;
fvec
f3x
=
F34x
+
F24x
;
fvec
f3y
=
F34y
+
F24y
;
fvec
f3z
=
F34z
+
F24z
;
fvec
f4x
=
fvec
::
setzero
()
-
(
F34x
+
F24x
);
fvec
f4y
=
fvec
::
setzero
()
-
(
F34y
+
F24y
);
fvec
f4z
=
fvec
::
setzero
()
-
(
F34z
+
F24z
);
fij
[
0
]
=
fvec
::
mask_add
(
fij
[
0
],
mask_inner_1
,
fij
[
0
],
F23x
+
F24x
-
F31x
);
fij
[
1
]
=
fvec
::
mask_add
(
fij
[
1
],
mask_inner_1
,
fij
[
1
],
F23y
+
F24y
-
F31y
);
fij
[
2
]
=
fvec
::
mask_add
(
fij
[
2
],
mask_inner_1
,
fij
[
2
],
F23z
+
F24z
-
F31z
);
fvec
tmp20
=
VA
*
(
c_1
-
om1234
*
om1234
)
*
(
c_1
-
tspjik
)
*
(
c_1
-
tspijl
)
*
dw21
*
w34
*
fvec
::
mask_recip
(
fvec
::
undefined
(),
mask_inner_1
,
r21mag
);
f2x
=
f2x
-
tmp20
*
r21x
;
f2y
=
f2y
-
tmp20
*
r21y
;
f2z
=
f2z
-
tmp20
*
r21z
;
f1x
=
f1x
+
tmp20
*
r21x
;
f1y
=
f1y
+
tmp20
*
r21y
;
f1z
=
f1z
+
tmp20
*
r21z
;
fvec
tmp21
=
VA
*
(
c_1
-
om1234
*
om1234
)
*
(
c_1
-
tspjik
)
*
(
c_1
-
tspijl
)
*
w21
*
dw34
*
fvec
::
mask_recip
(
fvec
::
undefined
(),
mask_inner_1
,
r34mag
);
f3x
=
f3x
-
tmp21
*
r34x
;
f3y
=
f3y
-
tmp21
*
r34y
;
f3z
=
f3z
-
tmp21
*
r34z
;
f4x
=
f4x
+
tmp21
*
r34x
;
f4y
=
f4y
+
tmp21
*
r34y
;
f4z
=
f4z
+
tmp21
*
r34z
;
// 1 == buf_idx_i, 2 == i, 3 == j, 4 == buf_idx_j
i_data
->
force_k_x_buf
[
buf_idx_i
]
=
fvec
::
mask_add
(
i_data
->
force_k_x_buf
[
buf_idx_i
],
mask_inner_1
,
i_data
->
force_k_x_buf
[
buf_idx_i
],
f1x
);
i_data
->
force_k_y_buf
[
buf_idx_i
]
=
fvec
::
mask_add
(
i_data
->
force_k_y_buf
[
buf_idx_i
],
mask_inner_1
,
i_data
->
force_k_y_buf
[
buf_idx_i
],
f1y
);
i_data
->
force_k_z_buf
[
buf_idx_i
]
=
fvec
::
mask_add
(
i_data
->
force_k_z_buf
[
buf_idx_i
],
mask_inner_1
,
i_data
->
force_k_z_buf
[
buf_idx_i
],
f1z
);
i_data
->
force_i_x
=
fvec
::
mask_add
(
i_data
->
force_i_x
,
mask_inner_1
,
i_data
->
force_i_x
,
f2x
);
i_data
->
force_i_y
=
fvec
::
mask_add
(
i_data
->
force_i_y
,
mask_inner_1
,
i_data
->
force_i_y
,
f2y
);
i_data
->
force_i_z
=
fvec
::
mask_add
(
i_data
->
force_i_z
,
mask_inner_1
,
i_data
->
force_i_z
,
f2z
);
j_data
->
force_i_x
=
fvec
::
mask_add
(
j_data
->
force_i_x
,
mask_inner_1
,
j_data
->
force_i_x
,
f3x
);
j_data
->
force_i_y
=
fvec
::
mask_add
(
j_data
->
force_i_y
,
mask_inner_1
,
j_data
->
force_i_y
,
f3y
);
j_data
->
force_i_z
=
fvec
::
mask_add
(
j_data
->
force_i_z
,
mask_inner_1
,
j_data
->
force_i_z
,
f3z
);
j_data
->
force_k_x_buf
[
buf_idx_j
]
=
fvec
::
mask_add
(
j_data
->
force_k_x_buf
[
buf_idx_j
],
mask_inner_1
,
j_data
->
force_k_x_buf
[
buf_idx_j
],
f4x
);
j_data
->
force_k_y_buf
[
buf_idx_j
]
=
fvec
::
mask_add
(
j_data
->
force_k_y_buf
[
buf_idx_j
],
mask_inner_1
,
j_data
->
force_k_y_buf
[
buf_idx_j
],
f4y
);
j_data
->
force_k_z_buf
[
buf_idx_j
]
=
fvec
::
mask_add
(
j_data
->
force_k_z_buf
[
buf_idx_j
],
mask_inner_1
,
j_data
->
force_k_z_buf
[
buf_idx_j
],
f4z
);
}
}
return
sum_omega
;
}
static
fvec
aut_frebo_pi_dh
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
_noalias
ka
,
struct
aut_frebo_data
*
_noalias
i_data
,
struct
aut_frebo_data
*
_noalias
j_data
,
int
itype
,
int
jtype
,
ivec
vi
,
ivec
vj
,
fvec
r23x
,
fvec
r23y
,
fvec
r23z
,
fvec
r23mag
,
fvec
VA
,
fvec
Nij
,
fvec
Nji
,
fvec
Nijconj
,
fvec
NconjtmpI
,
fvec
NconjtmpJ
,
fvec
fij
[
3
]
)
{
fvec
c_TOL
=
fvec
::
set1
(
TOL
);
fvec
dN3
[
3
];
fvec
Tij
=
aut_frebo_Tij
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
&
dN3
[
0
]);
bvec
TijgtTOLmask
=
fvec
::
cmpnle
(
fvec
::
abs
(
Tij
),
c_TOL
);
fvec
sum_omega
=
fvec
::
setzero
();
if
(
bvec
::
test_any_set
(
TijgtTOLmask
))
{
sum_omega
=
aut_frebo_sum_omega
(
ka
,
i_data
,
j_data
,
itype
,
jtype
,
vi
,
vj
,
r23x
,
r23y
,
r23z
,
r23mag
,
VA
*
Tij
,
fij
);
sum_omega
=
fvec
::
mask_blend
(
TijgtTOLmask
,
fvec
::
setzero
(),
sum_omega
);
aut_frebo_N_spline_force
(
ka
,
i_data
,
itype
,
jtype
,
vi
,
vj
,
VA
*
sum_omega
,
dN3
[
0
],
dN3
[
2
],
NconjtmpI
);
aut_frebo_N_spline_force
(
ka
,
j_data
,
jtype
,
itype
,
vj
,
vi
,
VA
*
sum_omega
,
dN3
[
1
],
dN3
[
2
],
NconjtmpJ
);
}
return
Tij
*
sum_omega
;
}
/*
We can reuse the aut_frebo_data buffers here to do this calculation very
cheaply.
*/
static
void
aut_torsion_vec
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
struct
aut_frebo_data
*
i_data
,
struct
aut_frebo_data
*
j_data
,
ivec
i
,
ivec
j
,
fvec
wij
,
fvec
dwij
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
flt_t
(
*
epsilonT
)[
2
]
=
ka
->
params
.
epsilonT
;
fvec
epsilonT00
=
fvec
::
set1
(
epsilonT
[
0
][
0
]);
fvec
epsilonT01
=
fvec
::
set1
(
epsilonT
[
0
][
1
]);
fvec
epsilonT10
=
fvec
::
set1
(
epsilonT
[
1
][
0
]);
fvec
epsilonT11
=
fvec
::
set1
(
epsilonT
[
1
][
1
]);
fvec
thmin
=
fvec
::
set1
(
ka
->
params
.
thmin
);
fvec
thmax
=
fvec
::
set1
(
ka
->
params
.
thmax
);
const
fvec
c_1_0
=
fvec
::
set1
(
1.0
);
const
fvec
c_0_5
=
fvec
::
set1
(
0.5
);
const
fvec
c_0_1
=
fvec
::
set1
(
0.1
);
const
fvec
c_2_0
=
fvec
::
set1
(
2.0
);
const
fvec
c_2_5
=
fvec
::
set1
(
2.5
);
const
fvec
c_256_405
=
fvec
::
set1
(
256.0
/
405.0
);
fvec
del32x
=
j_data
->
x_i
-
i_data
->
x_i
;
fvec
del32y
=
j_data
->
y_i
-
i_data
->
y_i
;
fvec
del32z
=
j_data
->
z_i
-
i_data
->
z_i
;
fvec
rsq
=
del32x
*
del32x
+
del32y
*
del32y
+
del32z
*
del32z
;
fvec
r32
=
fvec
::
sqrt
(
rsq
);
fvec
del23x
=
fvec
::
setzero
()
-
del32x
;
fvec
del23y
=
fvec
::
setzero
()
-
del32y
;
fvec
del23z
=
fvec
::
setzero
()
-
del32z
;
fvec
r23
=
r32
;
fvec
w23
=
wij
;
fvec
dw23
=
dwij
;
for
(
int
buf_idx_i
=
0
;
buf_idx_i
<
i_data
->
buf_len
;
buf_idx_i
++
)
{
bvec
mask_start
=
i_data
->
mask_buf
[
buf_idx_i
];
fvec
del21x
=
i_data
->
rikx_buf
[
buf_idx_i
];
// a2 - a1 -> i - k
fvec
del21y
=
i_data
->
riky_buf
[
buf_idx_i
];
// a2 - a1 -> i - k
fvec
del21z
=
i_data
->
rikz_buf
[
buf_idx_i
];
// a2 - a1 -> i - k
fvec
r21
=
i_data
->
rikmag_buf
[
buf_idx_i
];
fvec
cos321
=
i_data
->
cosjik_buf
[
buf_idx_i
];
fvec
sin321
=
fvec
::
sqrt
(
c_1_0
-
cos321
*
cos321
);
// strictly equivalent to sin321 < TOL
mask_start
=
fvec
::
mask_cmpneq
(
mask_start
,
fvec
::
setzero
(),
sin321
);
if
(
!
bvec
::
test_any_set
(
mask_start
))
continue
;
fvec
deljkx
=
del21x
-
del23x
;
fvec
deljky
=
del21y
-
del23y
;
fvec
deljkz
=
del21z
-
del23z
;
fvec
rjk2
=
deljkx
*
deljkx
+
deljky
*
deljky
+
deljkz
*
deljkz
;
fvec
rjk
=
fvec
::
sqrt
(
rjk2
);
fvec
rik2
=
r21
*
r21
;
fvec
w21
=
i_data
->
wik_buf
[
buf_idx_i
];
fvec
dw21
=
i_data
->
dwik_buf
[
buf_idx_i
];
fvec
rij
=
r32
;
fvec
rik
=
r21
;
fvec
rij2
=
r32
*
r32
;
fvec
dtsjik
;
fvec
tspjik
=
aut_Sp2_deriv
(
cos321
,
thmin
,
thmax
,
&
dtsjik
);
dtsjik
=
fvec
::
setzero
()
-
dtsjik
;
bvec
ktype_mask
=
i_data
->
ktype_buf
[
buf_idx_i
];
fvec
epsilonT0
=
fvec
::
mask_blend
(
ktype_mask
,
epsilonT00
,
epsilonT10
);
fvec
epsilonT1
=
fvec
::
mask_blend
(
ktype_mask
,
epsilonT01
,
epsilonT11
);
ivec
k
=
i_data
->
k_buf
[
buf_idx_i
];
for
(
int
buf_idx_j
=
0
;
buf_idx_j
<
j_data
->
buf_len
;
buf_idx_j
++
)
{
ivec
l
=
j_data
->
k_buf
[
buf_idx_j
];
bvec
mask_inner_0
=
ivec
::
mask_cmpneq
(
mask_start
,
k
,
l
)
&
j_data
->
mask_buf
[
buf_idx_j
];
if
(
!
bvec
::
test_any_set
(
mask_inner_0
))
continue
;
fvec
del34x
=
j_data
->
rikx_buf
[
buf_idx_j
];
fvec
del34y
=
j_data
->
riky_buf
[
buf_idx_j
];
fvec
del34z
=
j_data
->
rikz_buf
[
buf_idx_j
];
fvec
r34
=
j_data
->
rikmag_buf
[
buf_idx_j
];
bvec
ltype_mask
=
j_data
->
ktype_buf
[
buf_idx_j
];
fvec
cos234
=
j_data
->
cosjik_buf
[
buf_idx_j
];
fvec
sin234
=
fvec
::
sqrt
(
c_1_0
-
cos234
*
cos234
);
// strictly equivalent to sin234 < TOL
mask_inner_0
=
fvec
::
mask_cmpneq
(
mask_inner_0
,
sin234
,
fvec
::
setzero
());
if
(
!
bvec
::
test_any_set
(
mask_inner_0
))
continue
;
fvec
dw34
=
j_data
->
dwik_buf
[
buf_idx_j
];
fvec
w34
=
j_data
->
wik_buf
[
buf_idx_j
];
fvec
delilx
=
del23x
+
del34x
;
fvec
delily
=
del23y
+
del34y
;
fvec
delilz
=
del23z
+
del34z
;
fvec
ril2
=
delilx
*
delilx
+
delily
*
delily
+
delilz
*
delilz
;
fvec
ril
=
fvec
::
sqrt
(
ril2
);
fvec
rjl2
=
r34
*
r34
;
fvec
rjl
=
r34
;
fvec
dtsijl
;
fvec
tspijl
=
aut_Sp2_deriv
(
cos234
,
thmin
,
thmax
,
&
dtsijl
);
dtsijl
=
fvec
::
setzero
()
-
dtsijl
;
fvec
cross321x
=
del32y
*
del21z
-
del32z
*
del21y
;
fvec
cross321y
=
del32z
*
del21x
-
del32x
*
del21z
;
fvec
cross321z
=
del32x
*
del21y
-
del32y
*
del21x
;
fvec
cross321mag
=
fvec
::
sqrt
(
cross321x
*
cross321x
+
cross321y
*
cross321y
+
cross321z
*
cross321z
);
fvec
cross234x
=
del23y
*
del34z
-
del23z
*
del34y
;
fvec
cross234y
=
del23z
*
del34x
-
del23x
*
del34z
;
fvec
cross234z
=
del23x
*
del34y
-
del23y
*
del34x
;
fvec
cross234mag
=
fvec
::
sqrt
(
cross234x
*
cross234x
+
cross234y
*
cross234y
+
cross234z
*
cross234z
);
fvec
cwnum
=
cross321x
*
cross234x
+
cross321y
*
cross234y
+
cross321z
*
cross234z
;
fvec
cwnom
=
r21
*
r34
*
r32
*
r32
*
sin321
*
sin234
;
fvec
cw
=
cwnum
/
cwnom
;
fvec
cw2
=
c_0_5
*
(
c_1_0
-
cw
);
fvec
ekijl
=
fvec
::
mask_blend
(
ltype_mask
,
epsilonT0
,
epsilonT1
);
fvec
Ec
=
c_256_405
*
ekijl
;
fvec
cw2_5
=
cw2
*
cw2
*
cw2
*
cw2
*
cw2
;
fvec
Vtors
=
Ec
*
cw2_5
-
ekijl
*
c_0_1
;
fvec
evdwl
=
Vtors
*
w21
*
w23
*
w34
*
(
c_1_0
-
tspjik
)
*
(
c_1_0
-
tspijl
);
ka
->
result_eng
+=
fvec
::
mask_reduce_add
(
mask_inner_0
,
evdwl
);
fvec
dndijx
=
cross234y
*
del21z
-
cross234z
*
del21y
;
fvec
dndijy
=
cross234z
*
del21x
-
cross234x
*
del21z
;
fvec
dndijz
=
cross234x
*
del21y
-
cross234y
*
del21x
;
fvec
tmpvecx
=
del34y
*
cross321z
-
del34z
*
cross321y
;
fvec
tmpvecy
=
del34z
*
cross321x
-
del34x
*
cross321z
;
fvec
tmpvecz
=
del34x
*
cross321y
-
del34y
*
cross321x
;
dndijx
=
dndijx
+
tmpvecx
;
dndijy
=
dndijy
+
tmpvecy
;
dndijz
=
dndijz
+
tmpvecz
;
fvec
dndikx
=
del23y
*
cross234z
-
del23z
*
cross234y
;
fvec
dndiky
=
del23z
*
cross234x
-
del23x
*
cross234z
;
fvec
dndikz
=
del23x
*
cross234y
-
del23y
*
cross234x
;
fvec
dndjlx
=
cross321y
*
del23z
-
cross321z
*
del23y
;
fvec
dndjly
=
cross321z
*
del23x
-
cross321x
*
del23z
;
fvec
dndjlz
=
cross321x
*
del23y
-
cross321y
*
del23x
;
fvec
r23sq
=
r23
*
r23
;
fvec
r21sq
=
r21
*
r21
;
fvec
r34sq
=
r34
*
r34
;
fvec
rjksq
=
rjk
*
rjk
;
fvec
rilsq
=
ril
*
ril
;
fvec
dcidij
=
(
r23sq
-
r21sq
+
rjksq
)
/
(
c_2_0
*
r23sq
*
r21
);
fvec
dcidik
=
(
r21sq
-
r23sq
+
rjksq
)
/
(
c_2_0
*
r21sq
*
r23
);
fvec
dcidjk
=
fvec
::
setzero
()
-
rjk
/
(
r23
*
r21
);
fvec
dcjdji
=
(
r23sq
-
r34sq
+
rilsq
)
/
(
c_2_0
*
r23sq
*
r34
);
fvec
dcjdjl
=
(
r34sq
-
r23sq
+
rilsq
)
/
(
c_2_0
*
r34sq
*
r23
);
fvec
dcjdil
=
fvec
::
setzero
()
-
ril
/
(
r23
*
r34
);
fvec
dsidij
=
fvec
::
setzero
()
-
cos321
/
sin321
*
dcidij
;
fvec
dsidik
=
fvec
::
setzero
()
-
cos321
/
sin321
*
dcidik
;
fvec
dsidjk
=
fvec
::
setzero
()
-
cos321
/
sin321
*
dcidjk
;
fvec
dsjdji
=
fvec
::
setzero
()
-
cos234
/
sin234
*
dcjdji
;
fvec
dsjdjl
=
fvec
::
setzero
()
-
cos234
/
sin234
*
dcjdjl
;
fvec
dsjdil
=
fvec
::
setzero
()
-
cos234
/
sin234
*
dcjdil
;
fvec
dxidij
=
r21
*
sin321
+
r23
*
r21
*
dsidij
;
fvec
dxidik
=
r23
*
sin321
+
r23
*
r21
*
dsidik
;
fvec
dxidjk
=
r23
*
r21
*
dsidjk
;
fvec
dxjdji
=
r34
*
sin234
+
r23
*
r34
*
dsjdji
;
fvec
dxjdjl
=
r23
*
sin234
+
r23
*
r34
*
dsjdjl
;
fvec
dxjdil
=
r23
*
r34
*
dsjdil
;
fvec
ddndij
=
dxidij
*
cross234mag
+
cross321mag
*
dxjdji
;
fvec
ddndik
=
dxidik
*
cross234mag
;
fvec
ddndjk
=
dxidjk
*
cross234mag
;
fvec
ddndjl
=
cross321mag
*
dxjdjl
;
fvec
ddndil
=
cross321mag
*
dxjdil
;
fvec
dcwddn
=
fvec
::
setzero
()
-
cwnum
/
(
cwnom
*
cwnom
);
fvec
dcwdn
=
fvec
::
recip
(
cwnom
);
fvec
cw2_4
=
cw2
*
cw2
*
cw2
*
cw2
;
fvec
dvpdcw
=
c_2_5
*
Ec
*
cw2_4
*
w23
*
w21
*
w34
*
(
c_1_0
-
tspjik
)
*
(
c_1_0
-
tspijl
);
fvec
Ftmpx
=
dvpdcw
*
(
dcwdn
*
dndijx
+
dcwddn
*
ddndij
*
del23x
/
r23
);
fvec
Ftmpy
=
dvpdcw
*
(
dcwdn
*
dndijy
+
dcwddn
*
ddndij
*
del23y
/
r23
);
fvec
Ftmpz
=
dvpdcw
*
(
dcwdn
*
dndijz
+
dcwddn
*
ddndij
*
del23z
/
r23
);
fvec
fix
=
Ftmpx
;
fvec
fiy
=
Ftmpy
;
fvec
fiz
=
Ftmpz
;
fvec
fjx
=
fvec
::
setzero
()
-
Ftmpx
;
fvec
fjy
=
fvec
::
setzero
()
-
Ftmpy
;
fvec
fjz
=
fvec
::
setzero
()
-
Ftmpz
;
Ftmpx
=
dvpdcw
*
(
dcwdn
*
dndikx
+
dcwddn
*
ddndik
*
del21x
/
r21
);
Ftmpy
=
dvpdcw
*
(
dcwdn
*
dndiky
+
dcwddn
*
ddndik
*
del21y
/
r21
);
Ftmpz
=
dvpdcw
*
(
dcwdn
*
dndikz
+
dcwddn
*
ddndik
*
del21z
/
r21
);
fix
=
fix
+
Ftmpx
;
fiy
=
fiy
+
Ftmpy
;
fiz
=
fiz
+
Ftmpz
;
fvec
fkx
=
fvec
::
setzero
()
-
Ftmpx
;
fvec
fky
=
fvec
::
setzero
()
-
Ftmpy
;
fvec
fkz
=
fvec
::
setzero
()
-
Ftmpz
;
Ftmpx
=
dvpdcw
*
dcwddn
*
ddndjk
*
deljkx
/
rjk
;
Ftmpy
=
dvpdcw
*
dcwddn
*
ddndjk
*
deljky
/
rjk
;
Ftmpz
=
dvpdcw
*
dcwddn
*
ddndjk
*
deljkz
/
rjk
;
fjx
=
fjx
+
Ftmpx
;
fjy
=
fjy
+
Ftmpy
;
fjz
=
fjz
+
Ftmpz
;
fkx
=
fkx
-
Ftmpx
;
fky
=
fky
-
Ftmpy
;
fkz
=
fkz
-
Ftmpz
;
Ftmpx
=
dvpdcw
*
(
dcwdn
*
dndjlx
+
dcwddn
*
ddndjl
*
del34x
/
r34
);
Ftmpy
=
dvpdcw
*
(
dcwdn
*
dndjly
+
dcwddn
*
ddndjl
*
del34y
/
r34
);
Ftmpz
=
dvpdcw
*
(
dcwdn
*
dndjlz
+
dcwddn
*
ddndjl
*
del34z
/
r34
);
fjx
=
fjx
+
Ftmpx
;
fjy
=
fjy
+
Ftmpy
;
fjz
=
fjz
+
Ftmpz
;
fvec
flx
=
fvec
::
setzero
()
-
Ftmpx
;
fvec
fly
=
fvec
::
setzero
()
-
Ftmpy
;
fvec
flz
=
fvec
::
setzero
()
-
Ftmpz
;
Ftmpx
=
dvpdcw
*
dcwddn
*
ddndil
*
delilx
/
ril
;
Ftmpy
=
dvpdcw
*
dcwddn
*
ddndil
*
delily
/
ril
;
Ftmpz
=
dvpdcw
*
dcwddn
*
ddndil
*
delilz
/
ril
;
fix
=
fix
+
Ftmpx
;
fiy
=
fiy
+
Ftmpy
;
fiz
=
fiz
+
Ftmpz
;
flx
=
flx
-
Ftmpx
;
fly
=
fly
-
Ftmpy
;
flz
=
flz
-
Ftmpz
;
// coordination forces
fvec
fpair
=
Vtors
*
dw21
*
w23
*
w34
*
(
c_1_0
-
tspjik
)
*
(
c_1_0
-
tspijl
)
/
r21
;
fix
=
fix
-
del21x
*
fpair
;
fiy
=
fiy
-
del21y
*
fpair
;
fiz
=
fiz
-
del21z
*
fpair
;
fkx
=
fkx
+
del21x
*
fpair
;
fky
=
fky
+
del21y
*
fpair
;
fkz
=
fkz
+
del21z
*
fpair
;
fpair
=
Vtors
*
w21
*
dw23
*
w34
*
(
c_1_0
-
tspjik
)
*
(
c_1_0
-
tspijl
)
/
r23
;
fix
=
fix
-
del23x
*
fpair
;
fiy
=
fiy
-
del23y
*
fpair
;
fiz
=
fiz
-
del23z
*
fpair
;
fjx
=
fjx
+
del23x
*
fpair
;
fjy
=
fjy
+
del23y
*
fpair
;
fjz
=
fjz
+
del23z
*
fpair
;
fpair
=
Vtors
*
w21
*
w23
*
dw34
*
(
c_1_0
-
tspjik
)
*
(
c_1_0
-
tspijl
)
/
r34
;
fjx
=
fjx
-
del34x
*
fpair
;
fjy
=
fjy
-
del34y
*
fpair
;
fjz
=
fjz
-
del34z
*
fpair
;
flx
=
flx
+
del34x
*
fpair
;
fly
=
fly
+
del34y
*
fpair
;
flz
=
flz
+
del34z
*
fpair
;
// additional cut off function forces
fvec
fcpc
=
fvec
::
setzero
()
-
Vtors
*
w21
*
w23
*
w34
*
dtsjik
*
(
c_1_0
-
tspijl
);
fpair
=
fcpc
*
dcidij
/
rij
;
fix
=
fix
+
fpair
*
del23x
;
fiy
=
fiy
+
fpair
*
del23y
;
fiz
=
fiz
+
fpair
*
del23z
;
fjx
=
fjx
-
fpair
*
del23x
;
fjy
=
fjy
-
fpair
*
del23y
;
fjz
=
fjz
-
fpair
*
del23z
;
fpair
=
fcpc
*
dcidik
/
rik
;
fix
=
fix
+
fpair
*
del21x
;
fiy
=
fiy
+
fpair
*
del21y
;
fiz
=
fiz
+
fpair
*
del21z
;
fkx
=
fkx
-
fpair
*
del21x
;
fky
=
fky
-
fpair
*
del21y
;
fkz
=
fkz
-
fpair
*
del21z
;
fpair
=
fcpc
*
dcidjk
/
rjk
;
fjx
=
fjx
+
fpair
*
deljkx
;
fjy
=
fjy
+
fpair
*
deljky
;
fjz
=
fjz
+
fpair
*
deljkz
;
fkx
=
fkx
-
fpair
*
deljkx
;
fky
=
fky
-
fpair
*
deljky
;
fkz
=
fkz
-
fpair
*
deljkz
;
fcpc
=
fvec
::
setzero
()
-
Vtors
*
w21
*
w23
*
w34
*
(
c_1_0
-
tspjik
)
*
dtsijl
;
fpair
=
fcpc
*
dcjdji
/
rij
;
fix
=
fix
+
fpair
*
del23x
;
fiy
=
fiy
+
fpair
*
del23y
;
fiz
=
fiz
+
fpair
*
del23z
;
fjx
=
fjx
-
fpair
*
del23x
;
fjy
=
fjy
-
fpair
*
del23y
;
fjz
=
fjz
-
fpair
*
del23z
;
fpair
=
fcpc
*
dcjdjl
/
rjl
;
fjx
=
fjx
+
fpair
*
del34x
;
fjy
=
fjy
+
fpair
*
del34y
;
fjz
=
fjz
+
fpair
*
del34z
;
flx
=
flx
-
fpair
*
del34x
;
fly
=
fly
-
fpair
*
del34y
;
flz
=
flz
-
fpair
*
del34z
;
fpair
=
fcpc
*
dcjdil
/
ril
;
fix
=
fix
+
fpair
*
delilx
;
fiy
=
fiy
+
fpair
*
delily
;
fiz
=
fiz
+
fpair
*
delilz
;
flx
=
flx
-
fpair
*
delilx
;
fly
=
fly
-
fpair
*
delily
;
flz
=
flz
-
fpair
*
delilz
;
// sum per-atom forces into atom force array
i_data
->
force_i_x
=
fvec
::
mask_add
(
i_data
->
force_i_x
,
mask_inner_0
,
i_data
->
force_i_x
,
fix
);
i_data
->
force_i_y
=
fvec
::
mask_add
(
i_data
->
force_i_y
,
mask_inner_0
,
i_data
->
force_i_y
,
fiy
);
i_data
->
force_i_z
=
fvec
::
mask_add
(
i_data
->
force_i_z
,
mask_inner_0
,
i_data
->
force_i_z
,
fiz
);
i_data
->
force_j_x
=
fvec
::
mask_add
(
i_data
->
force_j_x
,
mask_inner_0
,
i_data
->
force_j_x
,
fjx
);
i_data
->
force_j_y
=
fvec
::
mask_add
(
i_data
->
force_j_y
,
mask_inner_0
,
i_data
->
force_j_y
,
fjy
);
i_data
->
force_j_z
=
fvec
::
mask_add
(
i_data
->
force_j_z
,
mask_inner_0
,
i_data
->
force_j_z
,
fjz
);
i_data
->
force_k_x_buf
[
buf_idx_i
]
=
fvec
::
mask_add
(
i_data
->
force_k_x_buf
[
buf_idx_i
],
mask_inner_0
,
i_data
->
force_k_x_buf
[
buf_idx_i
],
fkx
);
i_data
->
force_k_y_buf
[
buf_idx_i
]
=
fvec
::
mask_add
(
i_data
->
force_k_y_buf
[
buf_idx_i
],
mask_inner_0
,
i_data
->
force_k_y_buf
[
buf_idx_i
],
fky
);
i_data
->
force_k_z_buf
[
buf_idx_i
]
=
fvec
::
mask_add
(
i_data
->
force_k_z_buf
[
buf_idx_i
],
mask_inner_0
,
i_data
->
force_k_z_buf
[
buf_idx_i
],
fkz
);
j_data
->
force_k_x_buf
[
buf_idx_j
]
=
fvec
::
mask_add
(
j_data
->
force_k_x_buf
[
buf_idx_j
],
mask_inner_0
,
j_data
->
force_k_x_buf
[
buf_idx_j
],
flx
);
j_data
->
force_k_y_buf
[
buf_idx_j
]
=
fvec
::
mask_add
(
j_data
->
force_k_y_buf
[
buf_idx_j
],
mask_inner_0
,
j_data
->
force_k_y_buf
[
buf_idx_j
],
fly
);
j_data
->
force_k_z_buf
[
buf_idx_j
]
=
fvec
::
mask_add
(
j_data
->
force_k_z_buf
[
buf_idx_j
],
mask_inner_0
,
j_data
->
force_k_z_buf
[
buf_idx_j
],
flz
);
}
}
}
/*
* Processes VL elements of the same type itype/jtype for REBO and TORSION
* interactions. This allows us to reuse the aut_frebo_data buffes in the
* torsion calculaltion.
*/
static
void
aut_frebo_batch_of_kind
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
torflag
,
int
itype
,
int
jtype
,
int
*
i_buf
,
int
*
j_buf
)
{
{
// jump-scope for exceed_limits
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
tag
=
ka
->
tag
;
int
*
map
=
ka
->
map
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
flt_t
rcminij
=
ka
->
params
.
rcmin
[
itype
][
jtype
];
flt_t
rcmaxij
=
ka
->
params
.
rcmax
[
itype
][
jtype
];
flt_t
Qij
=
ka
->
params
.
Q
[
itype
][
jtype
];
flt_t
Aij
=
ka
->
params
.
A
[
itype
][
jtype
];
flt_t
alphaij
=
ka
->
params
.
alpha
[
itype
][
jtype
];
fvec
vrcminij
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
jtype
]);
fvec
vrcmaxij
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
jtype
]);
fvec
vQij
=
fvec
::
set1
(
ka
->
params
.
Q
[
itype
][
jtype
]);
fvec
vAij
=
fvec
::
set1
(
ka
->
params
.
A
[
itype
][
jtype
]);
fvec
malphaij
=
fvec
::
set1
(
-
ka
->
params
.
alpha
[
itype
][
jtype
]);
fvec
c_1_0
=
fvec
::
set1
(
1
);
fvec
c_0_5
=
fvec
::
set1
(
0.5
);
fvec
c_TOL
=
fvec
::
set1
(
1e-9
);
struct
aut_frebo_data
i_data
,
j_data
;
fvec
evdwl_vacc
=
fvec
::
setzero
();
ivec
vi
=
ivec
::
maskz_loadu
(
bvec
::
full
(),
i_buf
);
int
tmp
;
ivec
vj
=
ivec
::
maskz_loadu
(
bvec
::
full
(),
j_buf
);
fvec
x_i
,
y_i
,
z_i
;
fvec
x_j
,
y_j
,
z_j
;
aut_loadatoms_vec_notype
(
x
,
vi
,
&
x_i
,
&
y_i
,
&
z_i
);
aut_loadatoms_vec_notype
(
x
,
vj
,
&
x_j
,
&
y_j
,
&
z_j
);
i_data
.
x_i
=
x_i
;
i_data
.
y_i
=
y_i
;
i_data
.
z_i
=
z_i
;
i_data
.
x_j
=
x_j
;
i_data
.
y_j
=
y_j
;
i_data
.
z_j
=
z_j
;
j_data
.
x_i
=
x_j
;
j_data
.
y_i
=
y_j
;
j_data
.
z_i
=
z_j
;
j_data
.
x_j
=
x_i
;
j_data
.
y_j
=
y_i
;
j_data
.
z_j
=
z_i
;
fvec
delx
=
x_i
-
x_j
;
fvec
dely
=
y_i
-
y_j
;
fvec
delz
=
z_i
-
z_j
;
fvec
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
fvec
rij
=
fvec
::
sqrt
(
rsq
);
fvec
dwij
;
fvec
wij
=
aut_Sp_deriv
(
rij
,
vrcminij
,
vrcmaxij
,
&
dwij
);
fvec
exp_alphar
=
fvec
::
exp
(
malphaij
*
rij
);
fvec
Qij_over_rij
=
vQij
/
rij
;
fvec
Qij_over_rsq
=
vQij
/
rsq
;
fvec
VR_by_wij
=
(
c_1_0
+
Qij_over_rij
)
*
vAij
*
exp_alphar
;
fvec
VR
=
wij
*
VR_by_wij
;
fvec
pre
=
wij
*
vAij
*
exp_alphar
;
fvec
dVRdi
=
pre
*
(
malphaij
+
malphaij
*
Qij_over_rij
-
Qij_over_rsq
);
dVRdi
=
dVRdi
+
VR_by_wij
*
dwij
;
fvec
VA_by_wij
=
fvec
::
setzero
();
fvec
dVA
=
fvec
::
setzero
();
int
k
;
for
(
k
=
0
;
k
<
3
;
k
++
)
{
fvec
mBIJc
=
fvec
::
set1
(
-
ka
->
params
.
BIJc
[
itype
][
jtype
][
k
]);
fvec
mBetaij
=
fvec
::
set1
(
-
ka
->
params
.
Beta
[
itype
][
jtype
][
k
]);
fvec
term
=
mBIJc
*
fvec
::
exp
(
mBetaij
*
rij
);
VA_by_wij
=
VA_by_wij
+
term
;
dVA
=
dVA
+
mBetaij
*
wij
*
term
;
}
dVA
=
dVA
+
dwij
*
VA_by_wij
;
fvec
VA
=
wij
*
VA_by_wij
;
bvec
tol_check
=
fvec
::
cmplt
(
wij
,
c_TOL
);
VA
=
fvec
::
mask_blend
(
tol_check
,
VA
,
fvec
::
setzero
());
dVA
=
fvec
::
mask_blend
(
tol_check
,
dVA
,
fvec
::
setzero
());
VR
=
fvec
::
mask_blend
(
tol_check
,
VR
,
fvec
::
setzero
());
dVRdi
=
fvec
::
mask_blend
(
tol_check
,
dVRdi
,
fvec
::
setzero
());
fvec
nHi
=
fvec
::
gather
(
vi
,
ka
->
nH
,
sizeof
(
flt_t
));
fvec
nCi
=
fvec
::
gather
(
vi
,
ka
->
nC
,
sizeof
(
flt_t
));
fvec
nHj
=
fvec
::
gather
(
vj
,
ka
->
nH
,
sizeof
(
flt_t
));
fvec
nCj
=
fvec
::
gather
(
vj
,
ka
->
nC
,
sizeof
(
flt_t
));
fvec
Nij
=
(
nHi
+
nCi
)
-
wij
;
fvec
Nji
=
(
nHj
+
nCj
)
-
wij
;
i_data
.
nHi
=
nHi
;
i_data
.
nCi
=
nCi
;
j_data
.
nHi
=
nHj
;
j_data
.
nCi
=
nCj
;
fvec
fij
[
3
],
fji
[
3
];
fij
[
0
]
=
fvec
::
setzero
();
fij
[
1
]
=
fvec
::
setzero
();
fij
[
2
]
=
fvec
::
setzero
();
fji
[
0
]
=
fvec
::
setzero
();
fji
[
1
]
=
fvec
::
setzero
();
fji
[
2
]
=
fvec
::
setzero
();
fvec
NconjtmpI
;
fvec
pij
=
aut_frebo_pij_pd_2
(
ka
,
&
i_data
,
itype
,
jtype
,
vi
,
vj
,
delx
,
dely
,
delz
,
rij
,
wij
,
VA
,
&
NconjtmpI
,
fij
);
if
(
i_data
.
buf_len
<
0
)
goto
exceed_limits
;
fvec
NconjtmpJ
;
fvec
rjix
=
fvec
::
setzero
()
-
delx
;
fvec
rjiy
=
fvec
::
setzero
()
-
dely
;
fvec
rjiz
=
fvec
::
setzero
()
-
delz
;
fvec
pji
=
aut_frebo_pij_pd_2
(
ka
,
&
j_data
,
jtype
,
itype
,
vj
,
vi
,
rjix
,
rjiy
,
rjiz
,
rij
,
wij
,
VA
,
&
NconjtmpJ
,
fji
);
fij
[
0
]
=
fij
[
0
]
-
fji
[
0
];
fij
[
1
]
=
fij
[
1
]
-
fji
[
1
];
fij
[
2
]
=
fij
[
2
]
-
fji
[
2
];
if
(
j_data
.
buf_len
<
0
)
goto
exceed_limits
;
if
(
torflag
&&
itype
==
0
&&
jtype
==
0
)
aut_torsion_vec
(
ka
,
&
i_data
,
&
j_data
,
vi
,
vj
,
wij
,
dwij
);
fvec
Nijconj
=
c_1_0
+
NconjtmpI
*
NconjtmpI
+
NconjtmpJ
*
NconjtmpJ
;
fvec
dN3
[
3
];
fvec
pi_rc
=
aut_frebo_pi_rc_pd
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3
);
aut_frebo_N_spline_force
(
ka
,
&
i_data
,
itype
,
jtype
,
vi
,
vj
,
VA
,
dN3
[
0
],
dN3
[
2
],
NconjtmpI
);
aut_frebo_N_spline_force
(
ka
,
&
j_data
,
jtype
,
itype
,
vj
,
vi
,
VA
,
dN3
[
1
],
dN3
[
2
],
NconjtmpJ
);
fvec
pi_dh
=
aut_frebo_pi_dh
(
ka
,
&
i_data
,
&
j_data
,
itype
,
jtype
,
vi
,
vj
,
delx
,
dely
,
delz
,
rij
,
VA
,
Nij
,
Nji
,
Nijconj
,
NconjtmpI
,
NconjtmpJ
,
fij
);
fvec
bij
=
c_0_5
*
(
pij
+
pji
)
+
pi_rc
+
pi_dh
;
fvec
dVAdi
=
bij
*
dVA
;
fvec
fpair
=
(
dVAdi
+
dVRdi
)
*
fvec
::
recip
(
rij
);
fvec
result_f_j_x
=
fpair
*
delx
-
fij
[
0
];
fvec
result_f_j_y
=
fpair
*
dely
-
fij
[
1
];
fvec
result_f_j_z
=
fpair
*
delz
-
fij
[
2
];
fvec
result_f_i_x
=
fvec
::
setzero
()
-
result_f_j_x
;
fvec
result_f_i_y
=
fvec
::
setzero
()
-
result_f_j_y
;
fvec
result_f_i_z
=
fvec
::
setzero
()
-
result_f_j_z
;
fvec
evdwl
=
VR
+
bij
*
VA
;
evdwl_vacc
=
evdwl_vacc
+
evdwl
;
aut_frebo_data_writeback
(
ka
,
&
i_data
);
aut_frebo_data_writeback
(
ka
,
&
j_data
);
flt_t
fi_x_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fi_y_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fi_z_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
fi_i_buf
[
ivec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fj_x_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fj_y_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fj_z_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
fj_j_buf
[
ivec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
evdwl_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
result_f_i_x
=
i_data
.
force_i_x
+
result_f_i_x
;
result_f_i_y
=
i_data
.
force_i_y
+
result_f_i_y
;
result_f_i_z
=
i_data
.
force_i_z
+
result_f_i_z
;
result_f_j_x
=
i_data
.
force_j_x
+
result_f_j_x
;
result_f_j_y
=
i_data
.
force_j_y
+
result_f_j_y
;
result_f_j_z
=
i_data
.
force_j_z
+
result_f_j_z
;
result_f_i_x
=
j_data
.
force_j_x
+
result_f_i_x
;
result_f_i_y
=
j_data
.
force_j_y
+
result_f_i_y
;
result_f_i_z
=
j_data
.
force_j_z
+
result_f_i_z
;
result_f_j_x
=
j_data
.
force_i_x
+
result_f_j_x
;
result_f_j_y
=
j_data
.
force_i_y
+
result_f_j_y
;
result_f_j_z
=
j_data
.
force_i_z
+
result_f_j_z
;
fvec
::
store
(
fi_x_buf
,
result_f_i_x
);
fvec
::
store
(
fi_y_buf
,
result_f_i_y
);
fvec
::
store
(
fi_z_buf
,
result_f_i_z
);
ivec
::
store
(
fi_i_buf
,
vi
);
fvec
::
store
(
fj_x_buf
,
result_f_j_x
);
fvec
::
store
(
fj_y_buf
,
result_f_j_y
);
fvec
::
store
(
fj_z_buf
,
result_f_j_z
);
ivec
::
store
(
fj_j_buf
,
vj
);
fvec
::
store
(
evdwl_buf
,
evdwl
);
int
lane
;
for
(
lane
=
0
;
lane
<
fvec
::
VL
;
lane
++
)
{
int
ii
=
fi_i_buf
[
lane
];
result_f
[
ii
].
x
+=
fi_x_buf
[
lane
];
result_f
[
ii
].
y
+=
fi_y_buf
[
lane
];
result_f
[
ii
].
z
+=
fi_z_buf
[
lane
];
result_f
[
ii
].
w
+=
0.5
*
evdwl_buf
[
lane
];
int
jj
=
fj_j_buf
[
lane
];
result_f
[
jj
].
x
+=
fj_x_buf
[
lane
];
result_f
[
jj
].
y
+=
fj_y_buf
[
lane
];
result_f
[
jj
].
z
+=
fj_z_buf
[
lane
];
result_f
[
jj
].
w
+=
0.5
*
evdwl_buf
[
lane
];
}
ka
->
result_eng
+=
fvec
::
reduce_add
(
evdwl_vacc
);
return
;
}
exceed_limits:
for
(
int
l
=
0
;
l
<
fvec
::
VL
;
l
++
)
{
int
i
=
i_buf
[
l
];
int
j
=
j_buf
[
l
];
ref_frebo_single_interaction
(
ka
,
i
,
j
);
if
(
torflag
&&
itype
==
0
&&
jtype
==
0
)
ref_torsion_single_interaction
(
ka
,
i
,
j
);
}
}
/*
Orders the interactions by itype and jtype and passes chunks to the above
method.
*/
static
void
aut_frebo
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
torflag
)
{
AtomAIREBOT
<
flt_t
>
*
_noalias
x
=
ka
->
x
;
int
*
_noalias
tag
=
ka
->
tag
;
int
*
_noalias
map
=
ka
->
map
;
int
i_buf
[
2
][
2
][
fvec
::
VL
];
int
j_buf
[
2
][
2
][
fvec
::
VL
];
int
n_buf
[
2
][
2
]
=
{
0
};
for
(
int
i
=
ka
->
frebo_from_atom
;
i
<
ka
->
frebo_to_atom
;
i
++
)
{
int
itag
=
tag
[
i
];
int
itype
=
map
[
x
[
i
].
w
];
flt_t
x_i
=
x
[
i
].
x
;
flt_t
y_i
=
x
[
i
].
y
;
flt_t
z_i
=
x
[
i
].
z
;
int
*
neighs
=
ka
->
neigh_rebo
.
entries
+
ka
->
neigh_rebo
.
offset
[
i
];
int
jnum
=
ka
->
neigh_rebo
.
num
[
i
];
for
(
int
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
int
j
=
neighs
[
jj
];
int
jtag
=
tag
[
j
];
if
(
itag
>
jtag
)
{
if
(((
itag
+
jtag
)
&
1
)
==
0
)
continue
;
}
else
if
(
itag
<
jtag
)
{
if
(((
itag
+
jtag
)
&
1
)
==
1
)
continue
;
}
else
{
if
(
x
[
j
].
z
<
z_i
)
continue
;
if
(
x
[
j
].
z
==
z_i
&&
x
[
j
].
y
<
y_i
)
continue
;
if
(
x
[
j
].
z
==
z_i
&&
x
[
j
].
y
==
y_i
&&
x
[
j
].
x
<
x_i
)
continue
;
}
int
jtype
=
map
[
x
[
j
].
w
];
int
ins
=
n_buf
[
itype
][
jtype
];
i_buf
[
itype
][
jtype
][
ins
]
=
i
;
j_buf
[
itype
][
jtype
][
ins
]
=
j
;
n_buf
[
itype
][
jtype
]
+=
1
;
if
(
n_buf
[
itype
][
jtype
]
==
fvec
::
VL
)
{
aut_frebo_batch_of_kind
(
ka
,
torflag
,
itype
,
jtype
,
i_buf
[
itype
][
jtype
],
j_buf
[
itype
][
jtype
]);
n_buf
[
itype
][
jtype
]
=
0
;
}
}
}
for
(
int
itype
=
0
;
itype
<
2
;
itype
++
)
{
for
(
int
jtype
=
0
;
jtype
<
2
;
jtype
++
)
{
for
(
int
l
=
0
;
l
<
n_buf
[
itype
][
jtype
];
l
++
)
{
int
i
=
i_buf
[
itype
][
jtype
][
l
];
int
j
=
j_buf
[
itype
][
jtype
][
l
];
ref_frebo_single_interaction
(
ka
,
i
,
j
);
if
(
torflag
&&
itype
==
0
&&
jtype
==
0
)
ref_torsion_single_interaction
(
ka
,
i
,
j
);
}
}
}
}
/*
* Apply paths in scalar fashion, not crucial for performance.
*/
static
void
aut_airebo_lj_force_path
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
bvec
mask
,
fvec
dC
,
LennardJonesPathAIREBOT
<
flt_t
>
path
[
fvec
::
VL
])
{
for
(
int
i
=
0
;
i
<
fvec
::
VL
;
i
++
)
{
if
(
bvec
::
test_at
(
mask
,
i
))
{
ref_lennard_jones_force_path
(
ka
,
fvec
::
at
(
dC
,
i
),
&
path
[
i
]);
}
}
}
/*
* Hash-Map for efficient calculation of C_ij.
* Can have up to ITEMS entries with associated paths, as well as
* 1024 entries. Open addressing, invalidation by using a different i.
* Only needs to be reset once per timestep.
*/
static
const
int
OPT_TEST_PATH_SIZE
=
1024
;
static
const
int
OPT_TEST_PATH_ITEMS
=
128
;
struct
aut_airebo_lj_test_path_result_data
{
LennardJonesPathAIREBOT
<
flt_t
>
testpath
[
OPT_TEST_PATH_ITEMS
];
int
i
[
OPT_TEST_PATH_SIZE
];
int
j
[
OPT_TEST_PATH_SIZE
];
flt_t
cij
[
OPT_TEST_PATH_SIZE
];
int
testpath_idx
[
OPT_TEST_PATH_SIZE
];
};
static
const
unsigned
int
OPT_TEST_PATH_HASH
=
2654435761
;
static
int
aut_lj_tap_hash_fn
(
int
j
,
int
attempt
)
{
uint32_t
result
=
j
;
result
*=
(
uint32_t
)
OPT_TEST_PATH_HASH
;
result
+=
(
uint32_t
)
attempt
;
result
%=
(
uint32_t
)
OPT_TEST_PATH_SIZE
;
return
result
;
}
static
ivec
aut_airebo_lj_tap_hash_fn_vec
(
ivec
val
,
ivec
attempt
)
{
const
ivec
golden
=
ivec
::
set1
(
OPT_TEST_PATH_HASH
);
const
ivec
mask
=
ivec
::
set1
(
OPT_TEST_PATH_SIZE
-
1
);
ivec
a
=
ivec
::
mullo
(
golden
,
val
);
ivec
b
=
a
+
attempt
;
ivec
c
=
ivec
::
the_and
(
b
,
mask
);
return
c
;
}
/*
* Enter all those (potential) neighbors of i (including 2nd and 3rd degree)
* into the hash-map. There is no good way to vectorize this, and it does not
* seem time-critical.
*/
static
bool
aut_airebo_lj_test_all_paths
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
i
,
struct
aut_airebo_lj_test_path_result_data
*
result
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
map
=
ka
->
map
;
flt_t
(
*
rcmin
)[
2
]
=
&
ka
->
params
.
rcmin
[
0
];
flt_t
(
*
rcmax
)[
2
]
=
&
ka
->
params
.
rcmax
[
0
];
flt_t
rcminsq
[
2
][
2
];
rcminsq
[
0
][
0
]
=
rcmin
[
0
][
0
]
*
rcmin
[
0
][
0
];
rcminsq
[
0
][
1
]
=
rcmin
[
0
][
1
]
*
rcmin
[
0
][
1
];
rcminsq
[
1
][
0
]
=
rcmin
[
1
][
0
]
*
rcmin
[
1
][
0
];
rcminsq
[
1
][
1
]
=
rcmin
[
1
][
1
]
*
rcmin
[
1
][
1
];
int
*
neighs_i
=
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
i
]];
int
itype
=
map
[
x
[
i
].
w
];
int
path_insert_pos
=
0
;
for
(
int
jj
=
0
;
jj
<
ka
->
neigh_rebo
.
num
[
i
];
jj
++
)
{
int
j
=
neighs_i
[
jj
];
int
jtype
=
map
[
x
[
j
].
w
];
flt_t
dijx
=
x
[
j
].
x
-
x
[
i
].
x
;
flt_t
dijy
=
x
[
j
].
y
-
x
[
i
].
y
;
flt_t
dijz
=
x
[
j
].
z
-
x
[
i
].
z
;
flt_t
rijsq
=
dijx
*
dijx
+
dijy
*
dijy
+
dijz
*
dijz
;
flt_t
wj
=
1
,
dwj
=
0
;
flt_t
rij
=
0
;
if
(
rijsq
>=
rcminsq
[
itype
][
jtype
])
{
rij
=
overloaded
::
sqrt
(
rijsq
);
wj
=
Sp
(
rij
,
rcmin
[
itype
][
jtype
],
rcmax
[
itype
][
jtype
],
&
dwj
);
}
int
attempt
=
0
;
int
start_hash_slot
=
aut_lj_tap_hash_fn
(
j
,
attempt
);
int
hash_slot
=
start_hash_slot
;
while
(
result
->
i
[
hash_slot
]
==
i
&&
result
->
j
[
hash_slot
]
!=
j
&&
attempt
<
OPT_TEST_PATH_SIZE
)
{
hash_slot
=
aut_lj_tap_hash_fn
(
j
,
++
attempt
);
}
if
(
attempt
>=
OPT_TEST_PATH_SIZE
)
goto
exceed_limits
;
bool
init_slot
=
result
->
i
[
hash_slot
]
!=
i
;
if
(
init_slot
||
(
1
-
wj
<
result
->
cij
[
hash_slot
]))
{
result
->
i
[
hash_slot
]
=
i
;
result
->
j
[
hash_slot
]
=
j
;
result
->
cij
[
hash_slot
]
=
1
-
wj
;
if
(
wj
!=
1.0
)
{
if
(
path_insert_pos
>=
OPT_TEST_PATH_ITEMS
)
goto
exceed_limits
;
result
->
testpath_idx
[
hash_slot
]
=
path_insert_pos
;
LennardJonesPathAIREBOT
<
flt_t
>
*
path
=
&
result
->
testpath
[
path_insert_pos
++
];
path
->
num
=
2
;
path
->
del
[
0
].
x
=
dijx
;
path
->
del
[
0
].
y
=
dijy
;
path
->
del
[
0
].
z
=
dijz
;
if
(
rij
==
0
)
rij
=
sqrt
(
rijsq
);
path
->
r
[
0
]
=
rij
;
path
->
w
[
0
]
=
wj
;
path
->
dw
[
0
]
=
dwj
;
path
->
idx
[
0
]
=
i
;
path
->
idx
[
1
]
=
j
;
}
}
int
*
neighs_j
=
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
j
]];
for
(
int
kk
=
0
;
kk
<
ka
->
neigh_rebo
.
num
[
j
];
kk
++
)
{
int
k
=
neighs_j
[
kk
];
if
(
k
==
i
)
continue
;
int
ktype
=
map
[
x
[
k
].
w
];
flt_t
djkx
=
x
[
k
].
x
-
x
[
j
].
x
;
flt_t
djky
=
x
[
k
].
y
-
x
[
j
].
y
;
flt_t
djkz
=
x
[
k
].
z
-
x
[
j
].
z
;
flt_t
rjksq
=
djkx
*
djkx
+
djky
*
djky
+
djkz
*
djkz
;
flt_t
wk
=
1
,
dwk
=
0
;
flt_t
rjk
=
0
;
if
(
rjksq
>=
rcminsq
[
jtype
][
ktype
])
{
rjk
=
overloaded
::
sqrt
(
rjksq
);
wk
=
Sp
(
rjk
,
rcmin
[
jtype
][
ktype
],
rcmax
[
jtype
][
ktype
],
&
dwk
);
}
int
attempt
=
0
;
int
start_hash_slot
=
aut_lj_tap_hash_fn
(
k
,
attempt
);
int
hash_slot
=
start_hash_slot
;
while
(
result
->
i
[
hash_slot
]
==
i
&&
result
->
j
[
hash_slot
]
!=
k
&&
attempt
<
OPT_TEST_PATH_SIZE
)
{
hash_slot
=
aut_lj_tap_hash_fn
(
k
,
++
attempt
);
}
if
(
attempt
>=
OPT_TEST_PATH_SIZE
)
goto
exceed_limits
;
bool
init_slot
=
result
->
i
[
hash_slot
]
!=
i
;
if
(
init_slot
||
(
1
-
wj
*
wk
<
result
->
cij
[
hash_slot
]))
{
result
->
i
[
hash_slot
]
=
i
;
result
->
j
[
hash_slot
]
=
k
;
result
->
cij
[
hash_slot
]
=
1
-
wj
*
wk
;
if
(
wj
*
wk
!=
1.0
)
{
if
(
path_insert_pos
>=
OPT_TEST_PATH_ITEMS
)
goto
exceed_limits
;
result
->
testpath_idx
[
hash_slot
]
=
path_insert_pos
;
LennardJonesPathAIREBOT
<
flt_t
>
*
path
=
&
result
->
testpath
[
path_insert_pos
++
];
path
->
num
=
3
;
path
->
del
[
0
].
x
=
dijx
;
path
->
del
[
0
].
y
=
dijy
;
path
->
del
[
0
].
z
=
dijz
;
if
(
rij
==
0
)
rij
=
sqrt
(
rijsq
);
path
->
r
[
0
]
=
rij
;
path
->
del
[
1
].
x
=
djkx
;
path
->
del
[
1
].
y
=
djky
;
path
->
del
[
1
].
z
=
djkz
;
if
(
rjk
==
0
)
rjk
=
sqrt
(
rjksq
);
path
->
r
[
1
]
=
rjk
;
path
->
w
[
0
]
=
wj
;
path
->
dw
[
0
]
=
dwj
;
path
->
w
[
1
]
=
wk
;
path
->
dw
[
1
]
=
dwk
;
path
->
idx
[
0
]
=
i
;
path
->
idx
[
1
]
=
j
;
path
->
idx
[
2
]
=
k
;
}
}
int
*
neighs_k
=
&
ka
->
neigh_rebo
.
entries
[
ka
->
neigh_rebo
.
offset
[
k
]];
for
(
int
ll
=
0
;
ll
<
ka
->
neigh_rebo
.
num
[
k
];
ll
++
)
{
int
l
=
neighs_k
[
ll
];
if
((
l
==
i
)
||
(
l
==
j
))
continue
;
int
ltype
=
map
[
x
[
l
].
w
];
flt_t
dklx
=
x
[
l
].
x
-
x
[
k
].
x
;
flt_t
dkly
=
x
[
l
].
y
-
x
[
k
].
y
;
flt_t
dklz
=
x
[
l
].
z
-
x
[
k
].
z
;
flt_t
rklsq
=
dklx
*
dklx
+
dkly
*
dkly
+
dklz
*
dklz
;
flt_t
wl
=
1
,
dwl
=
0
;
flt_t
rkl
=
0
;
if
(
rklsq
>=
rcminsq
[
ktype
][
ltype
])
{
rkl
=
overloaded
::
sqrt
(
rklsq
);
wl
=
Sp
(
rkl
,
rcmin
[
ktype
][
ltype
],
rcmax
[
ktype
][
ltype
],
&
dwl
);
}
int
attempt
=
0
;
int
start_hash_slot
=
aut_lj_tap_hash_fn
(
l
,
attempt
);
int
hash_slot
=
start_hash_slot
;
while
(
result
->
i
[
hash_slot
]
==
i
&&
result
->
j
[
hash_slot
]
!=
l
&&
attempt
<
OPT_TEST_PATH_SIZE
)
{
hash_slot
=
aut_lj_tap_hash_fn
(
l
,
++
attempt
);
}
if
(
attempt
>=
OPT_TEST_PATH_SIZE
)
goto
exceed_limits
;
bool
init_slot
=
result
->
i
[
hash_slot
]
!=
i
;
if
(
init_slot
||
(
1
-
wj
*
wk
*
wl
<
result
->
cij
[
hash_slot
]))
{
result
->
i
[
hash_slot
]
=
i
;
result
->
j
[
hash_slot
]
=
l
;
result
->
cij
[
hash_slot
]
=
1
-
wj
*
wk
*
wl
;
if
(
wj
*
wk
*
wl
!=
1.0
)
{
if
(
path_insert_pos
>=
OPT_TEST_PATH_ITEMS
)
goto
exceed_limits
;
result
->
testpath_idx
[
hash_slot
]
=
path_insert_pos
;
LennardJonesPathAIREBOT
<
flt_t
>
*
path
=
&
result
->
testpath
[
path_insert_pos
++
];
path
->
num
=
4
;
path
->
del
[
0
].
x
=
dijx
;
path
->
del
[
0
].
y
=
dijy
;
path
->
del
[
0
].
z
=
dijz
;
if
(
rij
==
0
)
rij
=
sqrt
(
rijsq
);
path
->
r
[
0
]
=
rij
;
path
->
del
[
1
].
x
=
djkx
;
path
->
del
[
1
].
y
=
djky
;
path
->
del
[
1
].
z
=
djkz
;
if
(
rjk
==
0
)
rjk
=
sqrt
(
rjksq
);
path
->
r
[
1
]
=
rjk
;
path
->
del
[
2
].
x
=
dklx
;
path
->
del
[
2
].
y
=
dkly
;
path
->
del
[
2
].
z
=
dklz
;
if
(
rkl
==
0
)
rkl
=
sqrt
(
rklsq
);
path
->
r
[
2
]
=
rkl
;
path
->
w
[
0
]
=
wj
;
path
->
dw
[
0
]
=
dwj
;
path
->
w
[
1
]
=
wk
;
path
->
dw
[
1
]
=
dwk
;
path
->
w
[
2
]
=
wl
;
path
->
dw
[
2
]
=
dwl
;
path
->
idx
[
0
]
=
i
;
path
->
idx
[
1
]
=
j
;
path
->
idx
[
2
]
=
k
;
path
->
idx
[
3
]
=
l
;
}
}
}
}
}
return
true
;
exceed_limits:
return
false
;
}
/*
* Attempt to look up an element in the hash-map.
*/
static
fvec
aut_airebo_lj_tap_test_path
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
struct
aut_airebo_lj_test_path_result_data
*
test_path_result
,
bvec
need_search
,
ivec
i_bc
,
ivec
j
,
LennardJonesPathAIREBOT
<
flt_t
>
path
[
fvec
::
VL
]
)
{
const
ivec
c_i1
=
ivec
::
set1
(
1
);
fvec
cij
=
fvec
::
set1
(
1.0
);
// first round: hash all j
// lookup i/j in hash list.
// if i matches and j matches: congrats
// if i matches and j does not: look up attempts
// if attempts > current_attempts:
// do another round of hashing
// for all those found:
// fill in the path
// -----------------------------------------------
// find all the correct hash slots, and a mask of where found.
ivec
attempt
=
ivec
::
setzero
();
ivec
hash_slot
=
aut_airebo_lj_tap_hash_fn_vec
(
j
,
attempt
);
ivec
lookup_i
=
ivec
::
mask_gather
(
ivec
::
undefined
(),
need_search
,
hash_slot
,
&
test_path_result
->
i
[
0
],
sizeof
(
int
));
bvec
correct_i
=
ivec
::
mask_cmpeq
(
need_search
,
lookup_i
,
i_bc
);
ivec
lookup_j
=
ivec
::
mask_gather
(
ivec
::
undefined
(),
correct_i
,
hash_slot
,
&
test_path_result
->
j
[
0
],
sizeof
(
int
));
bvec
found_items
=
ivec
::
mask_cmpeq
(
correct_i
,
lookup_j
,
j
);
bvec
another_attempt
=
correct_i
&
~
found_items
;
while
(
bvec
::
test_any_set
(
another_attempt
))
{
attempt
=
ivec
::
mask_add
(
attempt
,
another_attempt
,
attempt
,
c_i1
);
hash_slot
=
aut_airebo_lj_tap_hash_fn_vec
(
j
,
attempt
);
ivec
lookup_i_2
=
ivec
::
mask_gather
(
lookup_i
,
another_attempt
,
hash_slot
,
&
test_path_result
->
i
[
0
],
sizeof
(
int
));
lookup_i
=
lookup_i_2
;
correct_i
=
ivec
::
mask_cmpeq
(
need_search
,
lookup_i
,
i_bc
);
lookup_j
=
ivec
::
mask_gather
(
lookup_j
,
another_attempt
,
hash_slot
,
&
test_path_result
->
j
[
0
],
sizeof
(
int
));
found_items
=
ivec
::
mask_cmpeq
(
correct_i
,
lookup_j
,
j
);
another_attempt
=
correct_i
&
~
found_items
;
}
cij
=
fvec
::
mask_gather
(
cij
,
found_items
,
hash_slot
,
&
test_path_result
->
cij
[
0
],
sizeof
(
flt_t
));
bvec
need_testpath
=
fvec
::
mask_cmplt
(
found_items
,
fvec
::
setzero
(),
cij
);
if
(
bvec
::
test_any_set
(
need_testpath
))
{
for
(
int
i
=
0
;
i
<
fvec
::
VL
;
i
++
)
{
if
(
bvec
::
test_at
(
need_testpath
,
i
))
{
int
testpath_idx
=
test_path_result
->
testpath_idx
[
ivec
::
at
(
hash_slot
,
i
)];
path
[
i
]
=
test_path_result
->
testpath
[
testpath_idx
];
}
}
}
return
cij
;
}
/*
* This function calculates the Lennard-Jones interaciton for those
* elements that require a bond-order calculation.
* It is similarly structured as the aut_frebo_batch_of_kind function.
* The forces due to bondorders are calculated speculatively and later
* updated with the correct outer derivative.
*/
template
<
int
MORSEFLAG
>
static
void
aut_lj_with_bo
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
itype
,
int
jtype
,
ivec
i
,
ivec
j
,
fvec
cij
,
LennardJonesPathAIREBOT
<
flt_t
>
testpath
[
fvec
::
VL
]
)
{
{
// jump-scope for exceed_limits
AtomAIREBOT
<
flt_t
>
*
_noalias
x
=
ka
->
x
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
ivec
c_i4
=
ivec
::
set1
(
4
);
fvec
c_1_0
=
fvec
::
set1
(
1.0
);
fvec
c_2_0
=
fvec
::
set1
(
2.0
);
fvec
c_0_5
=
fvec
::
set1
(
0.5
);
fvec
x_i
,
y_i
,
z_i
;
aut_loadatoms_vec_notype
(
x
,
i
,
&
x_i
,
&
y_i
,
&
z_i
);
fvec
x_j
,
y_j
,
z_j
;
aut_loadatoms_vec_notype
(
x
,
j
,
&
x_j
,
&
y_j
,
&
z_j
);
fvec
delx
=
x_i
-
x_j
;
fvec
dely
=
y_i
-
y_j
;
fvec
delz
=
z_i
-
z_j
;
fvec
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
fvec
rij
=
fvec
::
sqrt
(
rsq
);
bvec
need_path_force
=
fvec
::
cmplt
(
cij
,
c_1_0
);
flt_t
sigcut
=
ka
->
params
.
sigcut
;
flt_t
sigmin
=
ka
->
params
.
sigmin
;
flt_t
sigma
=
ka
->
params
.
sigma
[
itype
][
jtype
];
flt_t
rljmax
=
sigcut
*
sigma
;
flt_t
rljmin
=
sigmin
*
sigma
;
fvec
p_rljmin
=
fvec
::
set1
(
rljmin
);
fvec
p_rljmax
=
fvec
::
set1
(
rljmax
);
fvec
dslw
,
slw
=
aut_Sp2_deriv
(
rij
,
p_rljmin
,
p_rljmax
,
&
dslw
);
fvec
p_lj1
=
fvec
::
set1
(
ka
->
params
.
lj1
[
itype
][
jtype
]);
fvec
p_lj2
=
fvec
::
set1
(
ka
->
params
.
lj2
[
itype
][
jtype
]);
fvec
p_lj3
=
fvec
::
set1
(
ka
->
params
.
lj3
[
itype
][
jtype
]);
fvec
p_lj4
=
fvec
::
set1
(
ka
->
params
.
lj4
[
itype
][
jtype
]);
fvec
r2inv
=
fvec
::
recip
(
rsq
);
fvec
vdw
,
dvdw
;
if
(
MORSEFLAG
)
{
fvec
exr
=
fvec
::
exp
(
fvec
::
setzero
()
-
rij
*
p_lj4
);
vdw
=
p_lj1
*
exr
*
(
p_lj2
*
exr
-
c_2_0
);
dvdw
=
p_lj3
*
exr
*
(
c_1_0
-
p_lj2
*
exr
);
}
else
{
fvec
r6inv
=
r2inv
*
r2inv
*
r2inv
;
vdw
=
r6inv
*
(
p_lj3
*
r6inv
-
p_lj4
);
fvec
r7inv
=
r6inv
*
rij
*
r2inv
;
dvdw
=
r7inv
*
(
p_lj2
-
p_lj1
*
r6inv
);
}
fvec
VLJ
=
vdw
*
slw
;
fvec
dVLJ
=
dvdw
*
slw
+
vdw
*
dslw
;
fvec
p_rcLJmin
=
fvec
::
set1
(
ka
->
params
.
rcLJmin
[
itype
][
jtype
]);
fvec
p_rcLJmax
=
fvec
::
set1
(
ka
->
params
.
rcLJmax
[
itype
][
jtype
]);
fvec
dStr
,
Str
=
aut_Sp2_deriv
(
rij
,
p_rcLJmin
,
p_rcLJmax
,
&
dStr
);
fvec
VA
=
cij
*
VLJ
*
Str
;
fvec
fij
[
3
],
fji
[
3
];
fij
[
0
]
=
fvec
::
setzero
();
fij
[
1
]
=
fvec
::
setzero
();
fij
[
2
]
=
fvec
::
setzero
();
fji
[
0
]
=
fvec
::
setzero
();
fji
[
1
]
=
fvec
::
setzero
();
fji
[
2
]
=
fvec
::
setzero
();
ivec
vi
=
i
;
ivec
vj
=
j
;
struct
aut_frebo_data
i_data
,
j_data
;
i_data
.
x_i
=
x_i
;
i_data
.
y_i
=
y_i
;
i_data
.
z_i
=
z_i
;
i_data
.
x_j
=
x_j
;
i_data
.
y_j
=
y_j
;
i_data
.
z_j
=
z_j
;
j_data
.
x_i
=
x_j
;
j_data
.
y_i
=
y_j
;
j_data
.
z_i
=
z_j
;
j_data
.
x_j
=
x_i
;
j_data
.
y_j
=
y_i
;
j_data
.
z_j
=
z_i
;
fvec
p_rcmin
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
jtype
]);
fvec
p_rcmax
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
jtype
]);
fvec
dwij
;
fvec
wij
=
aut_Sp_deriv
(
rij
,
p_rcmin
,
p_rcmax
,
&
dwij
);
fvec
nHi
=
fvec
::
gather
(
vi
,
ka
->
nH
,
sizeof
(
flt_t
));
fvec
nCi
=
fvec
::
gather
(
vi
,
ka
->
nC
,
sizeof
(
flt_t
));
fvec
nHj
=
fvec
::
gather
(
vj
,
ka
->
nH
,
sizeof
(
flt_t
));
fvec
nCj
=
fvec
::
gather
(
vj
,
ka
->
nC
,
sizeof
(
flt_t
));
fvec
Nij
=
nHi
+
nCi
-
wij
;
fvec
Nji
=
nHj
+
nCj
-
wij
;
i_data
.
nHi
=
nHi
;
i_data
.
nCi
=
nCi
;
j_data
.
nHi
=
nHj
;
j_data
.
nCi
=
nCj
;
fvec
the_r
=
fvec
::
set1
(
ka
->
params
.
rcmin
[
itype
][
jtype
]);
fvec
scale
=
the_r
/
rij
;
fvec
NconjtmpI
;
fvec
pij
=
aut_frebo_pij_pd_2
(
ka
,
&
i_data
,
itype
,
jtype
,
vi
,
vj
,
delx
*
scale
,
dely
*
scale
,
delz
*
scale
,
the_r
,
wij
,
VA
,
&
NconjtmpI
,
fij
);
if
(
i_data
.
buf_len
<
0
)
goto
exceed_limits
;
fvec
NconjtmpJ
;
fvec
rjix
=
fvec
::
setzero
()
-
delx
;
fvec
rjiy
=
fvec
::
setzero
()
-
dely
;
fvec
rjiz
=
fvec
::
setzero
()
-
delz
;
fvec
pji
=
aut_frebo_pij_pd_2
(
ka
,
&
j_data
,
jtype
,
itype
,
vj
,
vi
,
rjix
*
scale
,
rjiy
*
scale
,
rjiz
*
scale
,
the_r
,
wij
,
VA
,
&
NconjtmpJ
,
fji
);
fij
[
0
]
=
fij
[
0
]
-
fji
[
0
];
fij
[
1
]
=
fij
[
1
]
-
fji
[
1
];
fij
[
2
]
=
fij
[
2
]
-
fji
[
2
];
if
(
j_data
.
buf_len
<
0
)
goto
exceed_limits
;
fvec
Nijconj
=
c_1_0
+
NconjtmpI
*
NconjtmpI
+
NconjtmpJ
*
NconjtmpJ
;
fvec
dN3
[
3
];
fvec
pi_rc
=
aut_frebo_pi_rc_pd
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
dN3
);
fvec
c_TOL
=
fvec
::
set1
(
TOL
);
fvec
dN3_dh
[
3
];
fvec
Tij
=
aut_frebo_Tij
(
ka
,
itype
,
jtype
,
Nij
,
Nji
,
Nijconj
,
&
dN3_dh
[
0
]);
bvec
TijgtTOLmask
=
fvec
::
cmpnle
(
fvec
::
abs
(
Tij
),
c_TOL
);
fvec
sum_omega
=
fvec
::
setzero
();
if
(
bvec
::
test_any_set
(
TijgtTOLmask
))
{
sum_omega
=
aut_frebo_sum_omega
(
ka
,
&
i_data
,
&
j_data
,
itype
,
jtype
,
vi
,
vj
,
delx
*
scale
,
dely
*
scale
,
delz
*
scale
,
the_r
,
VA
*
Tij
,
fij
);
sum_omega
=
fvec
::
mask_blend
(
TijgtTOLmask
,
fvec
::
setzero
(),
sum_omega
);
}
fvec
pi_dh
=
Tij
*
sum_omega
;
fvec
bij
=
c_0_5
*
(
pij
+
pji
)
+
pi_rc
+
pi_dh
;
fvec
p_bLJmin
=
fvec
::
set1
(
ka
->
params
.
bLJmin
[
itype
][
jtype
]);
fvec
p_bLJmax
=
fvec
::
set1
(
ka
->
params
.
bLJmax
[
itype
][
jtype
]);
fvec
dStb
,
Stb
=
aut_Sp2_deriv
(
bij
,
p_bLJmin
,
p_bLJmax
,
&
dStb
);
bvec
need_bo_deriv
=
fvec
::
cmpneq
(
dStb
,
fvec
::
setzero
());
// fix up j_data, i_data, fij:
// multiply each by dStb
if
(
bvec
::
test_any_set
(
need_bo_deriv
))
{
i_data
.
force_i_x
=
dStb
*
i_data
.
force_i_x
;
i_data
.
force_i_y
=
dStb
*
i_data
.
force_i_y
;
i_data
.
force_i_z
=
dStb
*
i_data
.
force_i_z
;
i_data
.
force_j_x
=
dStb
*
i_data
.
force_j_x
;
i_data
.
force_j_y
=
dStb
*
i_data
.
force_j_y
;
i_data
.
force_j_z
=
dStb
*
i_data
.
force_j_z
;
j_data
.
force_i_x
=
dStb
*
j_data
.
force_i_x
;
j_data
.
force_i_y
=
dStb
*
j_data
.
force_i_y
;
j_data
.
force_i_z
=
dStb
*
j_data
.
force_i_z
;
j_data
.
force_j_x
=
dStb
*
j_data
.
force_j_x
;
j_data
.
force_j_y
=
dStb
*
j_data
.
force_j_y
;
j_data
.
force_j_z
=
dStb
*
j_data
.
force_j_z
;
for
(
int
k
=
0
;
k
<
i_data
.
buf_len
;
k
++
)
{
i_data
.
force_k_x_buf
[
k
]
=
dStb
*
i_data
.
force_k_x_buf
[
k
];
i_data
.
force_k_y_buf
[
k
]
=
dStb
*
i_data
.
force_k_y_buf
[
k
];
i_data
.
force_k_z_buf
[
k
]
=
dStb
*
i_data
.
force_k_z_buf
[
k
];
}
for
(
int
k
=
0
;
k
<
j_data
.
buf_len
;
k
++
)
{
j_data
.
force_k_x_buf
[
k
]
=
dStb
*
j_data
.
force_k_x_buf
[
k
];
j_data
.
force_k_y_buf
[
k
]
=
dStb
*
j_data
.
force_k_y_buf
[
k
];
j_data
.
force_k_z_buf
[
k
]
=
dStb
*
j_data
.
force_k_z_buf
[
k
];
}
fvec
fijc
[
3
];
fijc
[
0
]
=
dStb
*
fij
[
0
];
fijc
[
1
]
=
dStb
*
fij
[
1
];
fijc
[
2
]
=
dStb
*
fij
[
2
];
fij
[
0
]
=
scale
*
(
fijc
[
0
]
-
(
delx
*
delx
*
fijc
[
0
]
+
dely
*
delx
*
fijc
[
1
]
+
delz
*
delx
*
fijc
[
2
])
/
rsq
);
fij
[
1
]
=
scale
*
(
fijc
[
1
]
-
(
delx
*
dely
*
fijc
[
0
]
+
dely
*
dely
*
fijc
[
1
]
+
delz
*
dely
*
fijc
[
2
])
/
rsq
);
fij
[
2
]
=
scale
*
(
fijc
[
2
]
-
(
delx
*
delz
*
fijc
[
0
]
+
dely
*
delz
*
fijc
[
1
]
+
delz
*
delz
*
fijc
[
2
])
/
rsq
);
aut_frebo_N_spline_force
(
ka
,
&
i_data
,
itype
,
jtype
,
vi
,
vj
,
dStb
*
VA
,
dN3
[
0
],
dN3
[
2
],
NconjtmpI
);
aut_frebo_N_spline_force
(
ka
,
&
j_data
,
jtype
,
itype
,
vj
,
vi
,
dStb
*
VA
,
dN3
[
1
],
dN3
[
2
],
NconjtmpJ
);
if
(
bvec
::
test_any_set
(
TijgtTOLmask
))
{
aut_frebo_N_spline_force
(
ka
,
&
i_data
,
itype
,
jtype
,
vi
,
vj
,
dStb
*
VA
*
sum_omega
,
dN3_dh
[
0
],
dN3_dh
[
2
],
NconjtmpI
);
aut_frebo_N_spline_force
(
ka
,
&
j_data
,
jtype
,
itype
,
vj
,
vi
,
dStb
*
VA
*
sum_omega
,
dN3_dh
[
1
],
dN3_dh
[
2
],
NconjtmpJ
);
}
aut_frebo_data_writeback
(
ka
,
&
i_data
);
aut_frebo_data_writeback
(
ka
,
&
j_data
);
}
else
{
fij
[
0
]
=
fvec
::
setzero
();
fij
[
1
]
=
fvec
::
setzero
();
fij
[
2
]
=
fvec
::
setzero
();
}
fvec
fpdVLJ
=
cij
*
dVLJ
*
(
c_1_0
+
Str
*
(
Stb
-
c_1_0
));
fvec
fpdStr
=
dStr
*
cij
*
(
Stb
*
VLJ
-
VLJ
);
fvec
fpair
=
r2inv
*
rij
*
(
fvec
::
setzero
()
-
(
fpdVLJ
+
fpdStr
));
fvec
evdwl
=
VA
*
Stb
+
cij
*
VLJ
*
(
c_1_0
-
Str
);
fvec
result_f_i_x
=
fpair
*
delx
+
fij
[
0
];
fvec
result_f_i_y
=
fpair
*
dely
+
fij
[
1
];
fvec
result_f_i_z
=
fpair
*
delz
+
fij
[
2
];
fvec
result_f_j_x
=
fvec
::
setzero
()
-
result_f_i_x
;
fvec
result_f_j_y
=
fvec
::
setzero
()
-
result_f_i_y
;
fvec
result_f_j_z
=
fvec
::
setzero
()
-
result_f_i_z
;
flt_t
fi_x_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fi_y_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fi_z_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
fi_i_buf
[
ivec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fj_x_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fj_y_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
fj_z_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
int
fj_j_buf
[
ivec
::
VL
]
__attribute__
((
aligned
(
64
)));
flt_t
evdwl_buf
[
fvec
::
VL
]
__attribute__
((
aligned
(
64
)));
if
(
bvec
::
test_any_set
(
need_bo_deriv
))
{
result_f_i_x
=
i_data
.
force_i_x
+
result_f_i_x
;
result_f_i_y
=
i_data
.
force_i_y
+
result_f_i_y
;
result_f_i_z
=
i_data
.
force_i_z
+
result_f_i_z
;
result_f_j_x
=
i_data
.
force_j_x
+
result_f_j_x
;
result_f_j_y
=
i_data
.
force_j_y
+
result_f_j_y
;
result_f_j_z
=
i_data
.
force_j_z
+
result_f_j_z
;
result_f_i_x
=
j_data
.
force_j_x
+
result_f_i_x
;
result_f_i_y
=
j_data
.
force_j_y
+
result_f_i_y
;
result_f_i_z
=
j_data
.
force_j_z
+
result_f_i_z
;
result_f_j_x
=
j_data
.
force_i_x
+
result_f_j_x
;
result_f_j_y
=
j_data
.
force_i_y
+
result_f_j_y
;
result_f_j_z
=
j_data
.
force_i_z
+
result_f_j_z
;
}
fvec
::
store
(
fi_x_buf
,
result_f_i_x
);
fvec
::
store
(
fi_y_buf
,
result_f_i_y
);
fvec
::
store
(
fi_z_buf
,
result_f_i_z
);
ivec
::
store
(
fi_i_buf
,
vi
);
fvec
::
store
(
fj_x_buf
,
result_f_j_x
);
fvec
::
store
(
fj_y_buf
,
result_f_j_y
);
fvec
::
store
(
fj_z_buf
,
result_f_j_z
);
ivec
::
store
(
fj_j_buf
,
vj
);
fvec
::
store
(
evdwl_buf
,
evdwl
);
int
lane
;
for
(
lane
=
0
;
lane
<
fvec
::
VL
;
lane
++
)
{
int
ii
=
fi_i_buf
[
lane
];
result_f
[
ii
].
x
+=
fi_x_buf
[
lane
];
result_f
[
ii
].
y
+=
fi_y_buf
[
lane
];
result_f
[
ii
].
z
+=
fi_z_buf
[
lane
];
result_f
[
ii
].
w
+=
0.5
*
evdwl_buf
[
lane
];
int
jj
=
fj_j_buf
[
lane
];
result_f
[
jj
].
x
+=
fj_x_buf
[
lane
];
result_f
[
jj
].
y
+=
fj_y_buf
[
lane
];
result_f
[
jj
].
z
+=
fj_z_buf
[
lane
];
result_f
[
jj
].
w
+=
0.5
*
evdwl_buf
[
lane
];
}
ka
->
result_eng
+=
fvec
::
reduce_add
(
evdwl
);
if
(
bvec
::
test_any_set
(
need_path_force
))
{
fvec
dC
=
VLJ
*
(
Str
*
Stb
+
c_1_0
-
Str
);
aut_airebo_lj_force_path
(
ka
,
need_path_force
,
dC
,
testpath
);
}
return
;
}
exceed_limits:
for
(
int
l
=
0
;
l
<
fvec
::
VL
;
l
++
)
{
ref_lennard_jones_single_interaction
(
ka
,
ivec
::
at
(
i
,
l
),
ivec
::
at
(
j
,
l
),
MORSEFLAG
);
}
return
;
}
/*
* Calculate the lennard-jones interaction.
* Uses the above hash-map, and outlines the calculation if the bondorder is
* needed.
* Agressively compresses to get the most values calculated.
*/
template
<
int
MORSEFLAG
>
static
void
aut_lennard_jones
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
)
{
AtomAIREBOT
<
flt_t
>
*
x
=
ka
->
x
;
int
*
tag
=
ka
->
tag
;
int
*
map
=
ka
->
map
;
ResultForceT
<
acc_t
>
*
result_f
=
ka
->
result_f
;
ivec
c_i1
=
ivec
::
set1
(
1
);
ivec
c_i4
=
ivec
::
set1
(
4
);
fvec
c_1_0
=
fvec
::
set1
(
1.0
);
fvec
c_2_0
=
fvec
::
set1
(
2.0
);
fvec
c_0_0
=
fvec
::
set1
(
0.0
);
int
map_i_scalar
=
0
;
{
int
i
;
for
(
i
=
1
;
i
<
ka
->
num_types
;
i
++
)
{
if
(
ka
->
map
[
i
])
map_i_scalar
|=
(
1
<<
i
);
}
}
ivec
map_i
=
ivec
::
set1
(
map_i_scalar
);
fvec
result_eng
=
fvec
::
setzero
();
struct
aut_airebo_lj_test_path_result_data
test_path_result
;
for
(
int
i
=
0
;
i
<
OPT_TEST_PATH_SIZE
;
i
++
)
{
test_path_result
.
i
[
i
]
=
-
1
;
}
ivec
i_bo
[
2
][
2
];
ivec
j_bo
[
2
][
2
];
fvec
cij_bo
[
2
][
2
];
LennardJonesPathAIREBOT
<
flt_t
>
testpath_bo
[
2
][
2
][
fvec
::
VL
];
int
num_bo
[
2
][
2
]
=
{
0
};
for
(
int
i
=
ka
->
frebo_from_atom
;
i
<
ka
->
frebo_to_atom
;
i
++
)
{
ivec
itag_bc
=
ivec
::
set1
(
tag
[
i
]);
int
itype
=
map
[
x
[
i
].
w
];
fvec
x_i
=
fvec
::
set1
(
x
[
i
].
x
);
fvec
y_i
=
fvec
::
set1
(
x
[
i
].
y
);
fvec
z_i
=
fvec
::
set1
(
x
[
i
].
z
);
ivec
i_bc
=
ivec
::
set1
(
i
);
fvec
cutljsq0
=
fvec
::
set1
(
ka
->
params
.
cutljsq
[
itype
][
0
]);
fvec
cutljsq1
=
fvec
::
set1
(
ka
->
params
.
cutljsq
[
itype
][
1
]);
fvec
p_rcmax0
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
0
]);
fvec
p_rcmax1
=
fvec
::
set1
(
ka
->
params
.
rcmax
[
itype
][
1
]);
flt_t
sigcut
=
ka
->
params
.
sigcut
;
flt_t
sigmin
=
ka
->
params
.
sigmin
;
flt_t
sigma0
=
ka
->
params
.
sigma
[
itype
][
0
];
flt_t
rljmax0
=
sigcut
*
sigma0
;
flt_t
rljmin0
=
sigmin
*
sigma0
;
flt_t
sigma1
=
ka
->
params
.
sigma
[
itype
][
1
];
flt_t
rljmax1
=
sigcut
*
sigma1
;
flt_t
rljmin1
=
sigmin
*
sigma1
;
fvec
p_rljmax0
=
fvec
::
set1
(
rljmax0
);
fvec
p_rljmax1
=
fvec
::
set1
(
rljmax1
);
fvec
p_rljmin0
=
fvec
::
set1
(
rljmin0
);
fvec
p_rljmin1
=
fvec
::
set1
(
rljmin1
);
fvec
p_rcLJmax0
=
fvec
::
set1
(
ka
->
params
.
rcLJmax
[
itype
][
0
]);
fvec
p_rcLJmax1
=
fvec
::
set1
(
ka
->
params
.
rcLJmax
[
itype
][
1
]);
fvec
p_rcLJmin0
=
fvec
::
set1
(
ka
->
params
.
rcLJmin
[
itype
][
0
]);
fvec
p_rcLJmin1
=
fvec
::
set1
(
ka
->
params
.
rcLJmin
[
itype
][
1
]);
fvec
p_lj10
=
fvec
::
set1
(
ka
->
params
.
lj1
[
itype
][
0
]);
fvec
p_lj11
=
fvec
::
set1
(
ka
->
params
.
lj1
[
itype
][
1
]);
fvec
p_lj20
=
fvec
::
set1
(
ka
->
params
.
lj2
[
itype
][
0
]);
fvec
p_lj21
=
fvec
::
set1
(
ka
->
params
.
lj2
[
itype
][
1
]);
fvec
p_lj30
=
fvec
::
set1
(
ka
->
params
.
lj3
[
itype
][
0
]);
fvec
p_lj31
=
fvec
::
set1
(
ka
->
params
.
lj3
[
itype
][
1
]);
fvec
p_lj40
=
fvec
::
set1
(
ka
->
params
.
lj4
[
itype
][
0
]);
fvec
p_lj41
=
fvec
::
set1
(
ka
->
params
.
lj4
[
itype
][
1
]);
int
*
neighs
=
ka
->
neigh_lmp
.
entries
+
ka
->
neigh_lmp
.
offset
[
i
];
int
jnum
=
ka
->
neigh_lmp
.
num_half
[
i
];
bool
tap_success
=
aut_airebo_lj_test_all_paths
(
ka
,
i
,
&
test_path_result
);
if
(
!
tap_success
)
{
for
(
int
jj
=
0
;
jj
<
jnum
;
jj
++
)
{
ref_lennard_jones_single_interaction
(
ka
,
i
,
neighs
[
jj
],
MORSEFLAG
);
}
continue
;
}
ivec
j_2
;
fvec
delx_2
,
dely_2
,
delz_2
,
rsq_2
;
bvec
jtype_mask_2
;
int
num_2
=
0
;
fvec
result_f_i_x
=
fvec
::
setzero
();
fvec
result_f_i_y
=
fvec
::
setzero
();
fvec
result_f_i_z
=
fvec
::
setzero
();
int
jj
=
0
;
bool
rest_j
=
jj
<
jnum
;
bool
rest_2
=
fvec
::
fast_compress
();
#pragma forceinline recursive
while
(
rest_j
||
rest_2
)
{
fvec
delx
,
dely
,
delz
,
rsq
;
bvec
jtype_mask
,
within_cutoff
;
ivec
j
;
if
(
rest_j
)
{
bvec
mask_0
=
bvec
::
full
();
//0xFF >> (8 - (jnum - jj));
if
(
jj
+
(
fvec
::
VL
-
1
)
>=
jnum
)
mask_0
=
bvec
::
only
(
jnum
-
jj
);
j
=
ivec
::
maskz_loadu
(
mask_0
,
&
neighs
[
jj
]);
fvec
x_j
,
y_j
,
z_j
;
aut_loadatoms_vec
(
x
,
j
,
&
x_j
,
&
y_j
,
&
z_j
,
&
jtype_mask
,
map
,
map_i
,
c_i1
);
fvec
::
gather_prefetch0
(
ivec
::
mullo
(
c_i4
,
ivec
::
maskz_loadu
(
bvec
::
full
(),
&
neighs
[
jj
+
fvec
::
VL
])),
x
);
_mm_prefetch
((
const
char
*
)
&
neighs
[
jj
+
2
*
fvec
::
VL
],
_MM_HINT_T0
);
delx
=
x_i
-
x_j
;
dely
=
y_i
-
y_j
;
delz
=
z_i
-
z_j
;
rsq
=
delx
*
delx
+
dely
*
dely
+
delz
*
delz
;
fvec
cutoff_sq
=
fvec
::
mask_blend
(
jtype_mask
,
cutljsq0
,
cutljsq1
);
within_cutoff
=
fvec
::
mask_cmplt
(
mask_0
,
rsq
,
cutoff_sq
);
if
(
fvec
::
fast_compress
())
{
j
=
ivec
::
masku_compress
(
within_cutoff
,
j
);
delx
=
fvec
::
masku_compress
(
within_cutoff
,
delx
);
dely
=
fvec
::
masku_compress
(
within_cutoff
,
dely
);
delz
=
fvec
::
masku_compress
(
within_cutoff
,
delz
);
rsq
=
fvec
::
masku_compress
(
within_cutoff
,
rsq
);
jtype_mask
=
bvec
::
masku_compress
(
within_cutoff
,
jtype_mask
);
//within_cutoff = 0xFF >> (8 - _cc_popcnt(within_cutoff));
bvec
mask_2
=
bvec
::
after
(
num_2
);
//0xFF << num_2;
j_2
=
ivec
::
mask_expand
(
j_2
,
mask_2
,
j
);
delx_2
=
fvec
::
mask_expand
(
delx_2
,
mask_2
,
delx
);
dely_2
=
fvec
::
mask_expand
(
dely_2
,
mask_2
,
dely
);
delz_2
=
fvec
::
mask_expand
(
delz_2
,
mask_2
,
delz
);
rsq_2
=
fvec
::
mask_expand
(
rsq_2
,
mask_2
,
rsq
);
jtype_mask_2
=
bvec
::
mask_expand
(
jtype_mask_2
,
mask_2
,
jtype_mask
);
num_2
=
num_2
+
bvec
::
popcnt
(
within_cutoff
);
if
(
num_2
<
fvec
::
VL
)
{
jj
+=
fvec
::
VL
;
rest_j
=
jj
<
jnum
;
continue
;
}
num_2
-=
fvec
::
VL
;
//(0xFF >> (8 - num_2)) << (_cc_popcnt(within_cutoff) - num_2);
mask_2
=
bvec
::
onlyafter
(
num_2
,
bvec
::
popcnt
(
within_cutoff
)
-
num_2
);
{
ivec
tmp_j
=
j_2
;
j_2
=
ivec
::
masku_compress
(
mask_2
,
j
);
j
=
tmp_j
;
fvec
tmp_delx
=
delx_2
;
delx_2
=
fvec
::
masku_compress
(
mask_2
,
delx
);
delx
=
tmp_delx
;
fvec
tmp_dely
=
dely_2
;
dely_2
=
fvec
::
masku_compress
(
mask_2
,
dely
);
dely
=
tmp_dely
;
fvec
tmp_delz
=
delz_2
;
delz_2
=
fvec
::
masku_compress
(
mask_2
,
delz
);
delz
=
tmp_delz
;
fvec
tmp_rsq
=
rsq_2
;
rsq_2
=
fvec
::
masku_compress
(
mask_2
,
rsq
);
rsq
=
tmp_rsq
;
bvec
tmp_jtype_mask
=
jtype_mask_2
;
jtype_mask_2
=
bvec
::
masku_compress
(
mask_2
,
jtype_mask
);
jtype_mask
=
tmp_jtype_mask
;
within_cutoff
=
bvec
::
full
();
}
}
}
else
if
(
rest_2
)
{
rest_2
=
false
;
j
=
j_2
;
delx
=
delx_2
;
dely
=
dely_2
;
delz
=
delz_2
;
rsq
=
rsq_2
;
jtype_mask
=
jtype_mask_2
;
within_cutoff
=
bvec
::
only
(
num_2
);
num_2
=
0
;
}
bvec
current_mask
=
within_cutoff
;
if
(
bvec
::
test_all_unset
(
current_mask
))
{
jj
+=
fvec
::
VL
;
rest_j
=
jj
<
jnum
;
continue
;
}
fvec
rij
=
fvec
::
sqrt
(
rsq
);
LennardJonesPathAIREBOT
<
flt_t
>
testpath
[
fvec
::
VL
];
fvec
cij
=
c_1_0
;
fvec
p_cut3rebo
=
fvec
::
set1
(
ka
->
params
.
cut3rebo
);
bvec
need_search
=
fvec
::
mask_cmplt
(
current_mask
,
rij
,
p_cut3rebo
);
if
(
bvec
::
test_any_set
(
need_search
))
{
fvec
p_rcmax
=
fvec
::
mask_blend
(
jtype_mask
,
p_rcmax0
,
p_rcmax1
);
#pragma noinline
cij
=
aut_airebo_lj_tap_test_path
(
ka
,
&
test_path_result
,
need_search
,
i_bc
,
j
,
testpath
);
}
current_mask
=
fvec
::
mask_cmplt
(
current_mask
,
c_0_0
,
cij
);
if
(
bvec
::
test_all_unset
(
current_mask
))
{
jj
+=
fvec
::
VL
;
rest_j
=
jj
<
jnum
;
continue
;
}
bvec
need_path_force
=
fvec
::
mask_cmplt
(
current_mask
,
cij
,
c_1_0
);
fvec
p_rljmax
=
fvec
::
mask_blend
(
jtype_mask
,
p_rljmax0
,
p_rljmax1
);
fvec
p_rljmin
=
fvec
::
mask_blend
(
jtype_mask
,
p_rljmin0
,
p_rljmin1
);
fvec
dslw
,
slw
=
aut_Sp2_deriv
(
rij
,
p_rljmin
,
p_rljmax
,
&
dslw
);
fvec
p_lj1
=
fvec
::
mask_blend
(
jtype_mask
,
p_lj10
,
p_lj11
);
fvec
p_lj2
=
fvec
::
mask_blend
(
jtype_mask
,
p_lj20
,
p_lj21
);
fvec
p_lj3
=
fvec
::
mask_blend
(
jtype_mask
,
p_lj30
,
p_lj31
);
fvec
p_lj4
=
fvec
::
mask_blend
(
jtype_mask
,
p_lj40
,
p_lj41
);
fvec
vdw
,
dvdw
;
fvec
r2inv
=
fvec
::
recip
(
rsq
);
if
(
MORSEFLAG
)
{
fvec
exr
=
fvec
::
exp
(
fvec
::
setzero
()
-
rij
*
p_lj4
);
vdw
=
p_lj1
*
exr
*
(
p_lj2
*
exr
-
c_2_0
);
dvdw
=
p_lj3
*
exr
*
(
c_1_0
-
p_lj2
*
exr
);
}
else
{
fvec
r6inv
=
r2inv
*
r2inv
*
r2inv
;
vdw
=
r6inv
*
(
p_lj3
*
r6inv
-
p_lj4
);
fvec
r7inv
=
r6inv
*
rij
*
r2inv
;
dvdw
=
r7inv
*
(
p_lj2
-
p_lj1
*
r6inv
);
}
fvec
VLJ
=
vdw
*
slw
;
fvec
dVLJ
=
dvdw
*
slw
+
vdw
*
dslw
;
fvec
p_rcLJmin
=
fvec
::
mask_blend
(
jtype_mask
,
p_rcLJmin0
,
p_rcLJmin1
);
fvec
p_rcLJmax
=
fvec
::
mask_blend
(
jtype_mask
,
p_rcLJmax0
,
p_rcLJmax1
);
fvec
dStr
,
Str
=
aut_Sp2_deriv
(
rij
,
p_rcLJmin
,
p_rcLJmax
,
&
dStr
);
fvec
VA
=
cij
*
VLJ
*
Str
;
bvec
need_bondorder
=
fvec
::
mask_cmplt
(
current_mask
,
c_0_0
,
Str
);
fvec
Stb
=
fvec
::
setzero
();
fvec
fij
[
3
];
fij
[
0
]
=
fvec
::
setzero
();
fij
[
1
]
=
fvec
::
setzero
();
fij
[
2
]
=
fvec
::
setzero
();
if
(
bvec
::
test_any_set
(
need_bondorder
))
{
for
(
int
jtype
=
0
;
jtype
<
2
;
jtype
++
)
{
bvec
need_bo_with_jtype
=
need_bondorder
;
if
(
jtype
)
need_bo_with_jtype
=
need_bo_with_jtype
&
jtype_mask
;
else
need_bo_with_jtype
=
need_bo_with_jtype
&
~
jtype_mask
;
ivec
jtmp
=
ivec
::
masku_compress
(
need_bo_with_jtype
,
j
);
ivec
itmp
=
ivec
::
masku_compress
(
need_bo_with_jtype
,
ivec
::
set1
(
i
));
fvec
cijtmp
=
fvec
::
masku_compress
(
need_bo_with_jtype
,
cij
);
bvec
insert_mask
=
bvec
::
after
(
num_bo
[
itype
][
jtype
]);
i_bo
[
itype
][
jtype
]
=
ivec
::
mask_expand
(
i_bo
[
itype
][
jtype
],
insert_mask
,
itmp
);
j_bo
[
itype
][
jtype
]
=
ivec
::
mask_expand
(
j_bo
[
itype
][
jtype
],
insert_mask
,
jtmp
);
cij_bo
[
itype
][
jtype
]
=
fvec
::
mask_expand
(
cij_bo
[
itype
][
jtype
],
insert_mask
,
cijtmp
);
bvec
need_path_force_with_jtype
=
need_bo_with_jtype
&
need_path_force
;
int
testpath_end
=
fvec
::
VL
;
if
(
bvec
::
test_any_set
(
need_path_force_with_jtype
))
{
int
pos
=
num_bo
[
itype
][
jtype
];
for
(
int
l
=
0
;
l
<
fvec
::
VL
;
l
++
)
{
if
(
pos
>=
fvec
::
VL
)
{
testpath_end
=
l
;
break
;
}
if
(
bvec
::
test_at
(
need_path_force_with_jtype
,
l
))
{
testpath_bo
[
itype
][
jtype
][
pos
]
=
testpath
[
l
];
}
if
(
bvec
::
test_at
(
need_bo_with_jtype
,
l
))
{
pos
+=
1
;
}
}
}
num_bo
[
itype
][
jtype
]
=
num_bo
[
itype
][
jtype
]
+
bvec
::
popcnt
(
need_bo_with_jtype
);
if
(
num_bo
[
itype
][
jtype
]
>=
fvec
::
VL
)
{
#pragma noinline
aut_lj_with_bo
<
MORSEFLAG
>
(
ka
,
itype
,
jtype
,
i_bo
[
itype
][
jtype
],
j_bo
[
itype
][
jtype
],
cij_bo
[
itype
][
jtype
],
testpath_bo
[
itype
][
jtype
]);
num_bo
[
itype
][
jtype
]
-=
fvec
::
VL
;
insert_mask
=
bvec
::
onlyafter
(
num_bo
[
itype
][
jtype
],
bvec
::
popcnt
(
need_bo_with_jtype
)
-
num_bo
[
itype
][
jtype
]);
i_bo
[
itype
][
jtype
]
=
ivec
::
masku_compress
(
insert_mask
,
itmp
);
j_bo
[
itype
][
jtype
]
=
ivec
::
masku_compress
(
insert_mask
,
jtmp
);
cij_bo
[
itype
][
jtype
]
=
fvec
::
masku_compress
(
insert_mask
,
cijtmp
);
if
(
bvec
::
test_any_set
(
need_path_force_with_jtype
))
{
int
pos
=
0
;
for
(
int
l
=
testpath_end
;
l
<
fvec
::
VL
;
l
++
)
{
if
(
bvec
::
test_at
(
need_path_force_with_jtype
,
l
))
{
testpath_bo
[
itype
][
jtype
][
pos
]
=
testpath
[
l
];
}
if
(
bvec
::
test_at
(
need_bo_with_jtype
,
l
))
{
pos
+=
1
;
}
}
}
}
}
current_mask
=
current_mask
&
~
need_bondorder
;
need_path_force
=
need_path_force
&
~
need_bondorder
;
}
fvec
fpdVLJ
=
cij
*
dVLJ
*
(
c_1_0
+
Str
*
(
Stb
-
c_1_0
));
fvec
fpdStr
=
dStr
*
cij
*
(
Stb
*
VLJ
-
VLJ
);
fvec
fpair
=
r2inv
*
rij
*
(
fvec
::
setzero
()
-
(
fpdVLJ
+
fpdStr
));
fvec
evdwl
=
VA
*
Stb
+
cij
*
VLJ
*
(
c_1_0
-
Str
);
fvec
fix
=
fpair
*
delx
+
fij
[
0
];
fvec
fiy
=
fpair
*
dely
+
fij
[
1
];
fvec
fiz
=
fpair
*
delz
+
fij
[
2
];
result_f_i_x
=
fvec
::
mask_add
(
result_f_i_x
,
current_mask
,
result_f_i_x
,
fix
);
result_f_i_y
=
fvec
::
mask_add
(
result_f_i_y
,
current_mask
,
result_f_i_y
,
fiy
);
result_f_i_z
=
fvec
::
mask_add
(
result_f_i_z
,
current_mask
,
result_f_i_z
,
fiz
);
result_eng
=
fvec
::
mask_add
(
result_eng
,
current_mask
,
result_eng
,
evdwl
);
ivec
j_dbl_idx
=
ivec
::
mullo
(
j
,
c_i4
);
avec
fjx
=
avec
::
mask_gather
(
avec
::
undefined
(),
current_mask
,
j_dbl_idx
,
&
ka
->
result_f
[
0
].
x
,
sizeof
(
acc_t
));
avec
fjy
=
avec
::
mask_gather
(
avec
::
undefined
(),
current_mask
,
j_dbl_idx
,
&
ka
->
result_f
[
0
].
y
,
sizeof
(
acc_t
));
avec
fjz
=
avec
::
mask_gather
(
avec
::
undefined
(),
current_mask
,
j_dbl_idx
,
&
ka
->
result_f
[
0
].
z
,
sizeof
(
acc_t
));
fjx
=
fjx
-
fix
;
fjy
=
fjy
-
fiy
;
fjz
=
fjz
-
fiz
;
avec
::
mask_i32loscatter
(
&
ka
->
result_f
[
0
].
x
,
current_mask
,
j_dbl_idx
,
fjx
,
sizeof
(
acc_t
));
avec
::
mask_i32loscatter
(
&
ka
->
result_f
[
0
].
y
,
current_mask
,
j_dbl_idx
,
fjy
,
sizeof
(
acc_t
));
avec
::
mask_i32loscatter
(
&
ka
->
result_f
[
0
].
z
,
current_mask
,
j_dbl_idx
,
fjz
,
sizeof
(
acc_t
));
if
(
bvec
::
test_any_set
(
need_path_force
))
{
fvec
dC
=
VLJ
*
(
Str
*
Stb
+
c_1_0
-
Str
);
#pragma noinline
aut_airebo_lj_force_path
(
ka
,
need_path_force
,
dC
,
testpath
);
}
jj
+=
fvec
::
VL
;
rest_j
=
jj
<
jnum
;
}
ka
->
result_f
[
i
].
x
+=
fvec
::
reduce_add
(
result_f_i_x
);
ka
->
result_f
[
i
].
y
+=
fvec
::
reduce_add
(
result_f_i_y
);
ka
->
result_f
[
i
].
z
+=
fvec
::
reduce_add
(
result_f_i_z
);
}
for
(
int
itype
=
0
;
itype
<
2
;
itype
++
)
{
for
(
int
jtype
=
0
;
jtype
<
2
;
jtype
++
)
{
for
(
int
l
=
0
;
l
<
num_bo
[
itype
][
jtype
];
l
++
)
{
ref_lennard_jones_single_interaction
(
ka
,
ivec
::
at
(
i_bo
[
itype
][
jtype
],
l
),
ivec
::
at
(
j_bo
[
itype
][
jtype
],
l
),
MORSEFLAG
);
}
}
}
ka
->
result_eng
+=
fvec
::
reduce_add
(
result_eng
);
}
};
template
<
typename
flt_t
,
typename
acc_t
>
void
aut_lennard_jones
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
morseflag
)
{
#ifdef LMP_INTEL_AIREBO_REF
ref_lennard_jones
(
ka
,
morseflag
);
#else
if
(
morseflag
)
{
aut_wrap
<
flt_t
,
acc_t
>::
template
aut_lennard_jones
<
1
>
(
ka
);
}
else
{
aut_wrap
<
flt_t
,
acc_t
>::
template
aut_lennard_jones
<
0
>
(
ka
);
}
#endif
}
template
<
typename
flt_t
,
typename
acc_t
>
void
aut_rebo_neigh
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
)
{
#ifdef LMP_INTEL_AIREBO_REF
ref_rebo_neigh
(
ka
);
#else
aut_wrap
<
flt_t
,
acc_t
>::
aut_rebo_neigh
(
ka
);
#endif
}
template
<
typename
flt_t
,
typename
acc_t
>
void
aut_frebo
(
KernelArgsAIREBOT
<
flt_t
,
acc_t
>
*
ka
,
int
torsion_flag
)
{
#ifdef LMP_INTEL_AIREBO_REF
ref_frebo
(
ka
,
torsion_flag
);
#else
aut_wrap
<
flt_t
,
acc_t
>::
aut_frebo
(
ka
,
torsion_flag
);
#endif
}
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
}
Event Timeline
Log In to Comment