/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "pair_tersoff_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
// Currently the Intel compiler is required for this pair style.
// For convenience, the base class routines are called when not compiling with the Intel compiler.
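// This style is normally enabled through the LAMMPS suffix machinery
// (e.g. "-sf intel" on the command line or the "suffix intel" command)
// together with a "package intel" command; init_style() below checks that
// the corresponding fix ("package_intel") is present.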
#ifndef __INTEL_COMPILER
using namespace LAMMPS_NS;
PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
}
void PairTersoffIntel::compute(int eflag, int vflag)
{
PairTersoff::compute(eflag, vflag);
}
void PairTersoffIntel::init_style()
{
if (comm->me == 0) {
error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
"Using MANYBODY version.");
}
PairTersoff::init_style();
}
#else
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#include "intel_intrinsics.h"
#include "math_const.h"
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include "group.h"
#include "kspace.h"
#include "modify.h"
#include "suffix.h"
using namespace LAMMPS_NS;
using namespace MathConst;
/* ---------------------------------------------------------------------- */
PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
}
/* ---------------------------------------------------------------------- */
// Dispatch the requested precision
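// flt_t is the precision of the packed coordinate/parameter data, acc_t the
// precision of the force/energy accumulators; "mixed" therefore means
// single-precision data with double-precision accumulation.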
void PairTersoffIntel::compute(int eflag, int vflag)
{
if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
} else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
} else {
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
fix->balance_stamp();
vflag_fdotr = 0;
}
// Dispatch the extent of computation:
// do we need to calculate energy/virial
template <class flt_t, class acc_t>
void PairTersoffIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag,vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
nthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
}
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
// The complete Tersoff computation kernel is encapsulated here.
// Everything is static; the class just serves as a unit of organization.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> {
// instantiate the vector library and import the types
typedef typename lmp_intel::vector_routines<flt_t, acc_t, mic> v;
typedef typename v::fvec fvec;
typedef typename v::ivec ivec;
typedef typename v::bvec bvec;
typedef typename v::avec avec;
typedef typename v::iarr iarr;
typedef typename v::farr farr;
typedef typename v::aarr aarr;
typedef typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t c_inner_t;
typedef typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t c_outer_t;
// For descriptions of these methods, please have a look at the original
// (scalar) code; here they are inlined and vectorized.
// attractive_vector() additionally provides an option to compute zeta.
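// In the usual Tersoff notation (cf. the scalar pair_tersoff.cpp) these
// routines evaluate, per i-j pair and neighbor k:
//   zeta_ij  = sum_k fC(r_ik) * g(theta_ijk) * exp[lam3^m * (r_ij - r_ik)^m]
//   g(theta) = gamma * (1 + c^2/d^2 - c^2 / (d^2 + (h - cos(theta))^2))
//   b_ij     = (1 + (beta * zeta_ij)^n)^(-1/(2n))
//   V_ij     = fC(r_ij) * (A*exp(-lam1*r_ij) - b_ij * B*exp(-lam2*r_ij))
// with fC the smooth cutoff built from bigr/bigd. zeta_vector() returns the
// zeta contribution of one k, force_zeta_vector() the pair force and the
// prefactor derived from b_ij, and attractive_vector() the resulting
// three-body forces on i, j and k.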
static fvec zeta_vector(
const c_inner_t * param,
ivec xjw, bvec mask,
fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz
);
static void force_zeta_vector(
const c_outer_t * param,
ivec xjw,
bvec mask,
fvec vrijsq, fvec vzeta_ij,
fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
bvec vmask_repulsive
);
template<bool ZETA>
static void attractive_vector(
const c_inner_t * param,
ivec xjw,
bvec mask,
fvec vprefactor,
fvec vrijsq, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz,
fvec *fix, fvec *fiy, fvec *fiz,
fvec *fjx, fvec *fjy, fvec *fjz,
fvec *fkx, fvec *fky, fvec *fkz,
fvec *zeta
);
// perform the actual computation
template<bool EVFLAG, bool EFLAG>
static void kernel(
int iito, int iifrom, int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const numneighhalf,
const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
);
// perform one step of the calculation; pass in packed i-j pairs of atoms (is, js)
template<int EVFLAG, int EFLAG>
static void kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const cnumneigh,
const int * _noalias const firstneigh,
int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5,
int compress_idx, iarr is, iarr js, bvec vmask_repulsive
);
// perform one step of the calculation; unlike the previous method,
// i is fixed here and only a number of js are passed in
template<int EVFLAG, int EFLAG>
static void kernel_step_const_i(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
int compress_idx, int i, iarr js, bvec vmask_repulsive
);
};
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
/* ---------------------------------------------------------------------- */
// Dispatch to the correct kernel instantiation and perform all the work necessary
// for offloading. In this routine we enter the Phi.
// This method is nearly identical to what happens in the other /intel styles
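// When offloading is compiled in, the block below runs inside a "#pragma offload"
// region on the coprocessor (asynchronously, signalled via f_start); otherwise
// the same block executes on the host. Results are handed back to the fix
// through add_result_array() at the end.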
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairTersoffIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
tagint * _noalias tag = this->atom->tag;
flt_t * _noalias const q = buffers->get_q(offload);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const firstneigh = buffers->firstneigh(list);
typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
typedef typename ForceConst<flt_t>::c_cutoff_t c_cutoff_t;
const c_outer_t * _noalias const c_outer = fc.c_outer[0];
const c_inner_t * _noalias const c_inner = fc.c_inner[0][0];
const c_cutoff_t * _noalias const c_inner_cutoff = fc.c_cutoff_inner[0][0];
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
#ifdef _LMP_INTEL_OFFLOAD
int *overflow = fix->get_off_overflow_flag();
double *timer_compute = fix->off_watch_pair();
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
#pragma offload target(mic:_cop) if(offload) \
in(c_inner, c_outer :length(0) alloc_if(0) free_if(0)) \
in(c_inner_cutoff :length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
in(f_stride,nlocal,minlocal,separate_flag,offload) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(f_start)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
*timer_compute = MIC_Wtime();
#endif
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
{
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
#define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
// Pick the variable-i (packed) algorithm only under specific conditions;
// use the scalar algorithm for very short vectors.
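// Three code paths exist below:
//  - use_scalar (VL < 4): instantiate the kernel with the scalar
//    (lmp_intel::NONE) vector mode
//  - pack_i == true: pack (i,j) pairs from different i atoms into one vector,
//    which requires integer gather support (kernel_step() is used)
//  - pack_i == false: keep i fixed per vector pass and pack only the j values
//    (kernel_step_const_i() is used)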
int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
bool pack_i = VL >= 8 &&
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
bool use_scalar = VL < 4;
if (use_scalar) {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
} else if (pack_i) {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
} else {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
}
if (EVFLAG) {
if (EFLAG) oevdwl += sevdwl;
if (vflag == 1) {
ov0 += sv0;
ov1 += sv1;
ov2 += sv2;
ov3 += sv3;
ov4 += sv4;
ov5 += sv5;
}
}
}
#ifndef _LMP_INTEL_OFFLOAD
if (vflag == 2)
#endif
{
#if defined(_OPENMP)
#pragma omp barrier
#endif
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
nlocal, minlocal, nthreads, f_start, f_stride,
x, offload);
}
} // end of omp parallel region
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = 0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
}
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
#endif
} // end of offload region
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EVFLAG)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
// As in any other /intel pair style
void PairTersoffIntel::init_style()
{
if (atom->tag_enable == 0)
error->all(FLERR,"Pair style Tersoff requires atom IDs");
if (force->newton_pair == 0)
error->all(FLERR,"Pair style Tersoff requires newton pair on");
// need a full neighbor list
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
neighbor->requests[irequest]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
pack_force_const(force_const_single, fix->get_mixed_buffers());
fix->get_mixed_buffers()->need_tag(1);
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->need_tag(1);
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
pack_force_const(force_const_single, fix->get_single_buffers());
fix->get_single_buffers()->need_tag(1);
}
}
// As in any other /intel pair style
template <class flt_t, class acc_t>
void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat the cutsq calculation because it runs after the call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
}
}
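// Copy the per-type-triple Tersoff parameters into the flattened ForceConst
// tables, converted to the kernel precision flt_t. The vectorized kernels
// gather from c_inner (i-j-k terms) and c_outer (i-j pair terms); the
// c_cutoff_inner table is padded to a multiple of the vector length
// (see set_ntypes()).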
for (int i = 1; i < tp1; i++) {
for (int j = 1; j < tp1; j++) {
fc.c_inner_loop[i][j][0].d2 = 1.0;
fc.c_inner_loop[i][0][j].d2 = 1.0;
fc.c_inner_loop[0][i][j].d2 = 1.0;
for (int k = 1; k < tp1; k++) {
Param * param = &params[elem2param[map[i]][map[j]][map[k]]];
fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);
fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
}
Param * param = &params[elem2param[map[i]][map[j]][map[j]]];
fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_first_loop[i][j].bigr = static_cast<flt_t>(param->bigr);
fc.c_first_loop[i][j].bigd = static_cast<flt_t>(param->bigd);
fc.c_first_loop[i][j].lam1 = static_cast<flt_t>(param->lam1);
fc.c_first_loop[i][j].biga = static_cast<flt_t>(param->biga);
fc.c_second_loop[i][j].lam2 = static_cast<flt_t>(param->lam2);
fc.c_second_loop[i][j].beta = static_cast<flt_t>(param->beta);
fc.c_second_loop[i][j].bigb = static_cast<flt_t>(param->bigb);
fc.c_second_loop[i][j].powern = static_cast<flt_t>(param->powern);
fc.c_second_loop[i][j].c1 = static_cast<flt_t>(param->c1);
fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
fc.c_outer[i][j].lam1 = static_cast<flt_t>(param->lam1);
fc.c_outer[i][j].biga = static_cast<flt_t>(param->biga);
fc.c_outer[i][j].lam2 = static_cast<flt_t>(param->lam2);
fc.c_outer[i][j].beta = static_cast<flt_t>(param->beta);
fc.c_outer[i][j].bigb = static_cast<flt_t>(param->bigb);
fc.c_outer[i][j].powern = static_cast<flt_t>(param->powern);
fc.c_outer[i][j].c1 = static_cast<flt_t>(param->c1);
fc.c_outer[i][j].c2 = static_cast<flt_t>(param->c2);
fc.c_outer[i][j].c3 = static_cast<flt_t>(param->c3);
fc.c_outer[i][j].c4 = static_cast<flt_t>(param->c4);
}
fc.c_outer[i][0].cutsq = 0.;
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
typename ForceConst<flt_t>::c_first_loop_t * c_first_loop = fc.c_first_loop[0];
typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_outer = fc.c_cutoff_outer[0];
typename ForceConst<flt_t>::c_outer_t * c_outer = fc.c_outer[0];
typename ForceConst<flt_t>::c_second_loop_t * c_second_loop = fc.c_second_loop[0];
typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int ntypes_pad = ntypes + VL - ntypes % VL;
int tp1sq = tp1 * tp1;
int tp1cb = tp1 * tp1 * tp1;
int tp1cb_pad = tp1 * tp1 * ntypes_pad;
#pragma offload_transfer target(mic:_cop) \
in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
#endif
}
/* ---------------------------------------------------------------------- */
// As in any other /intel pair style
template <class flt_t>
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
Memory *memory,
const int cop) {
if ( (ntypes != _ntypes) ) {
if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
c_first_loop_t * oc_first_loop = c_first_loop[0];
c_second_loop_t * oc_second_loop = c_second_loop[0];
c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
c_inner_t * oc_inner = c_inner[0][0];
c_outer_t * oc_outer = c_outer[0];
if (c_first_loop != NULL && c_second_loop != NULL &&
c_inner_loop != NULL && _cop >= 0) {
#pragma offload_transfer target(mic:cop) \
nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
nocopy(oc_inner, oc_outer: alloc_if(0) free_if(1))
}
#endif
_memory->destroy(c_first_loop);
_memory->destroy(c_second_loop);
_memory->destroy(c_inner_loop);
_memory->destroy(c_cutoff_outer);
_memory->destroy(c_cutoff_inner);
_memory->destroy(c_inner);
_memory->destroy(c_outer);
}
if (ntypes > 0) {
_cop = cop;
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes_pad = ntypes + VL - ntypes % VL;
memory->create(c_first_loop,ntypes,ntypes,"fc.c_first_loop");
memory->create(c_second_loop,ntypes,ntypes,"fc.c_second_loop");
memory->create(c_cutoff_outer,ntypes,ntypes,"fc.c_cutoff_outer");
memory->create(c_inner_loop,ntypes,ntypes,ntypes,"fc.c_inner_loop");
memory->create(c_cutoff_inner,ntypes,ntypes,ntypes_pad,"fc.c_cutoff_inner");
memory->create(c_inner,ntypes,ntypes,ntypes,"fc.c_inner");
memory->create(c_outer,ntypes,ntypes,"fc.c_outer");
#ifdef _LMP_INTEL_OFFLOAD
c_first_loop_t * oc_first_loop = c_first_loop[0];
c_second_loop_t * oc_second_loop = c_second_loop[0];
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
c_inner_t * oc_inner = c_inner[0][0];
c_outer_t * oc_outer = c_outer[0];
int tp1sq = ntypes * ntypes;
int tp1cb = ntypes * ntypes * ntypes;
int tp1cb_pad = ntypes * ntypes * ntypes_pad;
if (oc_first_loop != NULL && oc_second_loop != NULL &&
oc_inner_loop != NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \
nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_second_loop: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_cutoff_outer: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_outer: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_inner_loop: length(tp1cb) alloc_if(1) free_if(0)) \
nocopy(oc_inner: length(tp1cb) alloc_if(1) free_if(0)) \
nocopy(oc_cutoff_inner: length(tp1cb_pad) alloc_if(1) free_if(0))
}
#endif
}
}
_ntypes=ntypes;
_memory=memory;
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
// The number of k-loop steps up to which force contributions are cached
static const int N_CACHE = 8;
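// Rationale: the k-loop force contributions can only be scaled once the full
// zeta_ij sum (and thus the prefactor from force_zeta_vector()) is known. The
// first N_CACHE k-steps therefore cache their unscaled contributions and are
// scaled afterwards; any remaining k-steps are recomputed in a second pass.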
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl,
avec *vsv0,
avec *vsv1,
avec *vsv2,
avec* vsv3,
avec *vsv4,
avec *vsv5,
int compress_idx,
iarr is,
iarr js,
bvec vmask_repulsive
) {
ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
ivec v_i1(1);
fvec v_2(0.);
fvec v_0_5(0.5);
ivec v_i0(0);
ivec v_i_ntypes(ntypes);
ivec v_i_NEIGHMASK(NEIGHMASK);
farr fx, fy, fz, fw;
int cache_idx = 0;
fvec vfkx_cache[N_CACHE];
fvec vfky_cache[N_CACHE];
fvec vfkz_cache[N_CACHE];
ivec vks_cache[N_CACHE];
bvec vmask_cache[N_CACHE];
ivec vkks_final_cache;
bvec vmask_final_cache;
iarr ts;
// compute all the quantities that depend only on i and j
// TODO: We could extract this from the driver routine
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
ivec vw_i = v_i0;
v::gather_x(vis, vmask, x, &vx_i, &vy_i, &vz_i, &vw_i);
fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
ivec vw_j = v_i0;
v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
ivec vis_orig = v::int_load_vl(is);
ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
fvec vzeta = v::zero();
fvec vfxtmp = v::zero(), vfytmp = v::zero(), vfztmp = v::zero();
fvec vfjxtmp = v::zero(), vfjytmp = v::zero(), vfjztmp = v::zero();
// This piece of code facilitates the traversal of the k loop assuming
// nothing about i. As such, it uses masking to avoid superfluous loads
// and fast-forwards each lane until work is available.
// This is useful because we cannot make assumptions as to where in the
// neighbor list the atoms within the cutoff might be.
// We also implement the caching in here, i.e. collect force contributions
// due to zeta.
// This means that you will see four loops:
// 1. the loop that does zeta calculation and caches the force contributions
// 2. the loop that processes the remaining zeta calculations
// 3. the loop that updates the force based on the cached force contributions
// 4. the loop that computes force contributions for the remainder
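// Each lane keeps its own neighbor index (vkks). A full vector step is only
// taken once every still-active lane points at a neighbor that passes the
// cutoff and the k != j test (veff_mask covers all of vactive_mask); otherwise
// only the lanes without valid work advance their index, while veff_old_mask
// remembers the lanes whose gathered data is still valid for the next round.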
{
ivec vkks = v_i0;
bvec vactive_mask = vmask;
bvec veff_old_mask(0);
ivec vks, vw_k;
fvec vx_k, vy_k, vz_k, vcutsq;
while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = (vx_k - vx_i);
fvec vdy_ik = (vy_k - vy_i);
fvec vdz_ik = (vz_k - vz_i);
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
fvec vzeta_contrib;
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<true>(c_inner,vc_idx,veff_mask,fvec(1.),
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
&vzeta_contrib);
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
vfkx_cache[cache_idx] = vfkx;
vfky_cache[cache_idx] = vfky;
vfkz_cache[cache_idx] = vfkz;
vks_cache[cache_idx] = vks;
vmask_cache[cache_idx] = veff_mask;
cache_idx += 1;
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
vkks = vkks + v_i1;
veff_old_mask = bvec(0);
} else {
vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
veff_old_mask = veff_mask;
}
vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
}
vkks_final_cache = vkks;
vmask_final_cache = vactive_mask;
while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = (vx_k - vx_i);
fvec vdy_ik = (vy_k - vy_i);
fvec vdz_ik = (vz_k - vz_i);
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
fvec vzeta_contrib;
vzeta_contrib = zeta_vector(c_inner,vc_idx,veff_mask,vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
vkks = vkks + v_i1;
veff_old_mask = bvec(0);
} else {
vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
veff_old_mask = veff_mask;
}
vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
}
}
fvec vfpair, vevdwl, vprefactor, vfwtmp, vfjwtmp;
force_zeta_vector(c_outer, vc_idx_ij, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
vfxtmp = vfxtmp * vprefactor + vdx_ij * vfpair;
vfytmp = vfytmp * vprefactor + vdy_ij * vfpair;
vfztmp = vfztmp * vprefactor + vdz_ij * vfpair;
vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
if (EVFLAG) {
if (EFLAG) {
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
if (eatom) {
v::store(fw, (v_0_5 * vevdwl));
}
}
if (vflag == 1) {
*vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
*vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
*vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
*vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
*vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
*vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
}
}
{
while (cache_idx-- > 0) {
fvec vfkx = vprefactor * vfkx_cache[cache_idx];
fvec vfky = vprefactor * vfky_cache[cache_idx];
fvec vfkz = vprefactor * vfkz_cache[cache_idx];
ivec vks = vks_cache[cache_idx];
bvec veff_mask = vmask_cache[cache_idx];
v::store(fx, vfkx);
v::store(fy, vfky);
v::store(fz, vfkz);
v::int_store(ts, vks);
for (int t = 0; t < v::VL; t++) {
if (v::mask_test_at(veff_mask, t)) {
int t_ = ts[t] / (4 * sizeof(typename v::fscal));
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
}
}
}
ivec vkks = vkks_final_cache;
bvec vactive_mask = vmask_final_cache;
bvec veff_old_mask(0);
ivec vks, vw_k;
fvec vx_k, vy_k, vz_k, vcutsq;
while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<false>(c_inner,vc_idx,veff_mask,vprefactor,
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
0);
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
v::store(fx, vfkx);
v::store(fy, vfky);
v::store(fz, vfkz);
v::int_store(ts, vks);
for (int t = 0; t < v::VL; t++) {
if (v::mask_test_at(veff_mask, t)) {
int t_ = ts[t] / (4 * sizeof(typename v::fscal));
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
}
}
vkks = vkks + v_i1;
veff_old_mask = bvec(0);
} else {
vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
veff_old_mask = veff_mask;
}
vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
} // while (vactive_mask != 0)
} // section k
// We cannot make any assumptions regarding write conflicts,
// so we serialize this scatter.
// TODO: Once AVX-512 is around, check out VPCONFLICT
v::store(fx, vfjxtmp);
v::store(fy, vfjytmp);
v::store(fz, vfjztmp);
for (int t = 0; t < compress_idx; t++) {
int t_ = js[t];
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
if (EVFLAG && EFLAG && eatom) {
f[t_].w += fw[t];
}
}
v::store(fx, vfxtmp);
v::store(fy, vfytmp);
v::store(fz, vfztmp);
for (int t = 0; t < compress_idx; t++) {
int t_ = is[t];
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
if (EVFLAG && EFLAG && eatom) {
f[t_].w += fw[t];
}
}
}
// Specialized kernel step for fixed i. This means we don't have to use the
// convoluted iteration scheme above, as the loop variables are uniform.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl,
avec *vsv0,
avec *vsv1,
avec *vsv2,
avec* vsv3,
avec *vsv4,
avec *vsv5,
int compress_idx,
int i,
iarr js,
bvec vmask_repulsive
) {
typedef typename v::fvec fvec;
typedef typename v::ivec ivec;
typedef typename v::bvec bvec;
typedef typename v::farr farr;
typedef typename v::iarr iarr;
typedef typename v::avec avec;
typedef typename v::aarr aarr;
ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
ivec v_i1(1), v_i0(0), v_i_ntypes(ntypes), v_i_NEIGHMASK(NEIGHMASK);
fvec v_0_5(0.5);
int cache_idx = 0;
fvec vfkx_cache[N_CACHE];
fvec vfky_cache[N_CACHE];
fvec vfkz_cache[N_CACHE];
int k_cache[N_CACHE];
bvec vmask_cache[N_CACHE];
int kk_final_cache;
aarr fx, fy, fz, fw;
iarr ts;
bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
int w_i = x[i].w;
ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
ivec vw_j = v_i0;
v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
int cnumneigh_i = cnumneigh[i];
int numneigh_i = numneigh[i];
ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
avec vzeta = v::acc_zero();
avec vfxtmp = v::acc_zero(), vfytmp = v::acc_zero(), vfztmp = v::acc_zero();
avec vfjxtmp = v::acc_zero(), vfjytmp = v::acc_zero(), vfjztmp = v::acc_zero();
// Same structure as kernel_step, just simpler as the loops all iterate over
// the same k
int kk = 0;
for (; kk < numneigh_i && cache_idx < N_CACHE; kk++) {
int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
fvec vx_k(x[k].x);
fvec vy_k(x[k].y);
fvec vz_k(x[k].z);
int w_k = x[k].w;
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) {
fvec vzeta_contrib;
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
&vzeta_contrib);
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
vmask_cache[cache_idx] = veff_mask;
k_cache[cache_idx] = k;
cache_idx += 1;
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
}
}
kk_final_cache = kk;
for (; kk < numneigh_i; kk++) {
int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
fvec vx_k(x[k].x);
fvec vy_k(x[k].y);
fvec vz_k(x[k].z);
int w_k = x[k].w;
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) {
fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
}
}
fvec vfpair, vevdwl, vprefactor, vfwtmp;
force_zeta_vector(&c_outer[ntypes * w_i], vc_idx_j, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
avec vaprefactor(vprefactor);
vfxtmp = vfxtmp * vaprefactor + avec(vdx_ij * vfpair);
vfytmp = vfytmp * vaprefactor + avec(vdy_ij * vfpair);
vfztmp = vfztmp * vaprefactor + avec(vdz_ij * vfpair);
vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
if (EVFLAG) {
if (EFLAG) {
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
if (eatom) {
vfwtmp = v_0_5 * vevdwl;
v::store(fw, vfwtmp);
}
}
if (vflag == 1) {
*vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
*vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
*vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
*vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
*vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
*vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
}
}
while (cache_idx-- > 0) {
fvec vfkx = vprefactor * vfkx_cache[cache_idx];
fvec vfky = vprefactor * vfky_cache[cache_idx];
fvec vfkz = vprefactor * vfkz_cache[cache_idx];
int k = k_cache[cache_idx];
bvec veff_mask = vmask_cache[cache_idx];
f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
}
for (int kk = kk_final_cache; kk < numneigh_i; kk++) {
int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
fvec vx_k(x[k].x);
fvec vy_k(x[k].y);
fvec vz_k(x[k].z);
int w_k = x[k].w;
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) {
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<false>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,vprefactor,
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
0);
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
}
}
// TODO: This could be a scatter
v::acc_store(fx, vfjxtmp);
v::acc_store(fy, vfjytmp);
v::acc_store(fz, vfjztmp);
for (int t = 0; t < compress_idx; t++) {
int t_ = js[t];
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
if (EVFLAG && EFLAG && eatom) {
f[t_].w += fw[t];
}
}
f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
if (EVFLAG && EFLAG && eatom) {
// per-atom energy accumulates into the w component (cf. the j loop above)
f[i].w += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
}
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool EVFLAG, bool EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
int iito, int iifrom, int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const numneighhalf,
const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
) {
int compress_idx = 0;
int ii, jj;
iarr is, js;
avec vsevdwl = v::acc_zero();
avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
ivec vj, v_NEIGHMASK(NEIGHMASK);
bvec vmask_repulsive(0);
iarr repulsive_flag = {0};
// If you want to get the very most out of this, please uncomment.
// Consider getting a coffee or doing something else.
// Also good for heating.
//#pragma forceinline recursive
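// Walk the neighbor lists in scalar fashion and compress all (i,j) pairs that
// pass the outer cutoff into the is/js index arrays. Whenever VL pairs have
// been collected (or, without pack_i, when the j loop of the current i is
// exhausted), flush them to kernel_step()/kernel_step_const_i(); any leftover
// pairs from the packed path are flushed after the loop.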
for (ii = iifrom; ii < iito; ii++) {
// Right now this loop is scalar, to allow for the compiler to do
// its prefetching magic.
int i = ii;
int w_i = x[i].w;
flt_t x_i = x[i].x;
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int jlist_off_i = cnumneigh[i];
int jnum = numneigh[ii];
for (jj = 0; jj < jnum; jj++) {
int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
int w_j = x[j].w;
flt_t dx_ij = x[j].x - x_i;
flt_t dy_ij = x[j].y - y_i;
flt_t dz_ij = x[j].z - z_i;
flt_t rsq = dx_ij*dx_ij + dy_ij*dy_ij + dz_ij*dz_ij;
flt_t cutsq = c_outer[w_i * ntypes + w_j].cutsq;
if (rsq < cutsq) {
is[compress_idx] = ii;
js[compress_idx] = j;
if (jj < numneighhalf[i])
repulsive_flag[compress_idx] = 1;
compress_idx += 1;
}
if (pack_i) {
if (compress_idx == v::VL) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step<EVFLAG,EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
is, js, vmask_repulsive
);
compress_idx = 0;
v::int_clear_arr(repulsive_flag);
}
} else {
if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step_const_i<EVFLAG,EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
i, js, vmask_repulsive
);
compress_idx = 0;
v::int_clear_arr(repulsive_flag);
}
}
}
}
if (compress_idx > 0) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
is, js, vmask_repulsive
);
}
if (EVFLAG) {
if (EFLAG) {
*evdwl += v::acc_reduce_add(vsevdwl);
}
if (vflag == 1) {
*ov0 += v::acc_reduce_add(vsv0);
*ov1 += v::acc_reduce_add(vsv1);
*ov2 += v::acc_reduce_add(vsv2);
*ov3 += v::acc_reduce_add(vsv3);
*ov4 += v::acc_reduce_add(vsv4);
*ov5 += v::acc_reduce_add(vsv5);
}
}
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
const c_inner_t * param,
ivec xjw, bvec mask,
fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz
) {
fvec v_1_0(1.0);
fvec v_0_5(0.5);
fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = v::zero(), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = v::zero();
// TODO: Specialize on the number of species
v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
fvec vrik = sqrt(rsq2);
fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
fvec vhcth = vph - vcostheta;
fvec vgijk_a = vhcth * vhcth;
fvec vgijk = vpgamma * (v_1_0 + vpc2 * vgijk_a * v::recip(vpd2 * (vpd2 + vgijk_a)));
fvec varg1 = vplam3 * (vrij - vrik);
fvec varg3 = varg1 * varg1 * varg1;
bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
fvec varg = v::blend(mask_ex, varg1, varg3);
fvec vex_delr = v::min(fvec(1.e30), exp(varg));
bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
fvec vfc = v_1_0;
// It's kind of important to check the mask:
// some simulations never/rarely invoke this branch.
if (! v::mask_testz(vmask_need_sine)) {
vfc = v::blend(vmask_need_sine, vfc,
v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
}
return vgijk * vex_delr * vfc;
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
const c_outer_t * param,
ivec xjw,
bvec mask,
fvec vrij, fvec vzeta_ij,
fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
bvec vmask_repulsive
) {
fvec v_0_0(0.0);
fvec v_0_5(0.5);
fvec v_m0_5(-0.5);
fvec v_1_0(1.0);
fvec v_m1_0(-1.0);
fvec v_2_0(2.0);
fvec vpbigr = v::zero(), vpbigd = v::zero(), vplam1 = v::zero(), vpbiga = v::zero(), vplam2 = v::zero(), vpbeta = v::zero(), vpbigb = v::zero(), vppowern = v::zero();
v::gather_8(xjw, mask, &param[0].bigr, &vpbigr, &vpbigd, &vplam1, &vpbiga, &vplam2, &vpbeta, &vpbigb, &vppowern);
fvec vfccos;
// This is pretty much a literal translation.
bvec vmask_need_sine = v::cmpnle(vrij, vpbigr - vpbigd) & mask;
fvec vfc = v_1_0;
fvec vfc_d = v_0_0;
if (! v::mask_testz(vmask_need_sine)) {
fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
vfc = v::blend(vmask_need_sine, vfc,
v_0_5 * (v_1_0 - v::sincos(&vfccos, vtmp * (vrij - vpbigr))));
vfc_d = v::blend(vmask_need_sine, vfc_d, v_m0_5 * vtmp * vfccos);
}
fvec vpminus_lam2 = - vplam2;
fvec vpminus_bigb = -vpbigb;
fvec vexp = exp(vpminus_lam2 * vrij);
fvec vfa = vpminus_bigb * vexp * vfc;
fvec vfa_d = vpminus_lam2 * vfa + vpminus_bigb * vexp * vfc_d;
fvec vpc1 = v::zero(), vpc2 = v::zero(), vpc3 = v::zero(), vpc4 = v::zero();
v::gather_4(xjw, mask, &param[0].c1, &vpc1, &vpc2, &vpc3, &vpc4);
fvec vpminus_powern = - vppowern;
fvec vbij(0.), vbij_d(0.);
fvec vtmp = vpbeta * vzeta_ij;
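// The masked branches below mirror the piecewise evaluation of the bond order
// b_ij = (1 + (beta*zeta)^n)^(-1/(2n)) in the scalar ters_bij()/ters_bij_d():
// asymptotic forms for very large or very small beta*zeta, and the full
// power-law expression only in the intermediate range (vmc5).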
bvec vmc1 = v::cmple(vpc1, vtmp) & mask;
if (! v::mask_testz(vmc1)) {
vbij = v::invsqrt(vtmp);
vbij_d = vpbeta * v_m0_5 * vbij * v::recip(vtmp);
}
bvec vmc2 = v::cmple(vpc2, vtmp) & ~ vmc1 & mask;
if (! v::mask_testz(vmc2)) {
fvec vpowminus_powern = pow(vtmp, vpminus_powern);
fvec vinvsqrt = v::invsqrt(vtmp);
fvec vrcp2powern = v::recip(v_2_0 * vppowern);
fvec va = (v_1_0 - vpowminus_powern * vrcp2powern) * vinvsqrt;
fvec va_d = vpbeta * v_m0_5 * vinvsqrt * v::recip(vtmp) *
(v_1_0 + v_m0_5 * vpowminus_powern * (v_1_0 + vrcp2powern));
vbij = v::blend(vmc2, vbij, va);
vbij_d = v::blend(vmc2, vbij_d, va_d);
}
bvec vmc3 = v::cmplt(vtmp, vpc4) & ~vmc2 & ~vmc1 & mask;
if (! v::mask_testz(vmc3)) {
vbij = v::blend(vmc3, vbij, v_1_0);
vbij_d = v::blend(vmc3, vbij_d, v_0_0);
}
bvec vmc4 = v::cmple(vtmp, vpc3) & ~vmc3 & ~vmc2 & ~ vmc1 & mask;
if (! v::mask_testz(vmc4)) {
fvec vpowm1 = pow(vtmp, vppowern - v_1_0);
fvec vrcp2powern = v::recip(v_2_0 * vppowern);
fvec va = v_1_0 - vtmp * vrcp2powern * vpowm1;
fvec va_d = v_m0_5 * vpbeta * vpowm1;
vbij = v::blend(vmc4, vbij, va);
vbij_d = v::blend(vmc4, vbij_d, va_d);
}
bvec vmc5 = mask & ~vmc1 & ~vmc2 & ~vmc3 & ~vmc4;
if (! v::mask_testz(vmc5)) {
fvec vtmp_n = pow(vtmp, vppowern);
fvec vpow2 = pow(v_1_0 + vtmp_n, v_m1_0 - v::recip(v_2_0 * vppowern));
fvec va = (v_1_0 + vtmp_n) * vpow2;
fvec va_d = v_m0_5 * vpow2 * vtmp_n * v::recip(vzeta_ij);
vbij = v::blend(vmc5, vbij, va);
vbij_d = v::blend(vmc5, vbij_d, va_d);
}
fvec vtmp_exp = exp(-vplam1 * vrij);
fvec vrep_fforce = vpbiga * vtmp_exp * (vfc_d - vfc * vplam1);
fvec vfz_fforce = v_0_5 * vbij * vfa_d;
*vfpair = v::mask_add(vfz_fforce, vmask_repulsive, vfz_fforce, vrep_fforce) * v::recip(vrij);
*vprefactor = v_m0_5 * vfa * vbij_d;
if (EVDWL) {
fvec vrep_eng = vfc * vpbiga * vtmp_exp;
fvec vfz_eng = v_0_5 * vfa * vbij;
*vevdwl = v::mask_add(vfz_eng, vmask_repulsive, vfz_eng, vrep_eng);
}
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool ZETA>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
const c_inner_t * param,
ivec xjw,
bvec mask,
fvec vprefactor,
fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz,
fvec *fix, fvec *fiy, fvec *fiz,
fvec *fjx, fvec *fjy, fvec *fjz,
fvec *fkx, fvec *fky, fvec *fkz,
fvec *zeta
) {
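// Vectorized analogue of the scalar attractive()/ters_zetaterm_d() path: the
// derivatives of fC(r_ik) * g(theta_ijk) * exp_delr with respect to the
// positions of i, j and k are computed and scaled by vprefactor; with ZETA the
// zeta contribution itself is returned as well (used by the first k pass).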
fvec v_1_0 = fvec(1.0);
fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = fvec(1.0), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = fvec(1.0);
v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
fvec vrijinv = v::recip(vrij);
fvec vrij_hatx = vrijinv * vdijx;
fvec vrij_haty = vrijinv * vdijy;
fvec vrij_hatz = vrijinv * vdijz;
fvec rikinv = invsqrt(rsq2);
fvec rik_hatx = rikinv * dikx;
fvec rik_haty = rikinv * diky;
fvec rik_hatz = rikinv * dikz;
fvec vrik = sqrt(rsq2);
fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
fvec vhcth = vph - vcostheta;
fvec vdenominator = v::recip(vpd2 + vhcth * vhcth);
fvec vgijk = vpgamma * (v_1_0 + vpc2 * v::recip(vpd2) - vpc2 * vdenominator);
fvec vnumerator = fvec(-2.) * vpc2 * vhcth;
fvec vgijk_d = vpgamma * vnumerator * vdenominator * vdenominator;
fvec varg1 = vplam3 * (vrij - vrik);
fvec varg3 = varg1 * varg1 * varg1;
bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
fvec varg = v::blend(mask_ex, varg1, varg3);
fvec vex_delr = min(fvec(1.e30), exp(varg));
// the exponent-dependent derivative factor follows mask_ex (powermint == 3)
fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
fvec vfccos;
fvec vfc = v_1_0;
fvec vfc_d = v::zero();
if (! v::mask_testz(vmask_need_sine)) {
fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
vfc = v::blend(vmask_need_sine, vfc,
fvec(0.5) * (v_1_0 - v::sincos(&vfccos, vtmp * (vrik - vpbigr))));
vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
}
fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
if (ZETA) *zeta = vfc * vgijk * vex_delr;
fvec vminus_costheta = - vcostheta;
fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
*fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
*fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
*fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
*fjx = vprefactor * (vzeta_d_gijk * vdcosdrjx + vzeta_d_ex_delr * vrij_hatx);
*fjy = vprefactor * (vzeta_d_gijk * vdcosdrjy + vzeta_d_ex_delr * vrij_haty);
*fjz = vprefactor * (vzeta_d_gijk * vdcosdrjz + vzeta_d_ex_delr * vrij_hatz);
*fkx = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatx + vzeta_d_gijk * vdcosdrkx);
*fky = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_haty + vzeta_d_gijk * vdcosdrky);
*fkz = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatz + vzeta_d_gijk * vdcosdrkz);
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#endif