Page MenuHomec4science

pair_airebo_intel.cpp
No OneTemporary

File Metadata

Created
Fri, Jul 5, 08:49

pair_airebo_intel.cpp

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Hohnerbach (RWTH)
------------------------------------------------------------------------- */
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <stddef.h>
#include "lmptype.h"
#include "intel_preprocess.h"
#include "intel_intrinsics_airebo.h"
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include <omp.h>
#include <string.h>
#include "pair_airebo_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
#include "group.h"
#include "kspace.h"
#include "modify.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
template<typename flt_t, typename acc_t>
struct LAMMPS_NS::PairAIREBOIntelParam {
flt_t cutlj, cutljrebosq, cut3rebo;
flt_t sigmin, sigcut;
flt_t cutljsq[2][2];
flt_t lj1[2][2], lj2[2][2], lj3[2][2], lj4[2][2];
flt_t smin, Nmin, Nmax, NCmin, NCmax, thmin, thmax;
flt_t rcmin[2][2], rcmax[2][2], rcmaxsq[2][2], rcmaxp[2][2];
flt_t Q[2][2], alpha[2][2], A[2][2], rho[2][2], BIJc[2][2][3],
Beta[2][2][3];
flt_t rcLJmin[2][2], rcLJmax[2][2], rcLJmaxsq[2][2], bLJmin[2][2],
bLJmax[2][2];
flt_t epsilon[2][2], sigma[2][2], epsilonT[2][2];
// spline coefficients
flt_t gCdom[5], gC1[4][6], gC2[4][6], gHdom[4], gH[3][6];
flt_t gDom[5+4];
flt_t gVal[(4+4+3)*6];
flt_t pCCdom[2][2], pCHdom[2][2], pCC[4][4][16], pCH[4][4][16];
flt_t piCCdom[3][2], piCHdom[3][2], piHHdom[3][2];
acc_t piCC[4][4][9][64], piCH[4][4][9][64], piHH[4][4][9][64];
flt_t Tijdom[3][2];
acc_t Tijc[4][4][9][64];
// spline knot values
flt_t PCCf[5][5], PCCdfdx[5][5], PCCdfdy[5][5], PCHf[5][5];
flt_t PCHdfdx[5][5], PCHdfdy[5][5];
flt_t piCCf[5][5][11], piCCdfdx[5][5][11];
flt_t piCCdfdy[5][5][11], piCCdfdz[5][5][11];
flt_t piCHf[5][5][11], piCHdfdx[5][5][11];
flt_t piCHdfdy[5][5][11], piCHdfdz[5][5][11];
flt_t piHHf[5][5][11], piHHdfdx[5][5][11];
flt_t piHHdfdy[5][5][11], piHHdfdz[5][5][11];
flt_t Tf[5][5][10], Tdfdx[5][5][10], Tdfdy[5][5][10], Tdfdz[5][5][10];
};
namespace {
struct NeighListAIREBO {
int * num; /* num_all */
int * num_half; /* num_all */
int * offset; /* num_all */
int * entries; /* num_all * num_neighs_per_atom */
};
template<typename flt_t>
struct AtomAIREBOT {
flt_t x, y, z;
int w;
};
template<typename acc_t>
struct ResultForceT {
acc_t x, y, z, w;
};
template<typename flt_t, typename acc_t>
struct KernelArgsAIREBOT {
int num_local;
int num_all;
int num_neighs_per_atom;
int num_types;
int frebo_from_atom, frebo_to_atom;
int neigh_from_atom, neigh_to_atom;
int rebuild_flag;
flt_t skin;
struct NeighListAIREBO neigh_lmp;
struct NeighListAIREBO neigh_rebo;
PairAIREBOIntelParam<flt_t,acc_t> params;
struct AtomAIREBOT<flt_t> * x; /* num_all */
int * tag; /* num_all */
flt_t * nC, * nH; /* num_all */
int * map; /* num_types+1 */
struct ResultForceT<acc_t> * result_f; /* num_all */
acc_t result_eng;
};
template<typename flt_t, typename acc_t>
void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag);
template<typename flt_t, typename acc_t>
void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka);
template<typename flt_t, typename acc_t>
void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag);
}
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
/* ---------------------------------------------------------------------- */
PairAIREBOIntel::PairAIREBOIntel(LAMMPS *lmp) : PairAIREBO(lmp)
{
suffix_flag |= Suffix::INTEL;
REBO_cnumneigh = NULL;
REBO_num_skin = NULL;
REBO_list_data = NULL;
fix = NULL;
}
/* ---------------------------------------------------------------------- */
PairAIREBOIntel::~PairAIREBOIntel()
{
memory->destroy(REBO_cnumneigh);
memory->destroy(REBO_num_skin);
memory->destroy(REBO_list_data);
}
/* ---------------------------------------------------------------------- */
void PairAIREBOIntel::init_style()
{
PairAIREBO::init_style();
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
pack_force_const(fix->get_mixed_buffers());
fix->get_mixed_buffers()->need_tag(1);
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
pack_force_const(fix->get_double_buffers());
fix->get_double_buffers()->need_tag(1);
} else {
pack_force_const(fix->get_single_buffers());
fix->get_single_buffers()->need_tag(1);
}
#ifdef _LMP_INTEL_OFFLOAD
if (fix->offload_noghost())
error->all(FLERR,"The 'ghost no' option cannot be used with airebo/intel.");
#endif
}
/* ---------------------------------------------------------------------- */
template<typename T>
T * calloc_it(size_t size) {
return static_cast<T*>(calloc(size, sizeof(T)));
}
void PairAIREBOIntel::compute(int eflag, int vflag)
{
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers());
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers());
else
compute<float,float>(eflag, vflag, fix->get_single_buffers());
fix->balance_stamp();
vflag_fdotr = 0;
}
/* ---------------------------------------------------------------------- */
template<class flt_t, class acc_t>
PairAIREBOIntelParam<flt_t,acc_t> PairAIREBOIntel::get_param()
{
PairAIREBOIntelParam<flt_t,acc_t> fc;
#define A(a) \
for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \
reinterpret_cast<flt_t*>(&fc.a)[i] = \
reinterpret_cast<double*>(&this->a)[i]; \
}
#define A0(a) \
for (int i = 0; i < sizeof(fc.a)/sizeof(flt_t); i++) { \
reinterpret_cast<flt_t*>(&fc.a)[i] = \
reinterpret_cast<double*>(this->a[0])[i]; \
}
#define B(a) \
for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \
reinterpret_cast<acc_t*>(&fc.a)[i] = \
reinterpret_cast<double*>(&this->a)[i]; \
}
A(cutlj) A(cutljrebosq) A(cut3rebo) A(sigmin);
A(sigcut) A0(cutljsq) A0(lj1) A0(lj2) A0(lj3);
A0(lj4) A(smin) A(Nmin) A(Nmax) A(NCmin) A(NCmax) A(thmin) A(thmax);
A(rcmin) A(rcmax) A(rcmaxsq) A(rcmaxp) A(Q) A(alpha) A(A) A(rho) A(BIJc);
A(Beta) A(rcLJmin) A(rcLJmax) A(rcLJmaxsq) A(bLJmin) A(bLJmax) A(epsilon);
A(sigma) A(epsilonT) A(gCdom) A(gC1) A(gC2) A(gHdom) A(gH) A(pCCdom);
A(pCHdom) A(pCC) A(pCH) A(piCCdom) A(piCHdom) A(piHHdom) B(piCC);
B(piCH) B(piHH) A(Tijdom) B(Tijc) A(PCCf) A(PCCdfdx) A(PCCdfdy) A(PCHf);
A(PCHdfdx) A(PCHdfdy) A(piCCf) A(piCCdfdx) A(piCCdfdy) A(piCCdfdz);
A(piCHf) A(piCHdfdx) A(piCHdfdy) A(piCHdfdz) A(piHHf) A(piHHdfdx);
A(piHHdfdy) A(piHHdfdz) A(Tf) A(Tdfdx) A(Tdfdy) A(Tdfdz);
#undef A
#undef A0
#undef B
for (int i = 0; i < 5; i++) fc.gDom[i] = fc.gCdom[i];
for (int i = 0; i < 4; i++) fc.gDom[5+i] = fc.gHdom[i];
for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++)
fc.gVal[6*i+j] = fc.gC1[i][j];
for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++)
fc.gVal[4*6+6*i+j] = fc.gC2[i][j];
for (int i = 0; i < 3; i++) for (int j = 0; j < 6; j++)
fc.gVal[8*6+6*i+j] = fc.gH[i][j];
return fc;
}
/* ---------------------------------------------------------------------- */
template<class flt_t, class acc_t>
void PairAIREBOIntel::compute(
int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers
) {
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = vflag_fdotr = vflag_atom = 0;
pvector[0] = pvector[1] = pvector[2] = 0.0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
int packthreads;
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
else packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
if (atom->nmax > maxlocal) {
#ifdef LMP_INTEL_OFFLOAD
if (maxlocal > 0 && _cop >= 0) {
int * const REBO_numneigh = this->REBO_numneigh;
int * const REBO_num_skin = this->REBO_num_skin;
int * const REBO_cnumneigh = this->REBO_cnumneigh;
int * const REBO_list_data = this->REBO_list_data;
double * const nC = this->nC;
double * const nH = this->nH;
#pragma offload_transfer target(mic:_cop) \
nocopy(REBO_numneigh: alloc_if(0) free_if(1)) \
nocopy(REBO_cnumneigh: alloc_if(0) free_if(1)) \
nocopy(REBO_num_skin: alloc_if(0) free_if(1)) \
nocopy(REBO_list_data: alloc_if(0) free_if(1)) \
nocopy(nH: alloc_if(0) free_if(1)) \
nocopy(nC: alloc_if(0) free_if(1))
}
#endif
maxlocal = atom->nmax;
memory->destroy(REBO_numneigh);
memory->destroy(REBO_cnumneigh);
memory->destroy(REBO_list_data);
memory->sfree(REBO_firstneigh);
memory->destroy(nC);
memory->destroy(nH);
memory->create(REBO_numneigh,maxlocal,"AIREBO:numneigh");
memory->create(REBO_cnumneigh,maxlocal,"AIREBO:cnumneigh");
memory->create(REBO_num_skin,maxlocal,"AIREBO:cnumneigh");
int max_nbors = buffers->get_max_nbors();
memory->create(REBO_list_data,maxlocal * max_nbors,"AIREBO:list_data");
REBO_firstneigh = (int **) memory->smalloc(maxlocal*sizeof(int *),
"AIREBO:firstneigh");
memory->create(nC,maxlocal,"AIREBO:nC");
memory->create(nH,maxlocal,"AIREBO:nH");
#ifdef _LMP_INTEL_OFFLOAD
if (_cop >= 0) {
int * const REBO_numneigh = this->REBO_numneigh;
int * const REBO_num_skin = this->REBO_num_skin;
int * const REBO_cnumneigh = this->REBO_cnumneigh;
int * const REBO_list_data = this->REBO_list_data;
double * const nC = this->nC;
double * const nH = this->nH;
const int mnml = max_nbors * maxlocal;
#pragma offload_transfer target(mic:_cop) \
nocopy(REBO_numneigh: length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(REBO_cnumneigh:length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(REBO_num_skin: length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(REBO_list_data:length(mnml) alloc_if(1) free_if(0)) \
nocopy(nH: length(maxlocal) alloc_if(1) free_if(0)) \
nocopy(nC: length(maxlocal) alloc_if(1) free_if(0))
}
#endif
}
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
eval<1,1>(1, ovflag, buffers, 0, offload_end);
eval<1,1>(0, ovflag, buffers, host_start, inum);
} else {
eval<1,0>(1, ovflag, buffers, 0, offload_end);
eval<1,0>(0, ovflag, buffers, host_start, inum);
}
} else {
eval<0,0>(1, 0, buffers, 0, offload_end);
eval<0,0>(0, 0, buffers, host_start, inum);
}
}
/* ---------------------------------------------------------------------- */
template<int EVFLAG, int EFLAG, class flt_t, class acc_t>
void PairAIREBOIntel::eval(
const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers,
const int astart, const int aend
) {
const int inum = aend - astart;
if (inum == 0) {
return;
}
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
int * const tag = atom->tag;
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, 1 /*NEWTON_PAIR*/, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
const double skin = neighbor->skin;
const int max_nbor = buffers->get_max_nbors();
const PairAIREBOIntelParam<flt_t,acc_t> param = get_param<flt_t,acc_t>();
// offload here
#ifdef _LMP_INTEL_OFFLOAD
int *overflow = fix->get_off_overflow_flag();
double *timer_compute = fix->off_watch_pair();
int * const REBO_numneigh = this->REBO_numneigh;
int * const REBO_num_skin = this->REBO_num_skin;
int * const REBO_cnumneigh = this->REBO_cnumneigh;
int * const REBO_list_data = this->REBO_list_data;
double * const nC = this->nC;
double * const nH = this->nH;
const int torflag = this->torflag;
const int ljflag = this->ljflag;
const int morseflag = this->morseflag;
int * const map = this->map;
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
#pragma offload target(mic:_cop) if(offload) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
in(f_stride,nlocal,minlocal,separate_flag,offload) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
in(param,skin,max_nbor) \
in(tag: length(0) alloc_if(0) free_if(0)) \
in(torflag, ljflag, morseflag, ago) \
in(nC: length(0) alloc_if(0) free_if(0)) \
in(nH: length(0) alloc_if(0) free_if(0)) \
in(REBO_numneigh: length(0) alloc_if(0) free_if(0)) \
in(REBO_cnumneigh: length(0) alloc_if(0) free_if(0)) \
in(REBO_num_skin: length(0) alloc_if(0) free_if(0)) \
in(REBO_list_data: length(0) alloc_if(0) free_if(0)) \
in(map: length(0) alloc_if(0) free_if(0)) \
signal(f_start)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
IP_PRE_repack_for_offload(1 /*NEWTON_PAIR*/, separate_flag, nlocal, nall,
f_stride, x, 0/*q*/);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
int neigh_iifrom, neigh_iito;
IP_PRE_omp_range(neigh_iifrom, neigh_iito, tid, nall, nthreads);
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
KernelArgsAIREBOT<flt_t,acc_t> args;
args.num_local = nlocal;
args.num_all = nall;
args.num_neighs_per_atom = max_nbor;
args.num_types = ntypes;
args.frebo_from_atom = 0;
args.frebo_to_atom = args.num_local;
args.neigh_from_atom = 0;
args.neigh_to_atom = args.num_all;
args.rebuild_flag = ago == 0;
args.skin = skin;
args.neigh_lmp.num = const_cast<int*>(numneigh);
args.neigh_lmp.num_half = const_cast<int*>(numneighhalf);
args.neigh_lmp.offset = const_cast<int*>(cnumneigh);
args.neigh_lmp.entries = const_cast<int*>(firstneigh);
args.neigh_rebo.num = REBO_numneigh;
args.neigh_rebo.num_half = REBO_num_skin;
args.neigh_rebo.offset = REBO_cnumneigh;
args.neigh_rebo.entries = REBO_list_data;
args.params = param;
args.tag = tag;
args.nC = reinterpret_cast<flt_t*>(nC);
args.nH = reinterpret_cast<flt_t*>(nH);
args.map = map;
args.result_eng = 0;
args.x = (AtomAIREBOT<flt_t>*) x;
args.result_f = (ResultForceT<acc_t> *) f;
args.neigh_from_atom = neigh_iifrom;
args.neigh_to_atom = neigh_iito;
args.frebo_from_atom = iifrom;
args.frebo_to_atom = iito;
aut_rebo_neigh(&args);
#if defined(_OPENMP)
#pragma omp barrier
#endif
aut_frebo(&args, torflag);
if (ljflag) aut_lennard_jones(&args, morseflag);
oevdwl += args.result_eng;
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, x,
offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
} // end of omp parallel region
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end of offload region
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EVFLAG)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ---------------------------------------------------------------------- */
template<class flt_t, class acc_t>
void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
int tp1 = atom->ntypes + 1;
buffers->set_ntypes(tp1,1);
flt_t **cutneighsq = buffers->get_cutneighsq();
flt_t **cutneighghostsq = buffers->get_cutneighghostsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
cut = cutghost[i][j] + neighbor->skin;
cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut;
}
}
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int tp1sq = tp1 * tp1;
// TODO the lifecycle of "map" is currently not 100% correct
// it might not be freed if this method is called more than once
int * map = this->map;
#pragma offload_transfer target(mic:_cop) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
in(map: length(tp1) alloc_if(1) free_if(0))
#endif
}
/* ----------------------------------------------------------------------
Implementation
---------------------------------------------------------------------- */
namespace {
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
namespace overloaded {
double sqrt(double a) { return ::sqrt(a); }
float sqrt(float a) { return ::sqrtf(a); }
double sin(double a) { return ::sin(a); }
float sin(float a) { return ::sinf(a); }
double cos(double a) { return ::cos(a); }
float cos(float a) { return ::cosf(a); }
double exp(double a) { return ::exp(a); }
float exp(float a) { return ::expf(a); }
double pow(double a, double b) { return ::pow(a, b); }
float pow(float a, float b) { return ::powf(a, b); }
}
/* ----------------------------------------------------------------------
Scalar AIREBO implementation, standalone, with massive code reuse
compared to original code.
---------------------------------------------------------------------- */
#define M_PI 3.14159265358979323846 /* pi */
#define CARBON 0
#define HYDROGEN 1
#define TOL 1.0e-9
template<typename T>
inline T fmin_nonan(T a, T b) {
return a < b ? a : b;
}
template<typename T>
inline T fmax_nonan(T a, T b) {
return a > b ? a : b;
}
template<typename flt_t>
inline flt_t Sp(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
flt_t t = (r - lo) / (hi - lo);
if (t <= 0) {
if (del) *del = 0;
return 1;
} else if (t >= 1) {
if (del) *del = 0;
return 0;
} else {
t *= static_cast<flt_t>(M_PI);
if (del) *del = static_cast<flt_t>(-0.5 * M_PI)
* overloaded::sin(t) / (hi - lo);
return static_cast<flt_t>(0.5) * (1 + overloaded::cos(t));
}
}
template<typename flt_t>
inline flt_t Sp2(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
flt_t t = (r - lo) / (hi - lo);
if (t <= 0) {
if (del) *del = 0;
return 1;
} else if (t >= 1) {
if (del) *del = 0;
return 0;
} else {
if (del) *del = 6 * (t * t - t) / (hi - lo);
return 1 - t * t * (3 - 2 * t);
}
}
template<typename flt_t>
inline flt_t eval_poly_lin(int n, flt_t * coeffs, flt_t x, flt_t * deriv) {
flt_t result = coeffs[n - 1];
*deriv = coeffs[n - 1] * (n - 1);
for (int i = n - 2; i > 0; i--) {
result = coeffs[i] + x * result;
*deriv = coeffs[i] * i + x * (*deriv);
}
result = coeffs[0] + x * result;
return result;
}
template<typename flt_t, typename acc_t>
inline flt_t gSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, flt_t cos, flt_t N, flt_t * dgdc, flt_t * dgdN) {
flt_t NCmin = ka->params.NCmin;
flt_t NCmax = ka->params.NCmax;
int index = 0;
flt_t * gDom = NULL;
int nDom = 0;
int offs = 0;
if (itype == 0) {
nDom = 4;
gDom = &ka->params.gCdom[0];
if (N > NCmin) offs = 4 * 6;
} else {
nDom = 3;
gDom = &ka->params.gHdom[0];
offs = 8 * 6;
}
cos = fmax_nonan(gDom[0], fmin_nonan(gDom[nDom], cos));
int i;
for (i = 0; i < nDom; i++) {
if (cos >= gDom[i] && cos <= gDom[i + 1]) {
index = i;
}
}
flt_t g = eval_poly_lin(6, &ka->params.gVal[offs+index*6], cos, dgdc);
*dgdN = 0;
if (itype == 0 && N > NCmin && N < NCmax) {
flt_t dg1;
flt_t g1 = eval_poly_lin(6, &ka->params.gVal[index*6], cos, &dg1);
flt_t dS;
flt_t cut = Sp(N, NCmin, NCmax, &dS);
*dgdN = dS * (g1 - g);
g = g + cut * (g1 - g);
*dgdc = *dgdc + cut * (dg1 - *dgdc);
}
return g;
}
template<typename flt_t>
inline flt_t eval_poly_bi(int n, flt_t * coeffs, flt_t x, flt_t y,
flt_t * deriv) {
flt_t dy;
flt_t vy = eval_poly_lin(n, &coeffs[n * (n - 1)], y, &dy);
flt_t result = vy;
deriv[0] = vy * (n - 1);
deriv[1] = dy;
for (int i = n - 2; i > 0; i--) {
vy = eval_poly_lin(n, &coeffs[n * i], y, &dy);
result = vy + x * result;
deriv[0] = vy * i + x * deriv[0];
deriv[1] = dy + x * deriv[1];
}
result = eval_poly_lin(n, &coeffs[0], y, &dy) + x * result;
deriv[1] = dy + x * deriv[1];
return result;
}
template<typename flt_t>
inline flt_t eval_poly_tri(int n, flt_t * coeffs, flt_t x, flt_t y, flt_t z,
flt_t * deriv) {
flt_t dyz[2];
flt_t vyz = eval_poly_bi(n, &coeffs[n * n * (n - 1)], y, z, &dyz[0]);
flt_t result = vyz;
deriv[0] = vyz * (n - 1);
deriv[1] = dyz[0];
deriv[2] = dyz[1];
for (int i = n - 2; i > 0; i--) {
vyz = eval_poly_bi(n, &coeffs[n * n * i], y, z, &dyz[0]);
result = vyz + x * result;
deriv[0] = vyz * i + x * deriv[0];
deriv[1] = dyz[0] + x * deriv[1];
deriv[2] = dyz[1] + x * deriv[2];
}
result = eval_poly_bi(n, &coeffs[0], y, z, &dyz[0]) + x * result;
deriv[1] = dyz[0] + x * deriv[1];
deriv[2] = dyz[1] + x * deriv[2];
return result;
}
template<typename flt_t, typename acc_t>
inline flt_t PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, flt_t NC, flt_t NH, flt_t * dN) {
dN[0] = 0.0;
dN[1] = 0.0;
if (itype == HYDROGEN) return 0;
flt_t *pCJdom = jtype == CARBON ? &ka->params.pCCdom[0][0] :
&ka->params.pCHdom[0][0];
NC = fmax_nonan(pCJdom[0], fmin_nonan(pCJdom[1], NC));
NH = fmax_nonan(pCJdom[2], fmin_nonan(pCJdom[3], NH));
int nC = floor(NC);
int nH = floor(NH);
#define PijSelect(a, b) (jtype == CARBON ? ka->params.a : ka->params.b)
if (fabs(NC - nC) < TOL && fabs(NH - nH) < TOL) {
dN[0] = PijSelect(PCCdfdx, PCHdfdx)[nC][nH];
dN[1] = PijSelect(PCCdfdy, PCHdfdy)[nC][nH];
return PijSelect(PCCf, PCHf)[nC][nH];
}
if (NC == pCJdom[1]) nC -= 1;
if (NH == pCJdom[3]) nH -= 1;
return eval_poly_bi(4, &PijSelect(pCC, pCH)[nC][nH][0], NC, NH, dN);
#undef PijSelect
}
template<typename flt_t, typename acc_t>
inline flt_t TijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, flt_t Nij,
flt_t Nji, flt_t Nijconj, acc_t * dN3) {
flt_t * Tijdom = &ka->params.Tijdom[0][0];
Nij = fmax_nonan(Tijdom[0], fmin_nonan(Tijdom[1], Nij));
Nji = fmax_nonan(Tijdom[2], fmin_nonan(Tijdom[3], Nji));
Nijconj = fmax_nonan(Tijdom[4], fmin_nonan(Tijdom[5], Nijconj));
int nij = floor(Nij);
int nji = floor(Nji);
int nijconj = floor(Nijconj);
if (fabs(Nij - nij) < TOL && fabs(Nji - nji) <
TOL && fabs(Nijconj - nijconj) < TOL) {
dN3[0] = ka->params.Tdfdx[nij][nji][nijconj];
dN3[1] = ka->params.Tdfdy[nij][nji][nijconj];
dN3[2] = ka->params.Tdfdz[nij][nji][nijconj];
return ka->params.Tf[nij][nji][nijconj];
}
if (Nij == Tijdom[1]) nij -= 1;
if (Nji == Tijdom[3]) nji -= 1;
if (Nijconj == Tijdom[5]) nijconj -= 1;
return eval_poly_tri<acc_t>(4, &ka->params.Tijc[nij][nji][nijconj][0], Nij,
Nji, Nijconj, dN3);
}
template<typename flt_t, typename acc_t>
inline flt_t piRCSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, acc_t * dN3) {
const int HH = 2;
const int CH = 1;
/* const int CC = 0; */
int select = itype + jtype;
#define piRCSelect(a, b, c) (select == HH ? ka->params.a : select == CH ? \
ka->params.b : ka->params.c)
flt_t * piIJdom = &piRCSelect(piHHdom, piCHdom, piCCdom)[0][0];
if (select == HH) {
if (Nij < piIJdom[0] || Nij > piIJdom[1] || Nji < piIJdom[2] ||
Nji > piIJdom[3] || Nijconj < piIJdom[4] || Nijconj > piIJdom[5]) {
Nij = 0;
Nji = 0;
Nijconj = 0;
}
}
Nij = fmax_nonan(piIJdom[0], fmin_nonan(piIJdom[1], Nij));
Nji = fmax_nonan(piIJdom[2], fmin_nonan(piIJdom[3], Nji));
Nijconj = fmax_nonan(piIJdom[4], fmin_nonan(piIJdom[5], Nijconj));
int nij = floor(Nij);
int nji = floor(Nji);
int nijconj = floor(Nijconj);
if (fabs(Nij - nij) < TOL && fabs(Nji - nji) <
TOL && fabs(Nijconj - nijconj) < TOL) {
dN3[0] = piRCSelect(piHHdfdx, piCHdfdx, piCCdfdx)[nij][nji][nijconj];
dN3[1] = piRCSelect(piHHdfdy, piCHdfdy, piCCdfdy)[nij][nji][nijconj];
dN3[2] = piRCSelect(piHHdfdz, piCHdfdz, piCCdfdz)[nij][nji][nijconj];
return piRCSelect(piHHf, piCHf, piCCf)[nij][nji][nijconj];
}
if (Nij == piIJdom[1]) nij -= 1;
if (Nji == piIJdom[3]) nji -= 1;
if (Nijconj == piIJdom[5]) nijconj -= 1;
return eval_poly_tri<acc_t>(4,
&piRCSelect(piHH, piCH, piCC)[nij][nji][nijconj][0], Nij, Nji, Nijconj,
dN3);
#undef piRCSelect
}
/*
* Implements the p_ij term in airebo, which occurs on 4 different occasions
* in the original lammps code.
*/
template<typename flt_t, typename acc_t>
inline flt_t frebo_pij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j,
flt_t rijx, flt_t rijy, flt_t rijz, flt_t rijmag, flt_t wij, flt_t VA,
flt_t * sum_N, acc_t fij[3]) {
ResultForceT<acc_t> * result_f = ka->result_f;
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
flt_t * nC = ka->nC;
flt_t * nH = ka->nH;
flt_t x_i = x[i].x;
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int itype = map[x[i].w];
int jtype = map[x[j].w];
flt_t invrijm = 1 / rijmag;
flt_t invrijm2 = invrijm * invrijm;
flt_t rcminij = ka->params.rcmin[itype][jtype];
flt_t rcmaxij = ka->params.rcmax[itype][jtype];
flt_t Nmin = ka->params.Nmin;
flt_t Nmax = ka->params.Nmax;
flt_t Nij = nC[i] + nH[i] - wij;
flt_t NijC = nC[i] - wij * (1 - jtype);
flt_t NijH = nH[i] - wij * jtype;
flt_t sum_pij = 0;
flt_t sum_dpij_dN = 0;
flt_t dN2[2] = {0};
flt_t pij = 0;
*sum_N = 0;
int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
int pass;
for (pass = 0; pass < 2; pass++) {
int kk;
int knum = ka->neigh_rebo.num[i];
for (kk = 0; kk < knum; kk++) {
int k = neighs[kk];
if (k == j) continue;
flt_t rikx = x_i - x[k].x;
flt_t riky = y_i - x[k].y;
flt_t rikz = z_i - x[k].z;
int ktype = map[x[k].w];
flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz);
flt_t rho_k = ka->params.rho[ktype][1];
flt_t rho_j = ka->params.rho[jtype][1];
flt_t lamdajik = 4 * itype * ((rho_k - rikmag) - (rho_j - rijmag));
flt_t ex_lam = exp(lamdajik);
flt_t rcminik = ka->params.rcmin[itype][ktype];
flt_t rcmaxik = ka->params.rcmax[itype][ktype];
flt_t dwik;
flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik);
flt_t Nki = nC[k] + nH[k] - wik;
flt_t cosjik = (rijx * rikx + rijy * riky + rijz * rikz) /
(rijmag * rikmag);
cosjik = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cosjik));
flt_t dgdc, dgdN;
flt_t g = gSpline(ka, itype, cosjik, Nij, &dgdc, &dgdN);
if (pass == 0) {
sum_pij += wik * g * ex_lam;
sum_dpij_dN += wik * dgdN * ex_lam;
flt_t cutN = Sp<flt_t>(Nki, Nmin, Nmax, NULL);
*sum_N += (1 - ktype) * wik * cutN;
} else {
flt_t tmp = -0.5 * pij * pij * pij;
flt_t invrikm = 1 / rikmag;
flt_t rjkx = rikx - rijx;
flt_t rjky = riky - rijy;
flt_t rjkz = rikz - rijz;
flt_t rjkmag = sqrt(rjkx * rjkx + rjky * rjky + rjkz * rjkz);
flt_t rijrik = 2 * rijmag * rikmag;
flt_t rr = rijmag * rijmag - rikmag * rikmag;
flt_t dctdjk = -2 / rijrik;
flt_t dctdik = (-rr + rjkmag * rjkmag) / (rijrik * rikmag * rikmag);
flt_t dctdij = (rr + rjkmag * rjkmag) / (rijrik * rijmag * rijmag);
acc_t fi[3], fj[3], fk[3];
flt_t pref = 0.5 * VA * tmp;
flt_t tmp20 = pref * wik * dgdc * ex_lam;
fj[0] = fj[1] = fj[2] = 0;
fi[0] = -tmp20 * dctdik * rikx;
fi[1] = -tmp20 * dctdik * riky;
fi[2] = -tmp20 * dctdik * rikz;
fk[0] = tmp20 * dctdik * rikx;
fk[1] = tmp20 * dctdik * riky;
fk[2] = tmp20 * dctdik * rikz;
fij[0] += -tmp20 * dctdij * rijx;
fij[1] += -tmp20 * dctdij * rijy;
fij[2] += -tmp20 * dctdij * rijz;
fi[0] += -tmp20 * dctdjk * rjkx;
fi[1] += -tmp20 * dctdjk * rjky;
fi[2] += -tmp20 * dctdjk * rjkz;
fk[0] += tmp20 * dctdjk * rjkx;
fk[1] += tmp20 * dctdjk * rjky;
fk[2] += tmp20 * dctdjk * rjkz;
fij[0] -= -tmp20 * dctdjk * rjkx;
fij[1] -= -tmp20 * dctdjk * rjky;
fij[2] -= -tmp20 * dctdjk * rjkz;
flt_t tmp21 = pref * (wik * g * ex_lam * 4 * itype);
fij[0] -= 1 * tmp21 * rijx * invrijm;
fij[1] -= 1 * tmp21 * rijy * invrijm;
fij[2] -= 1 * tmp21 * rijz * invrijm;
fi[0] -= tmp21 * (-rikx * invrikm);
fi[1] -= tmp21 * (-riky * invrikm);
fi[2] -= tmp21 * (-rikz * invrikm);
fk[0] -= tmp21 * (rikx * invrikm);
fk[1] -= tmp21 * (riky * invrikm);
fk[2] -= tmp21 * (rikz * invrikm);
// coordination forces
// dwik forces
flt_t tmp22 = pref * dwik * g * ex_lam * invrikm;
fi[0] -= tmp22 * rikx;
fi[1] -= tmp22 * riky;
fi[2] -= tmp22 * rikz;
fk[0] += tmp22 * rikx;
fk[1] += tmp22 * riky;
fk[2] += tmp22 * rikz;
// PIJ forces
flt_t tmp23 = pref * dN2[ktype] * dwik * invrikm;
fi[0] -= tmp23 * rikx;
fi[1] -= tmp23 * riky;
fi[2] -= tmp23 * rikz;
fk[0] += tmp23 * rikx;
fk[1] += tmp23 * riky;
fk[2] += tmp23 * rikz;
// dgdN forces
flt_t tmp24 = pref * sum_dpij_dN * dwik * invrikm;
fi[0] -= tmp24 * rikx;
fi[1] -= tmp24 * riky;
fi[2] -= tmp24 * rikz;
fk[0] += tmp24 * rikx;
fk[1] += tmp24 * riky;
fk[2] += tmp24 * rikz;
result_f[i].x += fi[0];
result_f[i].y += fi[1];
result_f[i].z += fi[2];
result_f[j].x += fj[0];
result_f[j].y += fj[1];
result_f[j].z += fj[2];
result_f[k].x += fk[0];
result_f[k].y += fk[1];
result_f[k].z += fk[2];
}
}
if (pass == 0) {
flt_t PijS = PijSpline(ka, itype, jtype, NijC, NijH, dN2);
pij = 1 / overloaded::sqrt(1 + sum_pij + PijS);
}
}
return pij;
}
template<typename flt_t, typename acc_t>
inline flt_t frebo_pi_rc(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) {
acc_t dN3tmp[3] = {0};
flt_t ret = piRCSpline(ka, itype, jtype, Nij, Nji, Nijconj, dN3tmp);
dN3[0] = dN3tmp[0];
dN3[1] = dN3tmp[1];
dN3[2] = dN3tmp[2];
return ret;
}
template<typename flt_t, typename acc_t>
inline flt_t frebo_Tij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) {
dN3[0] = 0;
dN3[1] = 0;
dN3[2] = 0;
if (itype == HYDROGEN || jtype == HYDROGEN) return 0;
acc_t dN3tmp[3] = {0};
flt_t ret = TijSpline(ka, Nij, Nji, Nijconj, dN3tmp);
dN3[0] = dN3tmp[0];
dN3[1] = dN3tmp[1];
dN3[2] = dN3tmp[2];
return ret;
}
/*
* Implements a scalar version of the sum cos^1(omega) term used in pi^dh_ij.
* Occurs in both bondorder and bondorderLJ.
*/
template<typename flt_t, typename acc_t>
inline flt_t frebo_sum_omega(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j,
flt_t r23x, flt_t r23y, flt_t r23z, flt_t r23mag, flt_t VA, acc_t fij[3]) {
ResultForceT<acc_t> * result_f = ka->result_f;
acc_t sum_omega = 0;
int a2 = i;
int a3 = j;
flt_t r32x = - r23x;
flt_t r32y = - r23y;
flt_t r32z = - r23z;
int * map = ka->map;
AtomAIREBOT<flt_t> * x = ka->x;
flt_t thmin = ka->params.thmin;
flt_t thmax = ka->params.thmax;
int itype = map[x[i].w];
int jtype = map[x[j].w];
int * neighs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
int * neighs_j = ka->neigh_rebo.entries + ka->neigh_rebo.offset[j];
int num_i = ka->neigh_rebo.num[i];
int num_j = ka->neigh_rebo.num[j];
int kk;
for (kk = 0; kk < num_i; kk++) {
int k = neighs_i[kk];
if (k == j) continue;
int a1 = k;
int ktype = map[x[k].w];
flt_t r21x = x[a2].x - x[a1].x;
flt_t r21y = x[a2].y - x[a1].y;
flt_t r21z = x[a2].z - x[a1].z;
flt_t r21mag = overloaded::sqrt(r21x * r21x + r21y * r21y + r21z * r21z);
flt_t cos321 = (r23x * r21x + r23y * r21y + r23z * r21z) /
(r23mag * r21mag);
cos321 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos321));
flt_t sin321 = overloaded::sqrt(1 - cos321 * cos321);
if (sin321 == 0) continue;
flt_t sink2i = 1 / (sin321 * sin321);
flt_t rik2i = 1 / (r21mag * r21mag);
flt_t rr = r23mag * r23mag - r21mag * r21mag;
flt_t r31x = r21x - r23x;
flt_t r31y = r21y - r23y;
flt_t r31z = r21z - r23z;
flt_t r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z;
flt_t rijrik = 2 * r23mag * r21mag;
flt_t r21mag2 = r21mag * r21mag;
flt_t dctik = (-rr + r31mag2) / (rijrik * r21mag2);
flt_t dctij = (rr + r31mag2) / (rijrik * r23mag * r23mag);
flt_t dctjk = -2 / rijrik;
flt_t rcmin21 = ka->params.rcmin [itype][ktype];
flt_t rcmaxp21 = ka->params.rcmaxp[itype][ktype];
flt_t dw21;
flt_t w21 = Sp(r21mag, rcmin21, rcmaxp21, &dw21);
// why does this additional cutoff in the cosine exist?
// the original code by stuart answers this:
// it avoid issues when bonds in the dihedral are linear
// by switching the dihedral off beforehand.
// This is the reason for both the sin == 0 checks and the
// tspjik = Sp2(..) calls.
// Unfortunately, this is not exactly stated in the original paper.
// It might be similar in purpose to the H(sin - s^min) term that
// appears in that paper, but can not be found in original REBO papers.
flt_t dtsjik;
flt_t tspjik = Sp2(cos321, thmin, thmax, &dtsjik);
dtsjik = - dtsjik;
int ll;
for (ll = 0; ll < num_j; ll++) {
int l = neighs_j[ll];
if (l == i || l == k) continue;
int ltype = map[x[l].w];
int a4 = l;
flt_t r34x = x[a3].x - x[a4].x;
flt_t r34y = x[a3].y - x[a4].y;
flt_t r34z = x[a3].z - x[a4].z;
flt_t r34mag = overloaded::sqrt(r34x * r34x + r34y * r34y + r34z * r34z);
flt_t cos234 = (r32x * r34x + r32y * r34y + r32z * r34z) /
(r23mag * r34mag);
cos234 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos234));
flt_t sin234 = overloaded::sqrt(1 - cos234 * cos234);
if (sin234 == 0) continue;
flt_t sinl2i = 1 / (sin234 * sin234);
flt_t rjl2i = 1 / (r34mag * r34mag);
flt_t rcminjl = ka->params.rcmin[jtype][ltype];
flt_t rcmaxpjl = ka->params.rcmaxp[jtype][ltype];
flt_t dw34;
flt_t w34 = Sp(r34mag, rcminjl, rcmaxpjl, &dw34);
flt_t rr = (r23mag * r23mag) - (r34mag * r34mag);
flt_t r24x = r23x + r34x;
flt_t r24y = r23y + r34y;
flt_t r24z = r23z + r34z;
flt_t r242 =
(r24x * r24x) + (r24y * r24y) + (r24z * r24z);
flt_t rijrjl = 2 * r23mag * r34mag;
flt_t rjl2 = r34mag * r34mag;
flt_t dctjl = (-rr + r242) / (rijrjl * rjl2);
flt_t dctji = (rr + r242) / (rijrjl * r23mag * r23mag);
flt_t dctil = -2 / rijrjl;
flt_t dtsijl;
flt_t tspijl = Sp2(cos234, thmin, thmax, &dtsijl);
dtsijl = -dtsijl; // need minus sign
flt_t prefactor = VA;
flt_t cross321x = (r32y * r21z) - (r32z * r21y);
flt_t cross321y = (r32z * r21x) - (r32x * r21z);
flt_t cross321z = (r32x * r21y) - (r32y * r21x);
flt_t cross234x = (r23y * r34z) - (r23z * r34y);
flt_t cross234y = (r23z * r34x) - (r23x * r34z);
flt_t cross234z = (r23x * r34y) - (r23y * r34x);
flt_t cwnum = (cross321x * cross234x) +
(cross321y * cross234y) +
(cross321z * cross234z);
flt_t cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
flt_t om1234 = cwnum / cwnom;
flt_t cw = om1234;
sum_omega += ((1 - (om1234 * om1234)) * w21 * w34) *
(1 - tspjik) * (1 - tspijl);
if (VA == static_cast<flt_t>(0.0)) continue;
flt_t dt1dik = (rik2i) - (dctik * sink2i * cos321);
flt_t dt1djk = (-dctjk * sink2i * cos321);
flt_t dt1djl = (rjl2i) - (dctjl * sinl2i * cos234);
flt_t dt1dil = (-dctil * sinl2i * cos234);
flt_t dt1dij = (2 / (r23mag * r23mag)) -
(dctij * sink2i * cos321) -
(dctji * sinl2i * cos234);
flt_t dt2dikx = (-r23z * cross234y) + (r23y * cross234z);
flt_t dt2diky = (-r23x * cross234z) + (r23z * cross234x);
flt_t dt2dikz = (-r23y * cross234x) + (r23x * cross234y);
flt_t dt2djlx = (-r23y * cross321z) + (r23z * cross321y);
flt_t dt2djly = (-r23z * cross321x) + (r23x * cross321z);
flt_t dt2djlz = (-r23x * cross321y) + (r23y * cross321x);
flt_t dt2dijx = (r21z * cross234y) - (r34z * cross321y) -
flt_t (r21y * cross234z) + (r34y * cross321z);
flt_t dt2dijy = (r21x * cross234z) - (r34x * cross321z) -
flt_t (r21z * cross234x) + (r34z * cross321x);
flt_t dt2dijz = (r21y * cross234x) - (r34y * cross321x) -
flt_t (r21x * cross234y) + (r34x * cross321y);
flt_t aa = (prefactor * 2 * cw / cwnom) * w21 * w34 *
(1 - tspjik) * (1 - tspijl);
flt_t aaa1 = -prefactor * (1 - (om1234 * om1234)) *
(1 - tspjik) * (1 - tspijl);
flt_t aaa2 = -prefactor * (1 - (om1234 * om1234)) * w21 * w34;
flt_t at2 = aa * cwnum;
flt_t fcijpc = (-dt1dij * at2) +
(aaa2 * dtsjik * dctij * (1 - tspijl)) +
(aaa2 * dtsijl * dctji * (1 - tspjik));
flt_t fcikpc = (-dt1dik * at2) +
(aaa2 * dtsjik * dctik * (1 - tspijl));
flt_t fcjlpc = (-dt1djl * at2) +
(aaa2 * dtsijl * dctjl * (1 - tspjik));
flt_t fcjkpc = (-dt1djk * at2) +
(aaa2 * dtsjik * dctjk * (1 - tspijl));
flt_t fcilpc = (-dt1dil * at2) +
(aaa2 * dtsijl * dctil * (1 - tspjik));
flt_t F23x = (fcijpc * r23x) + (aa * dt2dijx);
flt_t F23y = (fcijpc * r23y) + (aa * dt2dijy);
flt_t F23z = (fcijpc * r23z) + (aa * dt2dijz);
flt_t F12x = (fcikpc * r21x) + (aa * dt2dikx);
flt_t F12y = (fcikpc * r21y) + (aa * dt2diky);
flt_t F12z = (fcikpc * r21z) + (aa * dt2dikz);
flt_t F34x = (fcjlpc * r34x) + (aa * dt2djlx);
flt_t F34y = (fcjlpc * r34y) + (aa * dt2djly);
flt_t F34z = (fcjlpc * r34z) + (aa * dt2djlz);
flt_t F31x = (fcjkpc * r31x);
flt_t F31y = (fcjkpc * r31y);
flt_t F31z = (fcjkpc * r31z);
flt_t F24x = (fcilpc * r24x);
flt_t F24y = (fcilpc * r24y);
flt_t F24z = (fcilpc * r24z);
flt_t f1x = -F12x - F31x;
flt_t f1y = -F12y - F31y;
flt_t f1z = -F12z - F31z;
flt_t f2x = F12x + F31x;
flt_t f2y = F12y + F31y;
flt_t f2z = F12z + F31z;
flt_t f3x = F34x + F24x;
flt_t f3y = F34y + F24y;
flt_t f3z = F34z + F24z;
flt_t f4x = -F34x - F24x;
flt_t f4y = -F34y - F24y;
flt_t f4z = -F34z - F24z;
fij[0] += F23x + F24x - F31x;
fij[1] += F23y + F24y - F31y;
fij[2] += F23z + F24z - F31z;
// coordination forces
flt_t tmp20 = VA * ((1 - (om1234 * om1234))) *
(1 - tspjik) * (1 - tspijl) * dw21 * w34 / r21mag;
f2x -= tmp20 * r21x;
f2y -= tmp20 * r21y;
f2z -= tmp20 * r21z;
f1x += tmp20 * r21x;
f1y += tmp20 * r21y;
f1z += tmp20 * r21z;
flt_t tmp21 = VA * ((1 - (om1234 * om1234))) *
(1 - tspjik) * (1 - tspijl) * w21 * dw34 / r34mag;
f3x -= tmp21 * r34x;
f3y -= tmp21 * r34y;
f3z -= tmp21 * r34z;
f4x += tmp21 * r34x;
f4y += tmp21 * r34y;
f4z += tmp21 * r34z;
result_f[a1].x += f1x;
result_f[a1].y += f1y;
result_f[a1].z += f1z;
result_f[a2].x += f2x;
result_f[a2].y += f2y;
result_f[a2].z += f2z;
result_f[a3].x += f3x;
result_f[a3].y += f3y;
result_f[a3].z += f3z;
result_f[a4].x += f4x;
result_f[a4].y += f4y;
result_f[a4].z += f4z;
}
}
return sum_omega;
}
/*
* Implements a scalar implementation the force update due to splines.
* It is used for both pi^rc_ij and T_ij.
* Occurs four times in each bondorder and bondorderLJ.
*/
template<typename flt_t, typename acc_t>
inline void frebo_N_spline_force(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
int j, flt_t VA, flt_t dN, flt_t dNconj, flt_t Nconj) {
int * map = ka->map;
AtomAIREBOT<flt_t> * x = ka->x;
ResultForceT<acc_t> * result_f = ka->result_f;
flt_t * nC = ka->nC;
flt_t * nH = ka->nH;
flt_t Nmin = ka->params.Nmin;
flt_t Nmax = ka->params.Nmax;
int itype = map[x[i].w];
int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
int knum = ka->neigh_rebo.num[i];
int kk;
for (kk = 0; kk < knum; kk++) {
int k = neighs[kk];
if (k == j) continue;
flt_t rikx = x[i].x - x[k].x;
flt_t riky = x[i].y - x[k].y;
flt_t rikz = x[i].z - x[k].z;
flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz);
int ktype = map[x[k].w];
flt_t rcminik = ka->params.rcmin[itype][ktype];
flt_t rcmaxik = ka->params.rcmax[itype][ktype];
flt_t dwik;
flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik);
flt_t Nki = nC[k] + nH[k] - wik;
flt_t dNki;
flt_t SpN = Sp(Nki, Nmin, Nmax, &dNki);
flt_t fdN = VA * dN * dwik / rikmag;
flt_t fdNconj = VA * dNconj * 2 * Nconj * dwik * SpN / rikmag;
flt_t ffactor = fdN;
if (ktype == 0) ffactor += fdNconj;
flt_t fkx = ffactor * rikx;
flt_t fky = ffactor * riky;
flt_t fkz = ffactor * rikz;
result_f[i].x -= fkx;
result_f[i].y -= fky;
result_f[i].z -= fkz;
result_f[k].x += fkx;
result_f[k].y += fky;
result_f[k].z += fkz;
if (ktype != 0 || fabs(dNki) <= TOL) continue;
int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k];
int nnum = ka->neigh_rebo.num[k];
int nn;
for (nn = 0; nn < nnum; nn++) {
int n = neighs_k[nn];
if (n == i) continue;
flt_t rknx = x[k].x - x[n].x;
flt_t rkny = x[k].y - x[n].y;
flt_t rknz = x[k].z - x[n].z;
flt_t rknmag = overloaded::sqrt(rknx * rknx + rkny * rkny + rknz * rknz);
int ntype = map[x[n].w];
flt_t rcminkn = ka->params.rcmin[ktype][ntype];
flt_t rcmaxkn = ka->params.rcmax[ktype][ntype];
flt_t dwkn;
Sp(rknmag, rcminkn, rcmaxkn, &dwkn);
flt_t ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag;
result_f[k].x -= ffactor * rknx;
result_f[k].y -= ffactor * rkny;
result_f[k].z -= ffactor * rknz;
result_f[n].x += ffactor * rknx;
result_f[n].y += ffactor * rkny;
result_f[n].z += ffactor * rknz;
}
}
}
/*
* This data-structure contains the result of a search through neighbor-lists.
* It is used to calculate C_ij and the corresponding force updates.
*/
template<typename flt_t>
struct LennardJonesPathAIREBOT {
AtomAIREBOT<flt_t> del[3];
int num;
flt_t w[3];
flt_t dw[3];
flt_t r[3];
int idx[4];
};
/*
* Checks a candidate path stored in idxs whether it is better than *path
* and updates *path accordingly.
*/
template<typename flt_t, typename acc_t>
inline flt_t ref_lennard_jones_test_path_single(
KernelArgsAIREBOT<flt_t,acc_t> * ka, flt_t best, int num, int * idxs,
LennardJonesPathAIREBOT<flt_t> * path) {
LennardJonesPathAIREBOT<flt_t> result;
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
result.num = num;
flt_t combined = 1;
for (int i = num - 2; i >= 0; i--) {
int a0 = idxs[i+0];
int a1 = idxs[i+1];
flt_t delx = x[a1].x - x[a0].x;
flt_t dely = x[a1].y - x[a0].y;
flt_t delz = x[a1].z - x[a0].z;
flt_t rsq = delx * delx + dely * dely + delz * delz;
int type0 = map[x[a0].w];
int type1 = map[x[a1].w];
if (rsq >= ka->params.rcmaxsq[type0][type1]) return best;
flt_t r = overloaded::sqrt(rsq);
flt_t dw, w = Sp<flt_t>(r, ka->params.rcmin[type0][type1],
ka->params.rcmax[type0][type1], &dw);
if (w == 0) return best;
combined *= w;
if (combined <= best) return best;
result.idx[i] = a0;
result.del[i].x = delx;
result.del[i].y = dely;
result.del[i].z = delz;
result.r[i] = r;
result.w[i] = w;
result.dw[i] = dw;
}
result.idx[num - 1] = idxs[num - 1];
*path = result;
return combined;
}
/*
* Test through all paths surrounding i and j to find the corresponding
* best path. Uses the same iteration ordering as FLJ() does.
* Note that an optimization would use the j neighlist instead in the inner
* loop.
*/
template<typename flt_t, typename acc_t>
inline flt_t ref_lennard_jones_test_path(KernelArgsAIREBOT<flt_t,acc_t> * ka,
int i, int j, flt_t rij, flt_t rcmax,
LennardJonesPathAIREBOT<flt_t> * path) {
int idxs[4];
idxs[0] = i;
idxs[1] = j;
flt_t best = 0;
if (rij <= rcmax) {
best = ref_lennard_jones_test_path_single(ka, best, 2, idxs, path);
if (best == static_cast<flt_t>(1.0)) return 0;
}
for (int kk = 0; kk < ka->neigh_rebo.num[i]; kk++) {
int k = ka->neigh_rebo.entries[ka->neigh_rebo.offset[i] + kk];
if (k == j) continue;
idxs[1] = k;
idxs[2] = j;
best = ref_lennard_jones_test_path_single(ka, best, 3, idxs, path);
if (best == static_cast<flt_t>(1.0)) return 0;
for (int mm = 0; mm < ka->neigh_rebo.num[k]; mm++) {
int m = ka->neigh_rebo.entries[ka->neigh_rebo.offset[k] + mm];
if (m == i || m == j) continue;
idxs[2] = m;
idxs[3] = j;
best = ref_lennard_jones_test_path_single(ka, best, 4, idxs, path);
if (best == static_cast<flt_t>(1.0)) return 0;
}
}
return 1 - best;
}
/*
* Conducts the force update due to C_ij, given the active path.
*/
template<typename flt_t, typename acc_t>
inline void ref_lennard_jones_force_path(KernelArgsAIREBOT<flt_t,acc_t> * ka,
flt_t dC, LennardJonesPathAIREBOT<flt_t> * path) {
AtomAIREBOT<flt_t> * x = ka->x;
ResultForceT<acc_t> * result_f = ka->result_f;
for (int i = 0; i < path->num - 1; i++) {
flt_t fpair = dC * path->dw[i] / path->r[i];
for (int j = 0; j < path->num - 1; j++) {
if (i != j) fpair *= path->w[j];
}
result_f[path->idx[i+0]].x -= fpair * path->del[i].x;
result_f[path->idx[i+0]].y -= fpair * path->del[i].y;
result_f[path->idx[i+0]].z -= fpair * path->del[i].z;
result_f[path->idx[i+1]].x += fpair * path->del[i].x;
result_f[path->idx[i+1]].y += fpair * path->del[i].y;
result_f[path->idx[i+1]].z += fpair * path->del[i].z;
}
}
/*
* Calculate the bondorderLJ term.
*/
template<typename flt_t, typename acc_t>
inline flt_t ref_lennard_jones_bondorder(KernelArgsAIREBOT<flt_t,acc_t> * ka,
int i, int j, flt_t VA, acc_t fij[3]) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
ResultForceT<acc_t> * result_f = ka->result_f;
int itype = map[x[i].w];
int jtype = map[x[j].w];
flt_t delx = x[i].x - x[j].x;
flt_t dely = x[i].y - x[j].y;
flt_t delz = x[i].z - x[j].z;
flt_t rsq = delx * delx + dely * dely + delz * delz;
flt_t rij = overloaded::sqrt(rsq);
flt_t rcminij = ka->params.rcmin[itype][jtype];
flt_t rcmaxij = ka->params.rcmax[itype][jtype];
flt_t dwij;
flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij);
flt_t the_r = ka->params.rcmin[itype][jtype];
flt_t scale = the_r / rij;
flt_t Nij = ka->nH[i] + ka->nC[i] - wij;
flt_t Nji = ka->nH[j] + ka->nC[j] - wij;
flt_t NconjtmpI;
acc_t fijc[3] = {0}, fjic[3] = {0};
flt_t pij = frebo_pij<flt_t,acc_t>(ka, i, j, delx * scale, dely * scale,
delz * scale, the_r, wij, 0.0, &NconjtmpI, fijc);
flt_t NconjtmpJ;
flt_t pji = frebo_pij<flt_t,acc_t>(ka, j, i, -delx * scale, -dely * scale,
-delz * scale, the_r, wij, 0.0, &NconjtmpJ, fjic);
flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ);
flt_t dN3_pi_rc[3];
flt_t pi_rc = frebo_pi_rc<flt_t,acc_t>(ka, itype, jtype, Nij, Nji, Nijconj,
dN3_pi_rc);
flt_t dN3_Tij[3];
flt_t Tij = frebo_Tij<flt_t,acc_t>(ka, itype, jtype, Nij, Nji, Nijconj,
dN3_Tij);
flt_t sum_omega = 0;
if (fabs(Tij) > TOL) {
sum_omega = frebo_sum_omega<flt_t,acc_t>(ka, i, j, delx * scale, dely *
scale, delz * scale, the_r, 0.0,
fijc);
}
flt_t pi_dh = Tij * sum_omega;
flt_t bij = 0.5 * (pij + pji) + pi_rc + pi_dh;
flt_t dStb;
flt_t Stb = Sp2<flt_t>(bij, ka->params.bLJmin[itype][jtype],
ka->params.bLJmax[itype][jtype], &dStb);
if (dStb != 0) {
flt_t pij_reverse = frebo_pij<flt_t,acc_t>(ka, i, j, delx * scale,
dely * scale, delz * scale, the_r, wij, VA * dStb, &NconjtmpI, fijc);
flt_t pji_reverse = frebo_pij<flt_t,acc_t>(ka, j, i, -delx * scale,
-dely * scale, -delz * scale, the_r, wij, VA * dStb, &NconjtmpJ, fjic);
fijc[0] -= fjic[0];
fijc[1] -= fjic[1];
fijc[2] -= fjic[2];
frebo_N_spline_force<flt_t,acc_t>(ka, i, j, VA * dStb, dN3_pi_rc[0],
dN3_pi_rc[2], NconjtmpI);
frebo_N_spline_force<flt_t,acc_t>(ka, j, i, VA * dStb, dN3_pi_rc[1],
dN3_pi_rc[2], NconjtmpJ);
if (fabs(Tij) > TOL) {
flt_t sum_omega_reverse = frebo_sum_omega<flt_t,acc_t>(ka, i, j,
delx * scale, dely * scale, delz * scale, the_r, VA * dStb * Tij, fijc);
frebo_N_spline_force(ka, i, j, VA * dStb * sum_omega, dN3_Tij[0],
dN3_Tij[2], NconjtmpI);
frebo_N_spline_force(ka, j, i, VA * dStb * sum_omega, dN3_Tij[1],
dN3_Tij[2], NconjtmpJ);
}
assert(fij[0] == 0);
assert(fij[1] == 0);
assert(fij[2] == 0);
fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx *
fijc[1] + delz * delx * fijc[2]) / rsq);
fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely *
fijc[1] + delz * dely * fijc[2]) / rsq);
fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz *
fijc[1] + delz * delz * fijc[2]) / rsq);
}
return Stb;
}
/*
* Scalar reference implementation of neighbor routine.
*/
template<typename flt_t, typename acc_t>
void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
int offset = ka->neigh_from_atom * ka->num_neighs_per_atom;
for (int i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) {
ka->neigh_rebo.offset[i] = offset;
int itype = ka->map[ka->x[i].w];
int n = 0;
ka->nC[i] = 0;
ka->nH[i] = 0;
for (int j = 0; j < ka->neigh_lmp.num[i]; j++) {
int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j];
flt_t delx = ka->x[i].x - ka->x[ji].x;
flt_t dely = ka->x[i].y - ka->x[ji].y;
flt_t delz = ka->x[i].z - ka->x[ji].z;
flt_t rsq = delx * delx + dely * dely + delz * delz;
int jtype = ka->map[ka->x[ji].w];
if (rsq < ka->params.rcmaxsq[itype][jtype]) {
ka->neigh_rebo.entries[offset + n++] = ji;
flt_t rcmin = ka->params.rcmin[itype][jtype];
flt_t rcmax = ka->params.rcmax[itype][jtype];
if (jtype == CARBON)
ka->nC[i] += Sp<flt_t>(overloaded::sqrt(rsq), rcmin, rcmax, NULL);
else
ka->nH[i] += Sp<flt_t>(overloaded::sqrt(rsq), rcmin, rcmax, NULL);
}
}
ka->neigh_rebo.num[i] = n;
offset += n;
}
}
template<typename flt_t, typename acc_t>
void ref_torsion_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
int j) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
ResultForceT<acc_t> * f = ka->result_f;
flt_t (*rcmin)[2] = ka->params.rcmin;
flt_t (*rcmax)[2] = ka->params.rcmax;
flt_t (*epsilonT)[2] = ka->params.epsilonT;
flt_t thmin = ka->params.thmin;
flt_t thmax = ka->params.thmax;
int itype = map[x[i].w];
flt_t xtmp = x[i].x;
flt_t ytmp = x[i].y;
flt_t ztmp = x[i].z;
int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
int jnum = ka->neigh_rebo.num[i];
int jtype = map[x[j].w];
flt_t del32x = x[j].x-x[i].x;
flt_t del32y = x[j].y-x[i].y;
flt_t del32z = x[j].z-x[i].z;
flt_t rsq = del32x*del32x + del32y*del32y + del32z*del32z;
flt_t r32 = overloaded::sqrt(rsq);
flt_t del23x = -del32x;
flt_t del23y = -del32y;
flt_t del23z = -del32z;
flt_t r23 = r32;
flt_t dw23, w23 = Sp<flt_t>(r23,rcmin[itype][jtype],rcmax[itype][jtype],
&dw23);
assert(itype == 0);
assert(jtype == 0);
for (int kk = 0; kk < jnum; kk++) {
int k = REBO_neighs_i[kk];
int ktype = map[x[k].w];
if (k == j) continue;
flt_t del21x = x[i].x-x[k].x;
flt_t del21y = x[i].y-x[k].y;
flt_t del21z = x[i].z-x[k].z;
flt_t rsq = del21x*del21x + del21y*del21y + del21z*del21z;
flt_t r21 = overloaded::sqrt(rsq);
flt_t cos321 = - ((del21x*del32x) + (del21y*del32y) +
(del21z*del32z)) / (r21*r32);
cos321 = fmin(cos321,1);
cos321 = fmax(cos321,-1);
flt_t sin321 = overloaded::sqrt(1 - cos321*cos321);
if (sin321 < TOL) continue;
flt_t deljkx = del21x-del23x;
flt_t deljky = del21y-del23y;
flt_t deljkz = del21z-del23z;
flt_t rjk2 = deljkx*deljkx + deljky*deljky + deljkz*deljkz;
flt_t rjk = overloaded::sqrt(rjk2);
flt_t rik2 = r21*r21;
flt_t dw21, w21 = Sp<flt_t>(r21,rcmin[itype][ktype],rcmax[itype][ktype],
&dw21);
flt_t rij = r32;
flt_t rik = r21;
flt_t rij2 = r32*r32;
flt_t costmp = static_cast<flt_t>(0.5)*(rij2+rik2-rjk2)/rij/rik;
flt_t dtsjik, tspjik = Sp2<flt_t>(costmp,thmin,thmax,&dtsjik);
dtsjik = -dtsjik;
int * REBO_neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]];
int lnum = ka->neigh_rebo.num[j];
for (int ll = 0; ll < lnum; ll++) {
int l = REBO_neighs_j[ll];
int ltype = map[x[l].w];
if (l == i || l == k) continue;
flt_t del34x = x[j].x-x[l].x;
flt_t del34y = x[j].y-x[l].y;
flt_t del34z = x[j].z-x[l].z;
flt_t rsq = del34x*del34x + del34y*del34y + del34z*del34z;
flt_t r34 = overloaded::sqrt(rsq);
flt_t cos234 = (del32x*del34x + del32y*del34y +
del32z*del34z) / (r32*r34);
cos234 = fmin(cos234,1);
cos234 = fmax(cos234,-1);
flt_t sin234 = overloaded::sqrt(1 - cos234*cos234);
if (sin234 < TOL) continue;
flt_t dw34, w34 = Sp<flt_t>(r34,rcmin[jtype][ltype],rcmax[jtype][ltype],
&dw34);
flt_t delilx = del23x + del34x;
flt_t delily = del23y + del34y;
flt_t delilz = del23z + del34z;
flt_t ril2 = delilx*delilx + delily*delily + delilz*delilz;
flt_t ril = overloaded::sqrt(ril2);
flt_t rjl2 = r34*r34;
flt_t rjl = r34;
flt_t costmp = static_cast<flt_t>(0.5)*(rij2+rjl2-ril2)/rij/rjl;
flt_t dtsijl, tspijl = Sp2<flt_t>(costmp,thmin,thmax,&dtsijl);
dtsijl = -dtsijl; //need minus sign
flt_t cross321x = (del32y*del21z)-(del32z*del21y);
flt_t cross321y = (del32z*del21x)-(del32x*del21z);
flt_t cross321z = (del32x*del21y)-(del32y*del21x);
flt_t cross321mag = overloaded::sqrt(cross321x*cross321x+
cross321y*cross321y + cross321z*cross321z);
flt_t cross234x = (del23y*del34z)-(del23z*del34y);
flt_t cross234y = (del23z*del34x)-(del23x*del34z);
flt_t cross234z = (del23x*del34y)-(del23y*del34x);
flt_t cross234mag = overloaded::sqrt(cross234x*cross234x+
cross234y*cross234y + cross234z*cross234z);
flt_t cwnum = (cross321x*cross234x) +
(cross321y*cross234y)+(cross321z*cross234z);
flt_t cwnom = r21*r34*r32*r32*sin321*sin234;
flt_t cw = cwnum/cwnom;
flt_t cw2 = (static_cast<flt_t>(.5)*(1-cw));
flt_t ekijl = epsilonT[ktype][ltype];
flt_t Ec = 256*ekijl/405;
flt_t Vtors = (Ec*(overloaded::pow(cw2,5)))-(ekijl/10);
ka->result_eng += Vtors*w21*w23*w34*(1-tspjik)*(1-tspijl);
flt_t dndijx = (cross234y*del21z)-(cross234z*del21y);
flt_t dndijy = (cross234z*del21x)-(cross234x*del21z);
flt_t dndijz = (cross234x*del21y)-(cross234y*del21x);
flt_t tmpvecx = (del34y*cross321z)-(del34z*cross321y);
flt_t tmpvecy = (del34z*cross321x)-(del34x*cross321z);
flt_t tmpvecz = (del34x*cross321y)-(del34y*cross321x);
dndijx = dndijx+tmpvecx;
dndijy = dndijy+tmpvecy;
dndijz = dndijz+tmpvecz;
flt_t dndikx = (del23y*cross234z)-(del23z*cross234y);
flt_t dndiky = (del23z*cross234x)-(del23x*cross234z);
flt_t dndikz = (del23x*cross234y)-(del23y*cross234x);
flt_t dndjlx = (cross321y*del23z)-(cross321z*del23y);
flt_t dndjly = (cross321z*del23x)-(cross321x*del23z);
flt_t dndjlz = (cross321x*del23y)-(cross321y*del23x);
flt_t dcidij = ((r23*r23)-(r21*r21)+(rjk*rjk))/(2*r23*r23*r21);
flt_t dcidik = ((r21*r21)-(r23*r23)+(rjk*rjk))/(2*r23*r21*r21);
flt_t dcidjk = (-rjk)/(r23*r21);
flt_t dcjdji = ((r23*r23)-(r34*r34)+(ril*ril))/(2*r23*r23*r34);
flt_t dcjdjl = ((r34*r34)-(r23*r23)+(ril*ril))/(2*r23*r34*r34);
flt_t dcjdil = (-ril)/(r23*r34);
flt_t dsidij = (-cos321/sin321)*dcidij;
flt_t dsidik = (-cos321/sin321)*dcidik;
flt_t dsidjk = (-cos321/sin321)*dcidjk;
flt_t dsjdji = (-cos234/sin234)*dcjdji;
flt_t dsjdjl = (-cos234/sin234)*dcjdjl;
flt_t dsjdil = (-cos234/sin234)*dcjdil;
flt_t dxidij = (r21*sin321)+(r23*r21*dsidij);
flt_t dxidik = (r23*sin321)+(r23*r21*dsidik);
flt_t dxidjk = (r23*r21*dsidjk);
flt_t dxjdji = (r34*sin234)+(r23*r34*dsjdji);
flt_t dxjdjl = (r23*sin234)+(r23*r34*dsjdjl);
flt_t dxjdil = (r23*r34*dsjdil);
flt_t ddndij = (dxidij*cross234mag)+(cross321mag*dxjdji);
flt_t ddndik = dxidik*cross234mag;
flt_t ddndjk = dxidjk*cross234mag;
flt_t ddndjl = cross321mag*dxjdjl;
flt_t ddndil = cross321mag*dxjdil;
flt_t dcwddn = -cwnum/(cwnom*cwnom);
flt_t dcwdn = 1/cwnom;
flt_t dvpdcw = (-1)*Ec*static_cast<flt_t>(-0.5)*5*overloaded::pow(cw2,4)*
w23*w21*w34*(1-tspjik)*(1-tspijl);
flt_t Ftmpx = dvpdcw*((dcwdn*dndijx)+(dcwddn*ddndij*del23x/r23));
flt_t Ftmpy = dvpdcw*((dcwdn*dndijy)+(dcwddn*ddndij*del23y/r23));
flt_t Ftmpz = dvpdcw*((dcwdn*dndijz)+(dcwddn*ddndij*del23z/r23));
flt_t fix = Ftmpx;
flt_t fiy = Ftmpy;
flt_t fiz = Ftmpz;
flt_t fjx = -Ftmpx;
flt_t fjy = -Ftmpy;
flt_t fjz = -Ftmpz;
Ftmpx = dvpdcw*((dcwdn*dndikx)+(dcwddn*ddndik*del21x/r21));
Ftmpy = dvpdcw*((dcwdn*dndiky)+(dcwddn*ddndik*del21y/r21));
Ftmpz = dvpdcw*((dcwdn*dndikz)+(dcwddn*ddndik*del21z/r21));
fix += Ftmpx;
fiy += Ftmpy;
fiz += Ftmpz;
flt_t fkx = -Ftmpx;
flt_t fky = -Ftmpy;
flt_t fkz = -Ftmpz;
Ftmpx = (dvpdcw*dcwddn*ddndjk*deljkx)/rjk;
Ftmpy = (dvpdcw*dcwddn*ddndjk*deljky)/rjk;
Ftmpz = (dvpdcw*dcwddn*ddndjk*deljkz)/rjk;
fjx += Ftmpx;
fjy += Ftmpy;
fjz += Ftmpz;
fkx -= Ftmpx;
fky -= Ftmpy;
fkz -= Ftmpz;
Ftmpx = dvpdcw*((dcwdn*dndjlx)+(dcwddn*ddndjl*del34x/r34));
Ftmpy = dvpdcw*((dcwdn*dndjly)+(dcwddn*ddndjl*del34y/r34));
Ftmpz = dvpdcw*((dcwdn*dndjlz)+(dcwddn*ddndjl*del34z/r34));
fjx += Ftmpx;
fjy += Ftmpy;
fjz += Ftmpz;
flt_t flx = -Ftmpx;
flt_t fly = -Ftmpy;
flt_t flz = -Ftmpz;
Ftmpx = (dvpdcw*dcwddn*ddndil*delilx)/ril;
Ftmpy = (dvpdcw*dcwddn*ddndil*delily)/ril;
Ftmpz = (dvpdcw*dcwddn*ddndil*delilz)/ril;
fix += Ftmpx;
fiy += Ftmpy;
fiz += Ftmpz;
flx -= Ftmpx;
fly -= Ftmpy;
flz -= Ftmpz;
// coordination forces
flt_t fpair = Vtors*dw21*w23*w34*(1-tspjik)*(1-tspijl) / r21;
fix -= del21x*fpair;
fiy -= del21y*fpair;
fiz -= del21z*fpair;
fkx += del21x*fpair;
fky += del21y*fpair;
fkz += del21z*fpair;
fpair = Vtors*w21*dw23*w34*(1-tspjik)*(1-tspijl) / r23;
fix -= del23x*fpair;
fiy -= del23y*fpair;
fiz -= del23z*fpair;
fjx += del23x*fpair;
fjy += del23y*fpair;
fjz += del23z*fpair;
fpair = Vtors*w21*w23*dw34*(1-tspjik)*(1-tspijl) / r34;
fjx -= del34x*fpair;
fjy -= del34y*fpair;
fjz -= del34z*fpair;
flx += del34x*fpair;
fly += del34y*fpair;
flz += del34z*fpair;
// additional cut off function forces
flt_t fcpc = -Vtors*w21*w23*w34*dtsjik*(1-tspijl);
fpair = fcpc*dcidij/rij;
fix += fpair*del23x;
fiy += fpair*del23y;
fiz += fpair*del23z;
fjx -= fpair*del23x;
fjy -= fpair*del23y;
fjz -= fpair*del23z;
fpair = fcpc*dcidik/rik;
fix += fpair*del21x;
fiy += fpair*del21y;
fiz += fpair*del21z;
fkx -= fpair*del21x;
fky -= fpair*del21y;
fkz -= fpair*del21z;
fpair = fcpc*dcidjk/rjk;
fjx += fpair*deljkx;
fjy += fpair*deljky;
fjz += fpair*deljkz;
fkx -= fpair*deljkx;
fky -= fpair*deljky;
fkz -= fpair*deljkz;
fcpc = -Vtors*w21*w23*w34*(1-tspjik)*dtsijl;
fpair = fcpc*dcjdji/rij;
fix += fpair*del23x;
fiy += fpair*del23y;
fiz += fpair*del23z;
fjx -= fpair*del23x;
fjy -= fpair*del23y;
fjz -= fpair*del23z;
fpair = fcpc*dcjdjl/rjl;
fjx += fpair*del34x;
fjy += fpair*del34y;
fjz += fpair*del34z;
flx -= fpair*del34x;
fly -= fpair*del34y;
flz -= fpair*del34z;
fpair = fcpc*dcjdil/ril;
fix += fpair*delilx;
fiy += fpair*delily;
fiz += fpair*delilz;
flx -= fpair*delilx;
fly -= fpair*delily;
flz -= fpair*delilz;
// sum per-atom forces into atom force array
f[i].x += fix; f[i].y += fiy; f[i].z += fiz;
f[j].x += fjx; f[j].y += fjy; f[j].z += fjz;
f[k].x += fkx; f[k].y += fky; f[k].z += fkz;
f[l].x += flx; f[l].y += fly; f[l].z += flz;
}
}
}
template<typename flt_t, typename acc_t>
void ref_torsion(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
int * tag = ka->tag;
for (int ii = ka->frebo_from_atom; ii < ka->frebo_to_atom; ii++) {
int i = ii;
int itag = tag[i];
int itype = map[x[i].w];
if (itype != 0) continue;
flt_t xtmp = x[i].x;
flt_t ytmp = x[i].y;
flt_t ztmp = x[i].z;
int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
int jnum = ka->neigh_rebo.num[i];
for (int jj = 0; jj < jnum; jj++) {
int j = REBO_neighs_i[jj];
int jtag = tag[j];
if (itag > jtag) {
if (((itag+jtag) & 1) == 0) continue;
} else if (itag < jtag) {
if (((itag+jtag) & 1) == 1) continue;
} else {
if (x[j].z < ztmp) continue;
if (x[j].z == ztmp && x[j].y < ytmp) continue;
if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp) continue;
}
int jtype = map[x[j].w];
if (jtype != 0) continue;
ref_torsion_single_interaction(ka, i, j);
}
}
}
/*
* Calculate single REBO interaction.
* Corresponds to FREBO method. Note that the bondorder() function is
* inlined.
*/
template<typename flt_t, typename acc_t>
void ref_frebo_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
int j) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
ResultForceT<acc_t> * result_f = ka->result_f;
int jj;
int itype = map[x[i].w];
flt_t x_i = x[i].x;
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int jtype = map[x[j].w];
flt_t delx = x[i].x - x[j].x;
flt_t dely = x[i].y - x[j].y;
flt_t delz = x[i].z - x[j].z;
flt_t rsq = delx * delx + dely * dely + delz * delz;
flt_t rij = overloaded::sqrt(rsq);
flt_t rcminij = ka->params.rcmin[itype][jtype];
flt_t rcmaxij = ka->params.rcmax[itype][jtype];
flt_t dwij;
flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij);
if (wij <= TOL) return;
flt_t Qij = ka->params.Q[itype][jtype];
flt_t Aij = ka->params.A[itype][jtype];
flt_t alphaij = ka->params.alpha[itype][jtype];
flt_t exp_alphar = exp(-alphaij * rij);
flt_t VR_by_wij = (1.0 + (Qij / rij)) * Aij * exp_alphar;
flt_t VR = wij * VR_by_wij;
flt_t pre = wij * Aij * exp_alphar;
flt_t dVRdi = pre * ((-alphaij) - (Qij / rsq) - (Qij * alphaij / rij));
dVRdi += VR_by_wij * dwij;
flt_t VA_by_wij = 0, dVA = 0;
for (int k = 0; k < 3; k++) {
flt_t BIJc = ka->params.BIJc[itype][jtype][k];
flt_t Betaij = ka->params.Beta[itype][jtype][k];
flt_t term = -BIJc * overloaded::exp(-Betaij * rij);
VA_by_wij += term;
dVA += -Betaij * wij * term;
}
dVA += VA_by_wij * dwij;
flt_t VA = VA_by_wij * wij;
acc_t fij[3] = {0};
flt_t Nij = ka->nH[i] + ka->nC[i] - wij;
flt_t Nji = ka->nH[j] + ka->nC[j] - wij;
flt_t NconjtmpI;
flt_t pij = frebo_pij(ka, i, j, delx, dely, delz, rij, wij, VA, &NconjtmpI,
fij);
flt_t NconjtmpJ;
acc_t fji[3] = {0};
flt_t pji = frebo_pij(ka, j, i, -delx, -dely, -delz, rij, wij, VA,
&NconjtmpJ, fji);
fij[0] -= fji[0]; fij[1] -= fji[1]; fij[2] -= fji[2];
flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ);
flt_t dN3[3];
flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
frebo_N_spline_force(ka, i, j, VA, dN3[0], dN3[2], NconjtmpI);
frebo_N_spline_force(ka, j, i, VA, dN3[1], dN3[2], NconjtmpJ);
flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
flt_t sum_omega = 0.0;
if (fabs(Tij) > TOL) {
sum_omega = frebo_sum_omega(ka, i, j, delx, dely, delz, rij, VA * Tij, fij);
frebo_N_spline_force(ka, i, j, VA * sum_omega, dN3[0], dN3[2], NconjtmpI);
frebo_N_spline_force(ka, j, i, VA * sum_omega, dN3[1], dN3[2], NconjtmpJ);
}
flt_t pi_dh = Tij * sum_omega;
flt_t bij = static_cast<flt_t>(0.5) * (pij + pji) + pi_rc + pi_dh;
flt_t dVAdi = bij * dVA;
flt_t fpair = -(dVRdi + dVAdi) / rij;
result_f[i].x += fpair * delx + fij[0];
result_f[i].y += fpair * dely + fij[1];
result_f[i].z += fpair * delz + fij[2];
result_f[j].x -= fpair * delx + fij[0];
result_f[j].y -= fpair * dely + fij[1];
result_f[j].z -= fpair * delz + fij[2];
flt_t evdwl = VR + bij * VA;
ka->result_eng += evdwl;
result_f[i].w += 0.5 * evdwl;
result_f[j].w += 0.5 * evdwl;
}
template<typename flt_t, typename acc_t>
inline void ref_frebo_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i) {
AtomAIREBOT<flt_t> * x = ka->x;
int * tag = ka->tag;
int jj;
int itag = tag[i];
flt_t x_i = x[i].x;
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
int jnum = ka->neigh_rebo.num[i];
for (jj = 0; jj < jnum; jj++) {
int j = neighs[jj];
int jtag = tag[j];
if (itag > jtag) {
if (((itag + jtag) & 1) == 0)
continue;
} else if (itag < jtag) {
if (((itag + jtag) & 1) == 1)
continue;
} else {
if (x[j].z < z_i)
continue;
if (x[j].z == z_i && x[j].y < y_i)
continue;
if (x[j].z == z_i && x[j].y == y_i && x[j].x < x_i)
continue;
}
ref_frebo_single_interaction(ka, i, j);
}
}
template<typename flt_t, typename acc_t>
void ref_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torflag) {
for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
ref_frebo_single_atom(ka, i);
}
if (torflag) ref_torsion(ka);
}
template<typename flt_t, typename acc_t>
void ref_lennard_jones_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka,
int i, int j, int morseflag) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
ResultForceT<acc_t> * result_f = ka->result_f;
int itype = map[x[i].w];
int jtype = map[x[j].w];
flt_t delx = x[i].x - x[j].x;
flt_t dely = x[i].y - x[j].y;
flt_t delz = x[i].z - x[j].z;
flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq >= ka->params.cutljsq[itype][jtype]) { return; }
flt_t rij = overloaded::sqrt(rsq);
LennardJonesPathAIREBOT<flt_t> testpath;
flt_t cij = 1.0;
if (rij < ka->params.cut3rebo) {
#pragma noinline
cij = ref_lennard_jones_test_path<flt_t,acc_t>(ka, i, j, rij,
ka->params.rcmax[itype][jtype], &testpath);
}
if (cij == 0) {
return;
}
flt_t sigcut = ka->params.sigcut;
flt_t sigmin = ka->params.sigmin;
flt_t sigma = ka->params.sigma[itype][jtype];
flt_t rljmax = sigcut * sigma;
flt_t rljmin = sigmin * sigma;
flt_t dslw, slw = Sp2(rij, rljmin, rljmax, &dslw);
flt_t vdw, dvdw;
if (morseflag) {
const flt_t exr = exp(-rij * ka->params.lj4[itype][jtype]);
vdw = ka->params.lj1[itype][jtype] * exr *
(ka->params.lj2[itype][jtype]*exr - 2);
dvdw = ka->params.lj3[itype][jtype] * exr *
(1 - ka->params.lj2[itype][jtype]*exr);
} else {
flt_t r2inv = 1 / rsq;
flt_t r6inv = r2inv * r2inv * r2inv;
vdw = r6inv * (ka->params.lj3[itype][jtype]*r6inv -
ka->params.lj4[itype][jtype]);
dvdw = -r6inv * (ka->params.lj1[itype][jtype]*r6inv -
ka->params.lj2[itype][jtype]) / rij;
}
flt_t VLJ = vdw * slw;
flt_t dVLJ = dvdw * slw + vdw * dslw;
flt_t dStr, Str = Sp2<flt_t>(rij, ka->params.rcLJmin[itype][jtype],
ka->params.rcLJmax[itype][jtype], &dStr);
flt_t VA = Str * cij * VLJ;
flt_t Stb = 0;
acc_t fij[3] = {0};
if (Str > 0) {
#pragma noinline
Stb = ref_lennard_jones_bondorder(ka, i, j, VA, fij);
}
flt_t fpair = -(dStr * (Stb * cij * VLJ - cij * VLJ) +
dVLJ * (Str * Stb * cij + cij - Str * cij)) / rij;
flt_t evdwl = VA * Stb + (1 - Str) * cij * VLJ;
result_f[i].x += fpair * delx + fij[0];
result_f[i].y += fpair * dely + fij[1];
result_f[i].z += fpair * delz + fij[2];
result_f[j].x -= fpair * delx + fij[0];
result_f[j].y -= fpair * dely + fij[1];
result_f[j].z -= fpair * delz + fij[2];
ka->result_eng += evdwl;
if (cij < 1) {
#pragma noinline
ref_lennard_jones_force_path(ka, Str * Stb * VLJ + (1 - Str) * VLJ,
&testpath);
}
}
template<typename flt_t, typename acc_t>
void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
int morseflag) {
AtomAIREBOT<flt_t> * x = ka->x;
int * tag = ka->tag;
int jj;
int itag = tag[i];
int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
int jnum = ka->neigh_lmp.num_half[i];
for (jj = 0; jj < jnum; jj++) {
int j = neighs[jj];
ref_lennard_jones_single_interaction(ka, i, j, morseflag);
}
}
template<typename flt_t, typename acc_t>
void ref_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag) {
for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
#pragma noinline
ref_lennard_jones_single_atom(ka, i, morseflag);
}
}
/* ----------------------------------------------------------------------
Vectorized AIREBO implementation, standalone, using caching to reduce
memory access.
---------------------------------------------------------------------- */
template<typename flt_t, typename acc_t>
struct aut_wrap {
typedef typename intr_types<flt_t, acc_t>::fvec fvec;
typedef typename intr_types<flt_t, acc_t>::avec avec;
typedef typename intr_types<flt_t, acc_t>::ivec ivec;
typedef typename intr_types<flt_t, acc_t>::bvec bvec;
VEC_INLINE inline
static void aut_loadatoms_vec(
AtomAIREBOT<flt_t> * atoms, ivec j_vec,
fvec *x, fvec * y, fvec * z, bvec * type_mask, int * map, ivec map_i,
ivec c_1
) {
const ivec c_4 = ivec::set1(4);
ivec j_vec_4 = ivec::mullo(c_4, j_vec);
fvec w;
fvec::gather_4_adjacent(j_vec_4, &atoms[0].x, sizeof(flt_t), x, y, z, &w);
ivec jtype = fvec::unpackloepi32(w);
jtype = ivec::srlv(map_i, jtype); //_mm512_castpd_si512(w));
jtype = ivec::the_and(c_1, jtype);
bvec jtype_mask = ivec::cmpneq(jtype, ivec::setzero());
*type_mask = jtype_mask;
}
VEC_INLINE inline
static void aut_loadatoms_vec_notype(
AtomAIREBOT<flt_t> * atoms, ivec j_vec,
fvec *x, fvec * y, fvec * z
) {
const ivec c_4 = ivec::set1(4);
ivec j_vec_4 = ivec::mullo(c_4, j_vec);
fvec::gather_3_adjacent(j_vec_4, &atoms[0].x, sizeof(flt_t), x, y, z);
}
static fvec aut_Sp2_deriv(fvec r, fvec lo, fvec hi, fvec * d) {
fvec c_1 = fvec::set1(1);
fvec c_2 = fvec::set1(2);
fvec c_3 = fvec::set1(3);
fvec c_6 = fvec::set1(6);
bvec m_lo = fvec::cmple(r, lo);
bvec m_hi = fvec::cmpnlt(r, hi); // nlt == ge
bvec m_tr = bvec::kandn(m_lo, ~ m_hi);
fvec ret = c_1;
ret = fvec::mask_blend(m_hi, ret, fvec::setzero());
fvec der = fvec::setzero();
if (bvec::test_any_set(m_tr)) {
fvec diff = hi - lo;
fvec rcp = fvec::recip(diff);
fvec t = (r - lo) * rcp;
fvec v = c_1 - t * t * ( c_3 - c_2 * t);
ret = fvec::mask_blend(m_tr, ret, v);
fvec dv = c_6 * rcp * ( t * t - t);
der = fvec::mask_blend(m_tr, der, dv);
}
*d = der;
return ret;
}
static fvec aut_Sp_deriv(fvec r, fvec lo, fvec hi, fvec * d) {
fvec c_1 = fvec::set1(1);
fvec c_0_5 = fvec::set1(0.5);
fvec c_m0_5 = fvec::set1(-0.5);
fvec c_PI = fvec::set1(M_PI);
bvec m_lo = fvec::cmple(r, lo);
bvec m_hi = fvec::cmpnlt(r, hi); // nlt == ge
bvec m_tr = bvec::kandn(m_lo, ~ m_hi);
fvec ret = c_1;
ret = fvec::mask_blend(m_hi, ret, fvec::setzero());
fvec der = fvec::setzero();
if (bvec::test_any_set(m_tr)) {
fvec diff = hi - lo;
fvec rcp = fvec::mask_recip(c_1, m_tr, diff);
fvec t = (r - lo) / diff;
fvec sinval, cosval;
sinval = fvec::mask_sincos(&cosval, fvec::setzero(), c_1, m_tr, c_PI * t);
fvec v = c_0_5 * ( c_1 + cosval);
ret = fvec::mask_blend(m_tr, ret, v);
fvec dv = c_PI * c_m0_5 * rcp * sinval;
der = fvec::mask_blend(m_tr, der, dv);
}
*d = der;
return ret;
}
static fvec aut_mask_Sp(bvec mask, fvec r, fvec lo, fvec hi) {
fvec c_1 = fvec::set1(1);
fvec c_0_5 = fvec::set1(0.5);
fvec c_PI = fvec::set1(M_PI);
bvec m_lo = fvec::mask_cmple(mask, r, lo);
bvec m_hi = fvec::mask_cmpnlt(mask, r, hi); // nlt == ge
bvec m_tr = bvec::kandn(m_lo, bvec::kandn(m_hi, mask));
fvec ret = c_1;
ret = fvec::mask_blend(m_hi, ret, fvec::setzero());
if (bvec::test_any_set(m_tr)) {
fvec rcp = fvec::mask_recip(c_1, m_tr, hi - lo);
fvec t = (r - lo) * rcp;
fvec v = c_0_5 * ( c_1 + fvec::mask_cos(c_1, m_tr, c_PI * t));
ret = fvec::mask_blend(m_tr, ret, v);
}
return ret;
}
static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
int offset = ka->neigh_from_atom * ka->num_neighs_per_atom;
ivec c_CARBON = ivec::setzero();
int map_i = 0;
int i;
for (i = 1; i < ka->num_types; i++) {
if (ka->map[i])
map_i |= (1 << i);
}
ivec c_i1 = ivec::set1(1);
ivec c_im = ivec::set1(map_i);
AtomAIREBOT<flt_t> * _noalias x = ka->x;
for (i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) {
fvec x_i = fvec::set1(x[i].x);
fvec y_i = fvec::set1(x[i].y);
fvec z_i = fvec::set1(x[i].z);
int itype = ka->map[ka->x[i].w];
fvec rcmaxsq0 = fvec::set1(ka->params.rcmaxsq[itype][0]);
fvec rcmaxsq1 = fvec::set1(ka->params.rcmaxsq[itype][1]);
fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
fvec rcmaxskinsq0 = fvec::set1(
(ka->params.rcmax[itype][0] + ka->skin) * (ka->params.rcmax[itype][0] +
ka->skin));
fvec rcmaxskinsq1 = fvec::set1(
(ka->params.rcmax[itype][1] + ka->skin) * (ka->params.rcmax[itype][1] +
ka->skin));
fvec nC = fvec::setzero();
fvec nH = fvec::setzero();
ka->neigh_rebo.offset[i] = offset;
int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] :
ka->neigh_rebo.num_half[i];
int * neighs = ka->rebuild_flag ?
&ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] :
&ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum];
int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom];
int n = 0;
int n_skin = 0;
int lowest_idx;
#pragma unroll(4)
for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) {
bvec j_mask = bvec::full();
if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx);
int * _noalias neighs_l = neighs + lowest_idx;
fvec x_j, y_j, z_j;
bvec jtype_mask;
ivec ji = ivec::maskz_loadu(j_mask, neighs_l);
aut_loadatoms_vec(x, ji,
&x_j, &y_j, &z_j, &jtype_mask, ka->map, c_im, c_i1);
fvec delx = x_i - x_j;
fvec dely = y_i - y_j;
fvec delz = z_i - z_j;
fvec rsq = delx * delx + dely * dely + delz * delz;
if (ka->rebuild_flag) {
fvec rcmaxskinsq = fvec::mask_blend(jtype_mask, rcmaxskinsq0,
rcmaxskinsq1);
bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxskinsq);
ivec::mask_compressstore(c_mask, &skin_target[n_skin], ji);
n_skin += bvec::popcnt(c_mask);
}
fvec rcmaxsq = fvec::mask_blend(jtype_mask, rcmaxsq0, rcmaxsq1);
bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxsq);
if (bvec::test_all_unset(c_mask)) continue;
ivec::mask_compressstore(c_mask, &ka->neigh_rebo.entries[offset + n], ji);
n += bvec::popcnt(c_mask);
fvec rcmax = fvec::mask_blend(jtype_mask, rcmax0, rcmax1);
fvec rcmin = fvec::mask_blend(jtype_mask, rcmin0, rcmin1);
fvec sp = aut_mask_Sp(c_mask, fvec::sqrt(rsq), rcmin, rcmax);
nC = fvec::mask_add(nC, bvec::kandn(jtype_mask, c_mask), nC, sp);
nH = fvec::mask_add(nH, bvec::kand (jtype_mask, c_mask), nH, sp);
}
ka->neigh_rebo.num[i] = n;
if (ka->rebuild_flag) {
for (int i = 0; i < n_skin; i++) {
ka->neigh_rebo.entries[offset+n_skin+i] = skin_target[i];
}
}
if (ka->rebuild_flag) {
assert(n <= n_skin);
offset += 2 * n_skin;
ka->neigh_rebo.num_half[i] = n_skin;
} else {
assert(n <= jnum);
offset += 2 * jnum;
}
ka->nC[i] = fvec::reduce_add(nC);
ka->nH[i] = fvec::reduce_add(nH);
}
}
static fvec aut_eval_poly_lin_pd_2(int n, flt_t * vals, ivec idx, fvec x,
fvec * deriv) {
fvec c_1 = fvec::set1(1);
fvec x_i = c_1;
fvec x_im1 = fvec::setzero();
fvec result = fvec::setzero();
fvec i_v = fvec::setzero();
*deriv = fvec::setzero();
int i;
for (i = 0; i < n; i++) {
fvec coeff = fvec::gather(idx, vals + i, sizeof(flt_t));
result = result + coeff * x_i;
*deriv = *deriv + coeff * x_im1 * i_v;
x_im1 = x_i;
x_i = x_i * x;
i_v = i_v + c_1;
}
return result;
}
static fvec aut_mask_gSpline_pd_2(KernelArgsAIREBOT<flt_t,acc_t> * ka,
bvec active_mask, int itype, fvec cosjik,
fvec Nij, fvec *dgdc, fvec *dgdN) {
int i;
flt_t * gDom = NULL;
int nDom = 0;
ivec offs = ivec::setzero();
fvec NCmin = fvec::set1(ka->params.NCmin);
bvec Ngt = fvec::cmpnle(Nij, NCmin); //gt
if (itype == 0) {
nDom = 4;
gDom = &ka->params.gCdom[0];
offs = ivec::mask_blend(Ngt, offs, ivec::set1(4*6));
} else {
nDom = 3;
gDom = &ka->params.gHdom[0];
offs = ivec::set1(8 * 6);
}
cosjik = fvec::max(fvec::set1(gDom[0]), fvec::min(fvec::set1(gDom[nDom]),
cosjik));
ivec index6 = ivec::setzero();
for (i = 0; i < nDom; i++) {
bvec cosge = fvec::cmpnlt(cosjik, fvec::set1(gDom[i])); //ge
bvec cosle = fvec::cmple(cosjik, fvec::set1(gDom[i+1]));
index6 = ivec::mask_blend(cosge & cosle, index6, ivec::set1(6*i));
}
fvec g = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], offs + index6,
cosjik, dgdc);
*dgdN = fvec::setzero();
if (itype == 0) {
fvec NCmax = fvec::set1(ka->params.NCmax);
bvec Nlt = fvec::cmplt(Nij, NCmax); //gt
bvec Nmask = Ngt & Nlt;
if (bvec::test_any_set(Nmask)) {
fvec dg1;
fvec g1 = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], index6, cosjik,
&dg1);
fvec dS;
fvec cut = aut_Sp_deriv(Nij, NCmin, NCmax, &dS);
*dgdN = fvec::mask_mul(*dgdN, Nmask, dS, g1 - g);
g = fvec::mask_add(g, Nmask, g, cut * ( g1 - g));
*dgdc = fvec::mask_add(*dgdc, Nmask, *dgdc, cut * ( dg1 - *dgdc));
}
}
return g;
}
static fvec aut_PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, fvec NijC, fvec NijH, fvec *dN2) {
flt_t ret[fvec::VL] __attribute__((aligned(64)));
flt_t dN20[fvec::VL] __attribute__((aligned(64)));
flt_t dN21[fvec::VL] __attribute__((aligned(64)));
flt_t NijC_[fvec::VL] __attribute__((aligned(64)));
flt_t NijH_[fvec::VL] __attribute__((aligned(64)));
flt_t tmp_dN2[2];
fvec::store(NijC_, NijC);
fvec::store(NijH_, NijH);
int i;
for (i = 0; i < fvec::VL; i++) {
ret[i] = PijSpline(ka, itype, jtype, NijC_[i], NijH_[i], tmp_dN2);
dN20[i] = tmp_dN2[0];
dN21[i] = tmp_dN2[1];
}
dN2[0] = fvec::load(dN20);
dN2[1] = fvec::load(dN21);
return fvec::load(ret);
}
/*
* aut_frebo_data stores all the short-ranged coordinations
* and intermediate values that get reused frequently during
* bondorder calculations.
* BUF_CAP should rarely exceed 4, so 8 is a very conservative
* value.
*/
static const int BUF_CAP = 8;
struct aut_frebo_data {
fvec rikx_buf[BUF_CAP];
fvec riky_buf[BUF_CAP];
fvec rikz_buf[BUF_CAP];
fvec rikmag_buf[BUF_CAP];
fvec cosjik_buf[BUF_CAP];
ivec k_buf[BUF_CAP];
fvec g_buf[BUF_CAP];
fvec dgdc_buf[BUF_CAP];
fvec ex_lam_buf[BUF_CAP];
fvec wik_buf[BUF_CAP];
fvec dwik_buf[BUF_CAP];
fvec cutN_buf[BUF_CAP];
fvec dcutN_buf[BUF_CAP];
bvec ktype_buf[BUF_CAP];
bvec mask_buf[BUF_CAP];
fvec force_k_x_buf[BUF_CAP];
fvec force_k_y_buf[BUF_CAP];
fvec force_k_z_buf[BUF_CAP];
int buf_len;
fvec x_i;
fvec y_i;
fvec z_i;
fvec x_j;
fvec y_j;
fvec z_j;
fvec nCi;
fvec nHi;
fvec force_i_x;
fvec force_i_y;
fvec force_i_z;
fvec force_j_x;
fvec force_j_y;
fvec force_j_z;
};
/*
* Initialize values in aut_frebo_data and perform the calculations
* for p_ij.
*/
static fvec aut_frebo_pij_pd_2(
KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
struct aut_frebo_data * _noalias data,
int itype, int jtype,
ivec vi, ivec vj,
fvec rijx, fvec rijy, fvec rijz, fvec rijmag,
fvec wij, fvec VA, fvec * sum_N, fvec fij[3]
) {
AtomAIREBOT<flt_t> * _noalias x = ka->x;
int * _noalias map = ka->map;
flt_t * _noalias nC = ka->nC;
flt_t * _noalias nH = ka->nH;
fvec x_i, y_i, z_i;
fvec x_j, y_j, z_j;
x_i = data->x_i;
y_i = data->y_i;
z_i = data->z_i;
x_j = data->x_j;
y_j = data->y_j;
z_j = data->z_j;
fvec invrijm = fvec::recip(rijmag);
fvec invrijm2 = invrijm * invrijm;
fvec rcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
fvec rcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
fvec Nmin = fvec::set1(ka->params.Nmin);
fvec Nmax = fvec::set1(ka->params.Nmax);
int map_i_scalar = 0;
{
int i;
for (i = 1; i < ka->num_types; i++) {
if (ka->map[i])
map_i_scalar |= (1 << i);
}
}
ivec map_i = ivec::set1(map_i_scalar);
fvec nCi = data->nCi;
fvec nHi = data->nHi;
fvec Nij = nHi + nCi - wij;
fvec factor_jtype, factor_not_jtype;
if (jtype) {
factor_jtype = fvec::set1(1);
factor_not_jtype = fvec::set1(0);
} else {
factor_jtype = fvec::set1(0);
factor_not_jtype = fvec::set1(1);
}
fvec NijC = nCi - wij * factor_not_jtype;
fvec NijH = nHi - wij * factor_jtype;
fvec sum_pij = fvec::setzero();
fvec sum_dpij_dN = fvec::setzero();
fvec dN2[2];
ivec offseti = ivec::mask_gather(ivec::setzero(), bvec::full(), vi,
ka->neigh_rebo.offset, sizeof(int));
int buf_len = 0;
ivec knum = ivec::mask_gather(ivec::setzero(), bvec::full(), vi,
ka->neigh_rebo.num, sizeof(int));
ivec kk = ivec::setzero();
bvec active_mask = ivec::cmplt(kk, knum);
ivec c_i1 = ivec::set1(1);
fvec rho_j = fvec::set1(ka->params.rho[jtype][1]);
fvec rho_k0 = fvec::set1(ka->params.rho[0][1]);
fvec rho_k1 = fvec::set1(ka->params.rho[1][1]);
fvec c_4 = fvec::set1(4);
fvec c_2_0 = fvec::set1(2.0);
fvec c_m2_0 = fvec::set1(-2.0);
fvec c_4_0 = fvec::set1(4.0);
fvec c_0_5 = fvec::set1(0.5);
fvec c_m0_5 = fvec::set1(-0.5);
fvec c_1 = fvec::set1(1);
fvec c_m1 = fvec::set1(-1);
fvec factor_itype = itype ? c_1 : fvec::setzero();
fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
fvec result_f_i_x = fvec::setzero();
fvec result_f_i_y = fvec::setzero();
fvec result_f_i_z = fvec::setzero();
fvec result_f_j_x = fvec::setzero();
fvec result_f_j_y = fvec::setzero();
fvec result_f_j_z = fvec::setzero();
*sum_N = fvec::setzero();
{
while (bvec::test_any_set(active_mask)) {
ivec k = ivec::mask_gather(ivec::setzero(), active_mask, kk + offseti,
ka->neigh_rebo.entries, sizeof(int));
bvec excluded_mask = ivec::cmpeq(k, vj) & active_mask;
if (bvec::test_any_set(excluded_mask)) {
kk = ivec::mask_add(kk, excluded_mask, kk, c_i1);
active_mask = ivec::cmplt(kk, knum);
continue;
}
fvec x_k, y_k, z_k;
bvec ktype_mask;
aut_loadatoms_vec(x, k, &x_k, &y_k, &z_k, &ktype_mask, ka->map, map_i,
c_i1);
fvec rikx = x_i - x_k;
fvec riky = y_i - y_k;
fvec rikz = z_i - z_k;
fvec rikmag = fvec::sqrt(rikx * rikx + riky * riky + rikz * rikz);
fvec rho_k = fvec::mask_blend(ktype_mask, rho_k0, rho_k1);
fvec lamdajik = c_4 * factor_itype * ( rho_k - rikmag - ( rho_j -
rijmag));
fvec ex_lam = fvec::exp(lamdajik);
fvec rcmax = fvec::mask_blend(ktype_mask, rcmax0, rcmax1);
fvec rcmin = fvec::mask_blend(ktype_mask, rcmin0, rcmin1);
fvec dwik;
fvec wik = aut_Sp_deriv(rikmag, rcmin, rcmax, &dwik);
fvec Nki = fvec::gather(k, nC, sizeof(flt_t)) +
fvec::gather(k, nH, sizeof(flt_t)) - wik;
fvec cosjik = (rijx * rikx + rijy * riky + rijz * rikz) /
( rijmag * rikmag);
cosjik = fvec::min(c_1, fvec::max(c_m1, cosjik));
fvec dgdc, dgdN;
fvec g = aut_mask_gSpline_pd_2(ka, active_mask, itype, cosjik, Nij,
&dgdc, &dgdN);
sum_pij = fvec::mask_add(sum_pij, active_mask, sum_pij, wik * g * ex_lam);
sum_dpij_dN = fvec::mask_add(sum_dpij_dN, active_mask, sum_dpij_dN,
wik * ex_lam * dgdN);
fvec dcutN;
fvec cutN = aut_Sp_deriv(Nki, Nmin, Nmax, &dcutN);
*sum_N = fvec::mask_add(*sum_N, active_mask, *sum_N,
fvec::mask_blend(ktype_mask, c_1,
fvec::setzero()) * wik * cutN);
if (buf_len == BUF_CAP) goto exceed_buffer;
data->rikx_buf[buf_len] = rikx;
data->riky_buf[buf_len] = riky;
data->rikz_buf[buf_len] = rikz;
data->rikmag_buf[buf_len] = rikmag;
data->cosjik_buf[buf_len] = cosjik;
data->ktype_buf[buf_len] = ktype_mask;
data->k_buf[buf_len] = k;
data->g_buf[buf_len] = g;
data->dgdc_buf[buf_len] = dgdc;
data->ex_lam_buf[buf_len] = ex_lam;
data->wik_buf[buf_len] = wik;
data->dwik_buf[buf_len] = dwik;
data->mask_buf[buf_len] = active_mask;
data->cutN_buf[buf_len] = cutN;
data->dcutN_buf[buf_len] = dcutN;
buf_len += 1;
kk = ivec::mask_add(kk, active_mask, kk, c_i1);
active_mask = ivec::cmplt(kk, knum);
}
data->buf_len = buf_len;
fvec PijS = aut_PijSpline(ka, itype, jtype, NijC, NijH, &dN2[0]);
fvec pij = fvec::invsqrt(c_1 + sum_pij + PijS);
fvec tmp = c_m0_5 * pij * pij * pij;
int buf_idx;
for (buf_idx = 0; buf_idx < buf_len; buf_idx++) {
fvec rikx = data->rikx_buf[buf_idx];
fvec riky = data->riky_buf[buf_idx];
fvec rikz = data->rikz_buf[buf_idx];
fvec rikmag = data->rikmag_buf[buf_idx];
fvec cosjik = data->cosjik_buf[buf_idx];
bvec ktype_mask = data->ktype_buf[buf_idx];
ivec k = data->k_buf[buf_idx];
fvec g = data->g_buf[buf_idx];
fvec dgdc = data->dgdc_buf[buf_idx];
fvec ex_lam = data->ex_lam_buf[buf_idx];
fvec wik = data->wik_buf[buf_idx];
fvec dwik = data->dwik_buf[buf_idx];
bvec mask = data->mask_buf[buf_idx];
fvec invrikm = fvec::recip(rikmag);
fvec rjkx = rikx - rijx;
fvec rjky = riky - rijy;
fvec rjkz = rikz - rijz;
fvec rjkmag = fvec::sqrt(
rjkx * rjkx + rjky * rjky + rjkz * rjkz);
fvec rijrik = c_2_0 * rijmag * rikmag;
fvec rr = rijmag * rijmag - rikmag * rikmag;
fvec dctdjk = c_m2_0 / rijrik;
fvec dctdik = (rjkmag * rjkmag - rr) / ( rijrik * rikmag * rikmag);
fvec dctdij = (rjkmag * rjkmag + rr) / ( rijrik * rijmag * rijmag);
fvec fi[3], fj[3], fk[3];
fvec pref = c_0_5 * VA * tmp;
fvec tmp20 = pref * wik * dgdc * ex_lam;
fj[0] = fj[1] = fj[2] = fvec::setzero();
fvec tmpdik = tmp20 * dctdik;
fi[0] = fvec::setzero() - tmpdik * rikx;
fi[1] = fvec::setzero() - tmpdik * riky;
fi[2] = fvec::setzero() - tmpdik * rikz;
fk[0] = tmpdik * rikx;
fk[1] = tmpdik * riky;
fk[2] = tmpdik * rikz;
fvec tmpdij = tmp20 * dctdij;
fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmpdij * rijx);
fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmpdij * rijy);
fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmpdij * rijz);
fvec tmpdjk = tmp20 * dctdjk;
fi[0] = fi[0] - tmpdjk * rjkx;
fi[1] = fi[1] - tmpdjk * rjky;
fi[2] = fi[2] - tmpdjk * rjkz;
fk[0] = fk[0] + tmpdjk * rjkx;
fk[1] = fk[1] + tmpdjk * rjky;
fk[2] = fk[2] + tmpdjk * rjkz;
fij[0] = fvec::mask_add(fij[0], mask, fij[0], tmpdjk * rjkx);
fij[1] = fvec::mask_add(fij[1], mask, fij[1], tmpdjk * rjky);
fij[2] = fvec::mask_add(fij[2], mask, fij[2], tmpdjk * rjkz);
if (itype) {
fvec tmp21 = pref * wik * g * ex_lam * c_4_0;
fvec tmp21ij = tmp21 * invrijm;
fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmp21ij * rijx);
fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmp21ij * rijy);
fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmp21ij * rijz);
fvec tmp21ik = tmp21 * invrikm;
fi[0] = fi[0] + tmp21ik * rikx;
fi[1] = fi[1] + tmp21ik * riky;
fi[2] = fi[2] + tmp21ik * rikz;
fk[0] = fk[0] - tmp21ik * rikx;
fk[1] = fk[1] - tmp21ik * riky;
fk[2] = fk[2] - tmp21ik * rikz;
}
// coordination forces
// dwik forces
fvec tmp22 = pref * dwik * g * ex_lam * invrikm;
fi[0] = fi[0] - tmp22 * rikx;
fi[1] = fi[1] - tmp22 * riky;
fi[2] = fi[2] - tmp22 * rikz;
fk[0] = fk[0] + tmp22 * rikx;
fk[1] = fk[1] + tmp22 * riky;
fk[2] = fk[2] + tmp22 * rikz;
// PIJ forces
fvec dN2ktype = fvec::mask_blend(ktype_mask, dN2[0], dN2[1]);
fvec tmp23 = pref * dN2ktype * dwik * invrikm;
fi[0] = fi[0] - tmp23 * rikx;
fi[1] = fi[1] - tmp23 * riky;
fi[2] = fi[2] - tmp23 * rikz;
fk[0] = fk[0] + tmp23 * rikx;
fk[1] = fk[1] + tmp23 * riky;
fk[2] = fk[2] + tmp23 * rikz;
// dgdN forces
fvec tmp24 = pref * sum_dpij_dN * dwik * invrikm;
fi[0] = fi[0] - tmp24 * rikx;
fi[1] = fi[1] - tmp24 * riky;
fi[2] = fi[2] - tmp24 * rikz;
fk[0] = fk[0] + tmp24 * rikx;
fk[1] = fk[1] + tmp24 * riky;
fk[2] = fk[2] + tmp24 * rikz;
result_f_i_x = fvec::mask_add(result_f_i_x, mask, result_f_i_x, fi[0]);
result_f_i_y = fvec::mask_add(result_f_i_y, mask, result_f_i_y, fi[1]);
result_f_i_z = fvec::mask_add(result_f_i_z, mask, result_f_i_z, fi[2]);
result_f_j_x = fvec::mask_add(result_f_j_x, mask, result_f_j_x, fj[0]);
result_f_j_y = fvec::mask_add(result_f_j_y, mask, result_f_j_y, fj[1]);
result_f_j_z = fvec::mask_add(result_f_j_z, mask, result_f_j_z, fj[2]);
data->force_k_x_buf[buf_idx] = fk[0];
data->force_k_y_buf[buf_idx] = fk[1];
data->force_k_z_buf[buf_idx] = fk[2];
}
data->force_i_x = result_f_i_x;
data->force_i_y = result_f_i_y;
data->force_i_z = result_f_i_z;
data->force_j_x = result_f_j_x;
data->force_j_y = result_f_j_y;
data->force_j_z = result_f_j_z;
return pij;
}
exceed_buffer:
data->buf_len = -1;
return fvec::setzero();
}
/*
* Apply the force values stored iin aut_frebo_data to
* the respective neighbors.
*/
static void aut_frebo_data_writeback(
KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
struct aut_frebo_data * _noalias data) {
ResultForceT<acc_t> * _noalias result_f = ka->result_f;
flt_t fk_x_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fk_y_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fk_z_buf[fvec::VL] __attribute__((aligned(64)));
int fk_k_buf[ivec::VL] __attribute__((aligned(64)));
int buf_idx;
for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
ivec k = data->k_buf[buf_idx];
bvec active_mask = data->mask_buf[buf_idx];
fvec::store(fk_x_buf, data->force_k_x_buf[buf_idx]);
fvec::store(fk_y_buf, data->force_k_y_buf[buf_idx]);
fvec::store(fk_z_buf, data->force_k_z_buf[buf_idx]);
ivec::store(fk_k_buf, k);
int lane;
for (lane = 0; lane < fvec::VL; lane++) {
if (bvec::test_at(active_mask, lane)) {} else continue;
int kk = fk_k_buf[lane];
result_f[kk].x += fk_x_buf[lane];
result_f[kk].y += fk_y_buf[lane];
result_f[kk].z += fk_z_buf[lane];
}
}
}
static void aut_frebo_N_spline_force(
KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
struct aut_frebo_data * _noalias data, int itype, int jtype, ivec vi,
ivec vj, fvec VA, fvec dN, fvec dNconj, fvec Nconj) {
ivec c_i1 = ivec::set1(1);
fvec c_2 = fvec::set1(2);
fvec c_TOL = fvec::set1(TOL);
ResultForceT<acc_t> * _noalias result_f = ka->result_f;
AtomAIREBOT<flt_t> * _noalias x = ka->x;
int * _noalias map = ka->map;
flt_t * _noalias nC = ka->nC;
flt_t * _noalias nH = ka->nH;
fvec x_i, y_i, z_i;
x_i = data->x_i;
y_i = data->y_i;
z_i = data->z_i;
fvec Nmin = fvec::set1(ka->params.Nmin);
fvec Nmax = fvec::set1(ka->params.Nmax);
int map_i_scalar = 0;
{
int i;
for (i = 1; i < ka->num_types; i++) {
if (ka->map[i])
map_i_scalar |= (1 << i);
}
}
ivec map_i = ivec::set1(map_i_scalar);
fvec dN2[2];
ivec kk = ivec::setzero();
fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
fvec result_f_i_x = fvec::setzero();
fvec result_f_i_y = fvec::setzero();
fvec result_f_i_z = fvec::setzero();
int buf_idx;
for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
ivec k = data->k_buf[buf_idx];
bvec active_mask = data->mask_buf[buf_idx];
fvec rikx = data->rikx_buf[buf_idx];
fvec riky = data->riky_buf[buf_idx];
fvec rikz = data->rikz_buf[buf_idx];
fvec rikmag = data->rikmag_buf[buf_idx];
bvec ktype_mask = data->ktype_buf[buf_idx];
fvec dwik = data->dwik_buf[buf_idx];
fvec wik = data->wik_buf[buf_idx];
fvec dNki = data->dcutN_buf[buf_idx];
fvec SpN = data->cutN_buf[buf_idx];
fvec invrikmag = fvec::recip(rikmag);
fvec pref = VA * dwik * invrikmag;
fvec fdN = dN * pref;
fvec fdNconj = pref * SpN * c_2 * dNconj * Nconj;
fvec ffactor = fdN;
bvec ktype_is_C = ~ ktype_mask;
ffactor = fvec::mask_add(ffactor, ktype_is_C, ffactor, fdNconj);
fvec fkx = ffactor * rikx;
fvec fky = ffactor * riky;
fvec fkz = ffactor * rikz;
data->force_k_x_buf[buf_idx] = data->force_k_x_buf[buf_idx] + fkx;
data->force_k_y_buf[buf_idx] = data->force_k_y_buf[buf_idx] + fky;
data->force_k_z_buf[buf_idx] = data->force_k_z_buf[buf_idx] + fkz;
result_f_i_x = fvec::mask_sub(result_f_i_x, active_mask, result_f_i_x, fkx);
result_f_i_y = fvec::mask_sub(result_f_i_y, active_mask, result_f_i_y, fky);
result_f_i_z = fvec::mask_sub(result_f_i_z, active_mask, result_f_i_z, fkz);
bvec need_k_neighs = fvec::mask_cmpnle(active_mask, fvec::abs(dNki), c_TOL)
& ktype_is_C;
if (bvec::test_any_set(need_k_neighs)) {
int lane;
for (lane = 0; lane < fvec::VL; lane++) {
if (! bvec::test_at(need_k_neighs, lane)) continue;
int kk = ivec::at(k, lane);
int k = kk;
int ktype = map[x[k].w];
int i = ivec::at(vi, lane);
fvec oldVA = VA;
double VA = fvec::at(oldVA, lane);
fvec oldwik = wik;
double wik = fvec::at(oldwik, lane);
fvec olddNconj = dNconj;
double dNconj = fvec::at(olddNconj, lane);
fvec oldNconj = Nconj;
double Nconj = fvec::at(oldNconj, lane);
fvec olddNki = dNki;
double dNki = fvec::at(olddNki, lane);
int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k];
int nnum = ka->neigh_rebo.num[k];
int nn;
for (nn = 0; nn < nnum; nn++) {
int n = neighs_k[nn];
if (n == i) continue;
double rknx = x[k].x - x[n].x;
double rkny = x[k].y - x[n].y;
double rknz = x[k].z - x[n].z;
double rknmag = sqrt(rknx * rknx + rkny * rkny + rknz * rknz);
int ntype = map[x[n].w];
double rcminkn = ka->params.rcmin[ktype][ntype];
double rcmaxkn = ka->params.rcmax[ktype][ntype];
double dwkn;
Sp(rknmag, rcminkn, rcmaxkn, &dwkn);
double ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag;
result_f[k].x -= ffactor * rknx;
result_f[k].y -= ffactor * rkny;
result_f[k].z -= ffactor * rknz;
result_f[n].x += ffactor * rknx;
result_f[n].y += ffactor * rkny;
result_f[n].z += ffactor * rknz;
}
}
}
}
data->force_i_x = data->force_i_x + result_f_i_x;
data->force_i_y = data->force_i_y + result_f_i_y;
data->force_i_z = data->force_i_z + result_f_i_z;
}
static fvec aut_frebo_pi_rc_pd(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, fvec Nij, fvec Nji, fvec Nijconj,
fvec * dN3) {
flt_t ret[fvec::VL] __attribute__((aligned(64)));
flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64)));
int i;
for (i = 0; i < fvec::VL; i++) {
flt_t dN3tmp[3];
ret[i] = frebo_pi_rc(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i),
fvec::at(Nijconj, i), &dN3tmp[0]);
dN3ret[0][i] = dN3tmp[0];
dN3ret[1][i] = dN3tmp[1];
dN3ret[2][i] = dN3tmp[2];
}
dN3[0] = fvec::load(&dN3ret[0][0]);
dN3[1] = fvec::load(&dN3ret[1][0]);
dN3[2] = fvec::load(&dN3ret[2][0]);
return fvec::load(&ret[0]);
}
static fvec aut_frebo_Tij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
int jtype, fvec Nij, fvec Nji, fvec Nijconj,
fvec * dN3) {
flt_t ret[fvec::VL] __attribute__((aligned(64)));
flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64)));
int i;
for (i = 0; i < fvec::VL; i++) {
flt_t dN3tmp[3];
ret[i] = frebo_Tij(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i),
fvec::at(Nijconj, i), &dN3tmp[0]);
dN3ret[0][i] = dN3tmp[0];
dN3ret[1][i] = dN3tmp[1];
dN3ret[2][i] = dN3tmp[2];
}
dN3[0] = fvec::load(&dN3ret[0][0]);
dN3[1] = fvec::load(&dN3ret[1][0]);
dN3[2] = fvec::load(&dN3ret[2][0]);
return fvec::load(&ret[0]);
}
static fvec aut_frebo_sum_omega(
KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
struct aut_frebo_data * _noalias i_data,
struct aut_frebo_data * _noalias j_data,
int itype, int jtype,
ivec vi, ivec vj,
fvec r23x, fvec r23y, fvec r23z, fvec r23mag,
fvec VA, fvec fij[3]
) {
fvec c_1 = fvec::set1(1);
fvec c_m1 = fvec::set1(-1);
fvec c_2 = fvec::set1(2);
fvec c_m2 = fvec::set1(-2);
fvec sum_omega = fvec::setzero();
fvec thmin = fvec::set1(ka->params.thmin);
fvec thmax = fvec::set1(ka->params.thmax);
// 2 == i, 3 == j
fvec r32x = fvec::setzero() - r23x;
fvec r32y = fvec::setzero() - r23y;
fvec r32z = fvec::setzero() - r23z;
int buf_idx_i, buf_idx_j;
for (buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) {
// a1 == k == buf_idx_i
bvec mask_start = i_data->mask_buf[buf_idx_i];
fvec r21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k
fvec r21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k
fvec r21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k
fvec r21mag = i_data->rikmag_buf[buf_idx_i];
// TODO use buffered cosjik
fvec cos321 = (
r23x * r21x + r23y * r21y + r23z * r21z) / ( r23mag * r21mag);
cos321 = fvec::min(c_1, fvec::max(c_m1, cos321));
fvec sin321 = fvec::sqrt(c_1 - cos321 * cos321);
bvec mask_outer = fvec::cmpneq(fvec::setzero(), sin321) & mask_start;
// add "continue"
fvec sink2i = fvec::mask_recip(fvec::undefined(), mask_outer,
sin321 * sin321);
fvec rik2i = fvec::mask_recip(fvec::undefined(), mask_outer,
r21mag * r21mag);
fvec rr = r23mag * r23mag - r21mag * r21mag;
fvec r31x = r21x - r23x;
fvec r31y = r21y - r23y;
fvec r31z = r21z - r23z;
fvec r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z;
fvec rijrik = c_2 * r23mag * r21mag;
fvec r21mag2 = r21mag * r21mag;
fvec dctik = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 - rr,
rijrik * r21mag2);
fvec dctij = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 + rr,
rijrik * r23mag * r23mag);
fvec dctjk = fvec::mask_div(fvec::undefined(), mask_outer, c_m2, rijrik);
fvec dw21 = i_data->dwik_buf[buf_idx_i];
fvec w21 = i_data->wik_buf[buf_idx_i];
fvec dtsjik;
fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik);
dtsjik = fvec::setzero() - dtsjik; // todo replace by appropriate xor.
ivec k = i_data->k_buf[buf_idx_i];
for (buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) {
// check l == k in second loop.
// l == a4 == buf_idx_j
ivec l = j_data->k_buf[buf_idx_j];
bvec mask_inner_0 = ivec::mask_cmpneq(mask_outer, k, l) &
j_data->mask_buf[buf_idx_j];
// add "continue"
fvec r34x = j_data->rikx_buf[buf_idx_j];
fvec r34y = j_data->riky_buf[buf_idx_j];
fvec r34z = j_data->rikz_buf[buf_idx_j];
fvec r34mag = j_data->rikmag_buf[buf_idx_j];
fvec cos234 = fvec::mask_div(fvec::undefined(), mask_inner_0,
r32x * r34x + r32y * r34y + r32z * r34z,
r23mag * r34mag);
cos234 = fvec::min(c_1, fvec::max(c_m1, cos234));
fvec sin234 = fvec::mask_sqrt(fvec::undefined(), mask_inner_0,
c_1 - cos234 * cos234);
bvec mask_inner_1 = fvec::mask_cmpneq(mask_inner_0, sin234,
fvec::setzero());
// add "continue"
fvec sinl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1,
sin234 * sin234);
fvec rjl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1,
r34mag * r34mag);
fvec dw34 = j_data->dwik_buf[buf_idx_j];
fvec w34 = j_data->wik_buf[buf_idx_j];
fvec rr = r23mag * r23mag - r34mag * r34mag;
fvec r24x = r23x + r34x;
fvec r24y = r23y + r34y;
fvec r24z = r23z + r34z;
fvec r242 = r24x * r24x + r24y * r24y + r24z * r24z;
fvec rijrjl = c_2 * r23mag * r34mag;
fvec rjl2 = r34mag * r34mag;
fvec dctjl = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 - rr,
rijrjl * rjl2);
fvec dctji = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 + rr,
rijrjl * r23mag * r23mag);
fvec dctil = fvec::mask_div(fvec::undefined(), mask_inner_1, c_m2,
rijrjl);
fvec dtsijl;
fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl);
dtsijl = fvec::setzero() - dtsijl;
fvec prefactor = VA;
fvec cross321x = r32y * r21z - r32z * r21y;
fvec cross321y = r32z * r21x - r32x * r21z;
fvec cross321z = r32x * r21y - r32y * r21x;
fvec cross234x = r23y * r34z - r23z * r34y;
fvec cross234y = r23z * r34x - r23x * r34z;
fvec cross234z = r23x * r34y - r23y * r34x;
fvec cwnum = cross321x * cross234x + cross321y * cross234y + cross321z *
cross234z;
fvec cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
fvec om1234 = fvec::mask_div(fvec::undefined(), mask_inner_1, cwnum,
cwnom);
fvec cw = om1234;
fvec sum_omega_contrib = (c_1 - om1234 * om1234) * w21 * w34 *
(c_1 - tspjik) * ( c_1 - tspijl);
sum_omega = fvec::mask_add(sum_omega, mask_inner_1, sum_omega,
sum_omega_contrib);
fvec dt1dik = rik2i - dctik * sink2i * cos321;
fvec dt1djk = fvec::setzero() - dctjk * sink2i * cos321;
fvec dt1djl = rjl2i - dctjl * sinl2i * cos234;
fvec dt1dil = fvec::setzero() - dctil * sinl2i * cos234;
fvec dt1dij = fvec::mask_div(fvec::undefined(), mask_inner_1, c_2,
r23mag * r23mag) -
dctij * sink2i * cos321 - dctji * sinl2i * cos234;
fvec dt2dikx = r23y * cross234z - r23z * cross234y;
fvec dt2diky = r23z * cross234x - r23x * cross234z;
fvec dt2dikz = r23x * cross234y - r23y * cross234x;
fvec dt2djlx = r23z * cross321y - r23y * cross321z;
fvec dt2djly = r23x * cross321z - r23z * cross321x;
fvec dt2djlz = r23y * cross321x - r23x * cross321y;
fvec dt2dijx = r21z * cross234y + r34y * cross321z -
( r34z * cross321y + r21y * cross234z);
fvec dt2dijy = r21x * cross234z + r34z * cross321x -
( r34x * cross321z + r21z * cross234x);
fvec dt2dijz = r21y * cross234x + r34x * cross321y -
( r34y * cross321x + r21x * cross234y);
fvec aa = prefactor * c_2 * fvec::mask_div(fvec::undefined(),
mask_inner_1, cw, cwnom) *
w21 * w34 * (c_1 - tspjik) * ( c_1 - tspijl);
fvec aaa1 = (fvec::setzero() - prefactor) * (c_1 - om1234 * om1234) *
(c_1 - tspjik) * (c_1 - tspijl);
fvec aaa2 = (fvec::setzero() - prefactor) * (c_1 - om1234 * om1234) *
w21 * w34;
fvec at2 = aa * cwnum;
fvec fcijpc = aaa2 * dtsjik * dctij * (c_1 - tspijl) + aaa2 * dtsijl *
dctji * (c_1 - tspjik) - dt1dij * at2;
fvec fcikpc = aaa2 * dtsjik * dctik * (c_1 - tspijl) - dt1dik * at2;
fvec fcjlpc = aaa2 * dtsijl * dctjl * (c_1 - tspjik) - dt1djl * at2;
fvec fcjkpc = aaa2 * dtsjik * dctjk * (c_1 - tspijl) - dt1djk * at2;
fvec fcilpc = aaa2 * dtsijl * dctil * (c_1 - tspjik) - dt1dil * at2;
fvec F23x = fcijpc * r23x + aa * dt2dijx;
fvec F23y = fcijpc * r23y + aa * dt2dijy;
fvec F23z = fcijpc * r23z + aa * dt2dijz;
fvec F12x = fcikpc * r21x + aa * dt2dikx;
fvec F12y = fcikpc * r21y + aa * dt2diky;
fvec F12z = fcikpc * r21z + aa * dt2dikz;
fvec F34x = fcjlpc * r34x + aa * dt2djlx;
fvec F34y = fcjlpc * r34y + aa * dt2djly;
fvec F34z = fcjlpc * r34z + aa * dt2djlz;
fvec F31x = fcjkpc * r31x;
fvec F31y = fcjkpc * r31y;
fvec F31z = fcjkpc * r31z;
fvec F24x = fcilpc * r24x;
fvec F24y = fcilpc * r24y;
fvec F24z = fcilpc * r24z;
fvec f1x = fvec::setzero() - ( F12x + F31x);
fvec f1y = fvec::setzero() - ( F12y + F31y);
fvec f1z = fvec::setzero() - ( F12z + F31z);
fvec f2x = F12x + F31x;
fvec f2y = F12y + F31y;
fvec f2z = F12z + F31z;
fvec f3x = F34x + F24x;
fvec f3y = F34y + F24y;
fvec f3z = F34z + F24z;
fvec f4x = fvec::setzero() - ( F34x + F24x);
fvec f4y = fvec::setzero() - ( F34y + F24y);
fvec f4z = fvec::setzero() - ( F34z + F24z);
fij[0] = fvec::mask_add(fij[0], mask_inner_1, fij[0],
F23x + F24x - F31x);
fij[1] = fvec::mask_add(fij[1], mask_inner_1, fij[1],
F23y + F24y - F31y);
fij[2] = fvec::mask_add(fij[2], mask_inner_1, fij[2],
F23z + F24z - F31z);
fvec tmp20 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) *
(c_1 - tspijl) * dw21 * w34 * fvec::mask_recip(fvec::undefined(),
mask_inner_1, r21mag);
f2x = f2x - tmp20 * r21x;
f2y = f2y - tmp20 * r21y;
f2z = f2z - tmp20 * r21z;
f1x = f1x + tmp20 * r21x;
f1y = f1y + tmp20 * r21y;
f1z = f1z + tmp20 * r21z;
fvec tmp21 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) *
(c_1 - tspijl) * w21 * dw34 * fvec::mask_recip(fvec::undefined(),
mask_inner_1, r34mag);
f3x = f3x - tmp21 * r34x;
f3y = f3y - tmp21 * r34y;
f3z = f3z - tmp21 * r34z;
f4x = f4x + tmp21 * r34x;
f4y = f4y + tmp21 * r34y;
f4z = f4z + tmp21 * r34z;
// 1 == buf_idx_i, 2 == i, 3 == j, 4 == buf_idx_j
i_data->force_k_x_buf[buf_idx_i] =
fvec::mask_add(i_data->force_k_x_buf[buf_idx_i],
mask_inner_1, i_data->force_k_x_buf[buf_idx_i], f1x);
i_data->force_k_y_buf[buf_idx_i] =
fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_1,
i_data->force_k_y_buf[buf_idx_i], f1y);
i_data->force_k_z_buf[buf_idx_i] =
fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_1,
i_data->force_k_z_buf[buf_idx_i], f1z);
i_data->force_i_x =
fvec::mask_add(i_data->force_i_x, mask_inner_1, i_data->force_i_x, f2x);
i_data->force_i_y =
fvec::mask_add(i_data->force_i_y, mask_inner_1, i_data->force_i_y, f2y);
i_data->force_i_z =
fvec::mask_add(i_data->force_i_z, mask_inner_1, i_data->force_i_z, f2z);
j_data->force_i_x =
fvec::mask_add(j_data->force_i_x, mask_inner_1, j_data->force_i_x, f3x);
j_data->force_i_y =
fvec::mask_add(j_data->force_i_y, mask_inner_1, j_data->force_i_y, f3y);
j_data->force_i_z =
fvec::mask_add(j_data->force_i_z, mask_inner_1, j_data->force_i_z, f3z);
j_data->force_k_x_buf[buf_idx_j] =
fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_1,
j_data->force_k_x_buf[buf_idx_j], f4x);
j_data->force_k_y_buf[buf_idx_j] =
fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_1,
j_data->force_k_y_buf[buf_idx_j], f4y);
j_data->force_k_z_buf[buf_idx_j] =
fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_1,
j_data->force_k_z_buf[buf_idx_j], f4z);
}
}
return sum_omega;
}
static fvec aut_frebo_pi_dh(
KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
struct aut_frebo_data * _noalias i_data,
struct aut_frebo_data * _noalias j_data,
int itype, int jtype, ivec vi, ivec vj,
fvec r23x, fvec r23y, fvec r23z, fvec r23mag,
fvec VA,
fvec Nij, fvec Nji, fvec Nijconj, fvec NconjtmpI, fvec NconjtmpJ,
fvec fij[3]
) {
fvec c_TOL = fvec::set1(TOL);
fvec dN3[3];
fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3[0]);
bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL);
fvec sum_omega = fvec::setzero();
if (bvec::test_any_set(TijgtTOLmask)) {
sum_omega = aut_frebo_sum_omega(
ka, i_data, j_data, itype, jtype, vi, vj,
r23x, r23y, r23z, r23mag, VA * Tij, fij);
sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega);
aut_frebo_N_spline_force(ka, i_data, itype, jtype, vi, vj, VA * sum_omega,
dN3[0], dN3[2], NconjtmpI);
aut_frebo_N_spline_force(ka, j_data, jtype, itype, vj, vi, VA * sum_omega,
dN3[1], dN3[2], NconjtmpJ);
}
return Tij * sum_omega;
}
/*
We can reuse the aut_frebo_data buffers here to do this calculation very
cheaply.
*/
static void aut_torsion_vec(
KernelArgsAIREBOT<flt_t,acc_t> * ka,
struct aut_frebo_data * i_data,
struct aut_frebo_data * j_data,
ivec i, ivec j, fvec wij, fvec dwij
) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
flt_t (*epsilonT)[2] = ka->params.epsilonT;
fvec epsilonT00 = fvec::set1(epsilonT[0][0]);
fvec epsilonT01 = fvec::set1(epsilonT[0][1]);
fvec epsilonT10 = fvec::set1(epsilonT[1][0]);
fvec epsilonT11 = fvec::set1(epsilonT[1][1]);
fvec thmin = fvec::set1(ka->params.thmin);
fvec thmax = fvec::set1(ka->params.thmax);
const fvec c_1_0 = fvec::set1(1.0);
const fvec c_0_5 = fvec::set1(0.5);
const fvec c_0_1 = fvec::set1(0.1);
const fvec c_2_0 = fvec::set1(2.0);
const fvec c_2_5 = fvec::set1(2.5);
const fvec c_256_405 = fvec::set1(256.0/405.0);
fvec del32x = j_data->x_i - i_data->x_i;
fvec del32y = j_data->y_i - i_data->y_i;
fvec del32z = j_data->z_i - i_data->z_i;
fvec rsq = del32x * del32x + del32y * del32y + del32z * del32z;
fvec r32 = fvec::sqrt(rsq);
fvec del23x = fvec::setzero() - del32x;
fvec del23y = fvec::setzero() - del32y;
fvec del23z = fvec::setzero() - del32z;
fvec r23 = r32;
fvec w23 = wij;
fvec dw23 = dwij;
for (int buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) {
bvec mask_start = i_data->mask_buf[buf_idx_i];
fvec del21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k
fvec del21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k
fvec del21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k
fvec r21 = i_data->rikmag_buf[buf_idx_i];
fvec cos321 = i_data->cosjik_buf[buf_idx_i];
fvec sin321 = fvec::sqrt(c_1_0 - cos321 * cos321);
// strictly equivalent to sin321 < TOL
mask_start = fvec::mask_cmpneq(mask_start, fvec::setzero(), sin321);
if (! bvec::test_any_set(mask_start)) continue;
fvec deljkx = del21x - del23x;
fvec deljky = del21y - del23y;
fvec deljkz = del21z - del23z;
fvec rjk2 = deljkx * deljkx + deljky * deljky + deljkz * deljkz;
fvec rjk = fvec::sqrt(rjk2);
fvec rik2 = r21 * r21;
fvec w21 = i_data->wik_buf[buf_idx_i];
fvec dw21 = i_data->dwik_buf[buf_idx_i];
fvec rij = r32;
fvec rik = r21;
fvec rij2 = r32 * r32;
fvec dtsjik;
fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik);
dtsjik = fvec::setzero() - dtsjik;
bvec ktype_mask = i_data->ktype_buf[buf_idx_i];
fvec epsilonT0 = fvec::mask_blend(ktype_mask, epsilonT00, epsilonT10);
fvec epsilonT1 = fvec::mask_blend(ktype_mask, epsilonT01, epsilonT11);
ivec k = i_data->k_buf[buf_idx_i];
for (int buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) {
ivec l = j_data->k_buf[buf_idx_j];
bvec mask_inner_0 = ivec::mask_cmpneq(mask_start, k, l) &
j_data->mask_buf[buf_idx_j];
if (! bvec::test_any_set(mask_inner_0)) continue;
fvec del34x = j_data->rikx_buf[buf_idx_j];
fvec del34y = j_data->riky_buf[buf_idx_j];
fvec del34z = j_data->rikz_buf[buf_idx_j];
fvec r34 = j_data->rikmag_buf[buf_idx_j];
bvec ltype_mask = j_data->ktype_buf[buf_idx_j];
fvec cos234 = j_data->cosjik_buf[buf_idx_j];
fvec sin234 = fvec::sqrt(c_1_0 - cos234 * cos234);
// strictly equivalent to sin234 < TOL
mask_inner_0 = fvec::mask_cmpneq(mask_inner_0, sin234, fvec::setzero());
if (! bvec::test_any_set(mask_inner_0)) continue;
fvec dw34 = j_data->dwik_buf[buf_idx_j];
fvec w34 = j_data->wik_buf[buf_idx_j];
fvec delilx = del23x + del34x;
fvec delily = del23y + del34y;
fvec delilz = del23z + del34z;
fvec ril2 = delilx * delilx + delily * delily + delilz * delilz;
fvec ril = fvec::sqrt(ril2);
fvec rjl2 = r34 * r34;
fvec rjl = r34;
fvec dtsijl;
fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl);
dtsijl = fvec::setzero() - dtsijl;
fvec cross321x = del32y * del21z - del32z * del21y;
fvec cross321y = del32z * del21x - del32x * del21z;
fvec cross321z = del32x * del21y - del32y * del21x;
fvec cross321mag = fvec::sqrt(cross321x * cross321x +
cross321y * cross321y +
cross321z * cross321z);
fvec cross234x = del23y * del34z - del23z * del34y;
fvec cross234y = del23z * del34x - del23x * del34z;
fvec cross234z = del23x * del34y - del23y * del34x;
fvec cross234mag = fvec::sqrt(cross234x * cross234x +
cross234y * cross234y +
cross234z * cross234z);
fvec cwnum = cross321x * cross234x + cross321y * cross234y +
cross321z * cross234z;
fvec cwnom = r21 * r34 * r32 * r32 * sin321 * sin234;
fvec cw = cwnum / cwnom;
fvec cw2 = c_0_5 * ( c_1_0 - cw);
fvec ekijl = fvec::mask_blend(ltype_mask, epsilonT0, epsilonT1);
fvec Ec = c_256_405 * ekijl;
fvec cw2_5 = cw2 * cw2 * cw2 * cw2 * cw2;
fvec Vtors = Ec * cw2_5 - ekijl * c_0_1;
fvec evdwl = Vtors * w21 * w23 * w34 * (c_1_0-tspjik) * (c_1_0-tspijl);
ka->result_eng += fvec::mask_reduce_add(mask_inner_0, evdwl);
fvec dndijx = cross234y * del21z - cross234z * del21y;
fvec dndijy = cross234z * del21x - cross234x * del21z;
fvec dndijz = cross234x * del21y - cross234y * del21x;
fvec tmpvecx = del34y * cross321z - del34z * cross321y;
fvec tmpvecy = del34z * cross321x - del34x * cross321z;
fvec tmpvecz = del34x * cross321y - del34y * cross321x;
dndijx = dndijx + tmpvecx;
dndijy = dndijy + tmpvecy;
dndijz = dndijz + tmpvecz;
fvec dndikx = del23y * cross234z - del23z * cross234y;
fvec dndiky = del23z * cross234x - del23x * cross234z;
fvec dndikz = del23x * cross234y - del23y * cross234x;
fvec dndjlx = cross321y * del23z - cross321z * del23y;
fvec dndjly = cross321z * del23x - cross321x * del23z;
fvec dndjlz = cross321x * del23y - cross321y * del23x;
fvec r23sq = r23 * r23;
fvec r21sq = r21 * r21;
fvec r34sq = r34 * r34;
fvec rjksq = rjk * rjk;
fvec rilsq = ril * ril;
fvec dcidij = (r23sq - r21sq + rjksq) / ( c_2_0 * r23sq * r21);
fvec dcidik = (r21sq - r23sq + rjksq) / ( c_2_0 * r21sq * r23);
fvec dcidjk = fvec::setzero() - rjk / ( r23 * r21);
fvec dcjdji = (r23sq - r34sq + rilsq) / ( c_2_0 * r23sq * r34);
fvec dcjdjl = (r34sq - r23sq + rilsq) / ( c_2_0 * r34sq * r23);
fvec dcjdil = fvec::setzero() - ril / ( r23 * r34);
fvec dsidij = fvec::setzero() - cos321 / sin321 * dcidij;
fvec dsidik = fvec::setzero() - cos321 / sin321 * dcidik;
fvec dsidjk = fvec::setzero() - cos321 / sin321 * dcidjk;
fvec dsjdji = fvec::setzero() - cos234 / sin234 * dcjdji;
fvec dsjdjl = fvec::setzero() - cos234 / sin234 * dcjdjl;
fvec dsjdil = fvec::setzero() - cos234 / sin234 * dcjdil;
fvec dxidij = r21 * sin321 + r23 * r21 * dsidij;
fvec dxidik = r23 * sin321 + r23 * r21 * dsidik;
fvec dxidjk = r23 * r21 * dsidjk;
fvec dxjdji = r34 * sin234 + r23 * r34 * dsjdji;
fvec dxjdjl = r23 * sin234 + r23 * r34 * dsjdjl;
fvec dxjdil = r23 * r34 * dsjdil;
fvec ddndij = dxidij * cross234mag + cross321mag * dxjdji;
fvec ddndik = dxidik * cross234mag;
fvec ddndjk = dxidjk * cross234mag;
fvec ddndjl = cross321mag * dxjdjl;
fvec ddndil = cross321mag * dxjdil;
fvec dcwddn = fvec::setzero() - cwnum / ( cwnom * cwnom);
fvec dcwdn = fvec::recip(cwnom);
fvec cw2_4 = cw2 * cw2 * cw2 * cw2;
fvec dvpdcw = c_2_5 * Ec * cw2_4 * w23 * w21 * w34 * (c_1_0 - tspjik) *
(c_1_0 - tspijl);
fvec Ftmpx = dvpdcw * (dcwdn * dndijx + dcwddn * ddndij * del23x / r23);
fvec Ftmpy = dvpdcw * (dcwdn * dndijy + dcwddn * ddndij * del23y / r23);
fvec Ftmpz = dvpdcw * (dcwdn * dndijz + dcwddn * ddndij * del23z / r23);
fvec fix = Ftmpx;
fvec fiy = Ftmpy;
fvec fiz = Ftmpz;
fvec fjx = fvec::setzero() - Ftmpx;
fvec fjy = fvec::setzero() - Ftmpy;
fvec fjz = fvec::setzero() - Ftmpz;
Ftmpx = dvpdcw * (dcwdn * dndikx + dcwddn * ddndik * del21x / r21);
Ftmpy = dvpdcw * (dcwdn * dndiky + dcwddn * ddndik * del21y / r21);
Ftmpz = dvpdcw * (dcwdn * dndikz + dcwddn * ddndik * del21z / r21);
fix = fix + Ftmpx;
fiy = fiy + Ftmpy;
fiz = fiz + Ftmpz;
fvec fkx = fvec::setzero() - Ftmpx;
fvec fky = fvec::setzero() - Ftmpy;
fvec fkz = fvec::setzero() - Ftmpz;
Ftmpx = dvpdcw * dcwddn * ddndjk * deljkx / rjk;
Ftmpy = dvpdcw * dcwddn * ddndjk * deljky / rjk;
Ftmpz = dvpdcw * dcwddn * ddndjk * deljkz / rjk;
fjx = fjx + Ftmpx;
fjy = fjy + Ftmpy;
fjz = fjz + Ftmpz;
fkx = fkx - Ftmpx;
fky = fky - Ftmpy;
fkz = fkz - Ftmpz;
Ftmpx = dvpdcw * (dcwdn * dndjlx + dcwddn * ddndjl * del34x / r34);
Ftmpy = dvpdcw * (dcwdn * dndjly + dcwddn * ddndjl * del34y / r34);
Ftmpz = dvpdcw * (dcwdn * dndjlz + dcwddn * ddndjl * del34z / r34);
fjx = fjx + Ftmpx;
fjy = fjy + Ftmpy;
fjz = fjz + Ftmpz;
fvec flx = fvec::setzero() - Ftmpx;
fvec fly = fvec::setzero() - Ftmpy;
fvec flz = fvec::setzero() - Ftmpz;
Ftmpx = dvpdcw * dcwddn * ddndil * delilx / ril;
Ftmpy = dvpdcw * dcwddn * ddndil * delily / ril;
Ftmpz = dvpdcw * dcwddn * ddndil * delilz / ril;
fix = fix + Ftmpx;
fiy = fiy + Ftmpy;
fiz = fiz + Ftmpz;
flx = flx - Ftmpx;
fly = fly - Ftmpy;
flz = flz - Ftmpz;
// coordination forces
fvec fpair = Vtors * dw21 * w23 * w34 * (c_1_0 - tspjik) *
(c_1_0 - tspijl) / r21;
fix = fix - del21x * fpair;
fiy = fiy - del21y * fpair;
fiz = fiz - del21z * fpair;
fkx = fkx + del21x * fpair;
fky = fky + del21y * fpair;
fkz = fkz + del21z * fpair;
fpair = Vtors * w21 * dw23 * w34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) /
r23;
fix = fix - del23x * fpair;
fiy = fiy - del23y * fpair;
fiz = fiz - del23z * fpair;
fjx = fjx + del23x * fpair;
fjy = fjy + del23y * fpair;
fjz = fjz + del23z * fpair;
fpair = Vtors * w21 * w23 * dw34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) /
r34;
fjx = fjx - del34x * fpair;
fjy = fjy - del34y * fpair;
fjz = fjz - del34z * fpair;
flx = flx + del34x * fpair;
fly = fly + del34y * fpair;
flz = flz + del34z * fpair;
// additional cut off function forces
fvec fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * dtsjik * (c_1_0 -
tspijl);
fpair = fcpc * dcidij / rij;
fix = fix + fpair * del23x;
fiy = fiy + fpair * del23y;
fiz = fiz + fpair * del23z;
fjx = fjx - fpair * del23x;
fjy = fjy - fpair * del23y;
fjz = fjz - fpair * del23z;
fpair = fcpc * dcidik / rik;
fix = fix + fpair * del21x;
fiy = fiy + fpair * del21y;
fiz = fiz + fpair * del21z;
fkx = fkx - fpair * del21x;
fky = fky - fpair * del21y;
fkz = fkz - fpair * del21z;
fpair = fcpc * dcidjk / rjk;
fjx = fjx + fpair * deljkx;
fjy = fjy + fpair * deljky;
fjz = fjz + fpair * deljkz;
fkx = fkx - fpair * deljkx;
fky = fky - fpair * deljky;
fkz = fkz - fpair * deljkz;
fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * (c_1_0 - tspjik) *
dtsijl;
fpair = fcpc * dcjdji / rij;
fix = fix + fpair * del23x;
fiy = fiy + fpair * del23y;
fiz = fiz + fpair * del23z;
fjx = fjx - fpair * del23x;
fjy = fjy - fpair * del23y;
fjz = fjz - fpair * del23z;
fpair = fcpc * dcjdjl / rjl;
fjx = fjx + fpair * del34x;
fjy = fjy + fpair * del34y;
fjz = fjz + fpair * del34z;
flx = flx - fpair * del34x;
fly = fly - fpair * del34y;
flz = flz - fpair * del34z;
fpair = fcpc * dcjdil / ril;
fix = fix + fpair * delilx;
fiy = fiy + fpair * delily;
fiz = fiz + fpair * delilz;
flx = flx - fpair * delilx;
fly = fly - fpair * delily;
flz = flz - fpair * delilz;
// sum per-atom forces into atom force array
i_data->force_i_x = fvec::mask_add(i_data->force_i_x, mask_inner_0,
i_data->force_i_x, fix);
i_data->force_i_y = fvec::mask_add(i_data->force_i_y, mask_inner_0,
i_data->force_i_y, fiy);
i_data->force_i_z = fvec::mask_add(i_data->force_i_z, mask_inner_0,
i_data->force_i_z, fiz);
i_data->force_j_x = fvec::mask_add(i_data->force_j_x, mask_inner_0,
i_data->force_j_x, fjx);
i_data->force_j_y = fvec::mask_add(i_data->force_j_y, mask_inner_0,
i_data->force_j_y, fjy);
i_data->force_j_z = fvec::mask_add(i_data->force_j_z, mask_inner_0,
i_data->force_j_z, fjz);
i_data->force_k_x_buf[buf_idx_i] =
fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], mask_inner_0,
i_data->force_k_x_buf[buf_idx_i], fkx);
i_data->force_k_y_buf[buf_idx_i] =
fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_0,
i_data->force_k_y_buf[buf_idx_i], fky);
i_data->force_k_z_buf[buf_idx_i] =
fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_0,
i_data->force_k_z_buf[buf_idx_i], fkz);
j_data->force_k_x_buf[buf_idx_j] =
fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_0,
j_data->force_k_x_buf[buf_idx_j], flx);
j_data->force_k_y_buf[buf_idx_j] =
fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_0,
j_data->force_k_y_buf[buf_idx_j], fly);
j_data->force_k_z_buf[buf_idx_j] =
fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_0,
j_data->force_k_z_buf[buf_idx_j], flz);
}
}
}
/*
* Processes VL elements of the same type itype/jtype for REBO and TORSION
* interactions. This allows us to reuse the aut_frebo_data buffes in the
* torsion calculaltion.
*/
static void aut_frebo_batch_of_kind(KernelArgsAIREBOT<flt_t,acc_t> * ka,
int torflag, int itype, int jtype,
int * i_buf, int * j_buf) {
{ // jump-scope for exceed_limits
AtomAIREBOT<flt_t> * x = ka->x;
int * tag = ka->tag;
int * map = ka->map;
ResultForceT<acc_t> * result_f = ka->result_f;
flt_t rcminij = ka->params.rcmin[itype][jtype];
flt_t rcmaxij = ka->params.rcmax[itype][jtype];
flt_t Qij = ka->params.Q[itype][jtype];
flt_t Aij = ka->params.A[itype][jtype];
flt_t alphaij = ka->params.alpha[itype][jtype];
fvec vrcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
fvec vrcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
fvec vQij = fvec::set1(ka->params.Q[itype][jtype]);
fvec vAij = fvec::set1(ka->params.A[itype][jtype]);
fvec malphaij = fvec::set1(-ka->params.alpha[itype][jtype]);
fvec c_1_0 = fvec::set1(1);
fvec c_0_5 = fvec::set1(0.5);
fvec c_TOL = fvec::set1(1e-9);
struct aut_frebo_data i_data, j_data;
fvec evdwl_vacc = fvec::setzero();
ivec vi = ivec::maskz_loadu(bvec::full(), i_buf);
int tmp;
ivec vj = ivec::maskz_loadu(bvec::full(), j_buf);
fvec x_i, y_i, z_i;
fvec x_j, y_j, z_j;
aut_loadatoms_vec_notype(x, vi, &x_i, &y_i, &z_i);
aut_loadatoms_vec_notype(x, vj, &x_j, &y_j, &z_j);
i_data.x_i = x_i;
i_data.y_i = y_i;
i_data.z_i = z_i;
i_data.x_j = x_j;
i_data.y_j = y_j;
i_data.z_j = z_j;
j_data.x_i = x_j;
j_data.y_i = y_j;
j_data.z_i = z_j;
j_data.x_j = x_i;
j_data.y_j = y_i;
j_data.z_j = z_i;
fvec delx = x_i - x_j;
fvec dely = y_i - y_j;
fvec delz = z_i - z_j;
fvec rsq = delx * delx + dely * dely + delz * delz;
fvec rij = fvec::sqrt(rsq);
fvec dwij;
fvec wij = aut_Sp_deriv(rij, vrcminij, vrcmaxij, &dwij);
fvec exp_alphar = fvec::exp(malphaij * rij);
fvec Qij_over_rij = vQij / rij;
fvec Qij_over_rsq = vQij / rsq;
fvec VR_by_wij = ( c_1_0 + Qij_over_rij) * vAij * exp_alphar;
fvec VR = wij * VR_by_wij;
fvec pre = wij * vAij * exp_alphar;
fvec dVRdi = pre * ( malphaij + malphaij * Qij_over_rij - Qij_over_rsq);
dVRdi = dVRdi + VR_by_wij * dwij;
fvec VA_by_wij = fvec::setzero();
fvec dVA = fvec::setzero();
int k;
for (k = 0; k < 3; k++) {
fvec mBIJc = fvec::set1(-ka->params.BIJc[itype][jtype][k]);
fvec mBetaij = fvec::set1(-ka->params.Beta[itype][jtype][k]);
fvec term = mBIJc * fvec::exp(mBetaij * rij);
VA_by_wij = VA_by_wij + term;
dVA = dVA + mBetaij * wij * term;
}
dVA = dVA + dwij * VA_by_wij;
fvec VA = wij * VA_by_wij;
bvec tol_check = fvec::cmplt(wij, c_TOL);
VA = fvec::mask_blend(tol_check, VA, fvec::setzero());
dVA = fvec::mask_blend(tol_check, dVA, fvec::setzero());
VR = fvec::mask_blend(tol_check, VR, fvec::setzero());
dVRdi = fvec::mask_blend(tol_check, dVRdi, fvec::setzero());
fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
fvec Nij = (nHi + nCi) - wij;
fvec Nji = (nHj + nCj) - wij;
i_data.nHi = nHi;
i_data.nCi = nCi;
j_data.nHi = nHj;
j_data.nCi = nCj;
fvec fij[3], fji[3];
fij[0] = fvec::setzero(); fij[1] = fvec::setzero();
fij[2] = fvec::setzero();
fji[0] = fvec::setzero(); fji[1] = fvec::setzero();
fji[2] = fvec::setzero();
fvec NconjtmpI;
fvec pij = aut_frebo_pij_pd_2(
ka, &i_data, itype, jtype, vi, vj,
delx, dely, delz, rij, wij, VA, &NconjtmpI, fij);
if (i_data.buf_len < 0) goto exceed_limits;
fvec NconjtmpJ;
fvec rjix = fvec::setzero() - delx;
fvec rjiy = fvec::setzero() - dely;
fvec rjiz = fvec::setzero() - delz;
fvec pji = aut_frebo_pij_pd_2(
ka, &j_data, jtype, itype, vj, vi,
rjix, rjiy, rjiz, rij, wij, VA, &NconjtmpJ, fji);
fij[0] = fij[0] - fji[0];
fij[1] = fij[1] - fji[1];
fij[2] = fij[2] - fji[2];
if (j_data.buf_len < 0) goto exceed_limits;
if (torflag && itype == 0 && jtype == 0)
aut_torsion_vec(ka, &i_data, &j_data, vi, vj, wij, dwij);
fvec Nijconj = c_1_0 + NconjtmpI * NconjtmpI + NconjtmpJ * NconjtmpJ;
fvec dN3[3];
fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, VA, dN3[0],
dN3[2], NconjtmpI);
aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, VA, dN3[1],
dN3[2], NconjtmpJ);
fvec pi_dh = aut_frebo_pi_dh(ka, &i_data, &j_data, itype, jtype, vi, vj,
delx, dely, delz, rij, VA, Nij, Nji, Nijconj,
NconjtmpI, NconjtmpJ, fij);
fvec bij = c_0_5 * ( pij + pji) + pi_rc + pi_dh;
fvec dVAdi = bij * dVA;
fvec fpair = (dVAdi + dVRdi) * fvec::recip(rij);
fvec result_f_j_x = fpair * delx - fij[0];
fvec result_f_j_y = fpair * dely - fij[1];
fvec result_f_j_z = fpair * delz - fij[2];
fvec result_f_i_x = fvec::setzero() - result_f_j_x;
fvec result_f_i_y = fvec::setzero() - result_f_j_y;
fvec result_f_i_z = fvec::setzero() - result_f_j_z;
fvec evdwl = VR + bij * VA;
evdwl_vacc = evdwl_vacc + evdwl;
aut_frebo_data_writeback(ka, &i_data);
aut_frebo_data_writeback(ka, &j_data);
flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
result_f_i_x = i_data.force_i_x + result_f_i_x;
result_f_i_y = i_data.force_i_y + result_f_i_y;
result_f_i_z = i_data.force_i_z + result_f_i_z;
result_f_j_x = i_data.force_j_x + result_f_j_x;
result_f_j_y = i_data.force_j_y + result_f_j_y;
result_f_j_z = i_data.force_j_z + result_f_j_z;
result_f_i_x = j_data.force_j_x + result_f_i_x;
result_f_i_y = j_data.force_j_y + result_f_i_y;
result_f_i_z = j_data.force_j_z + result_f_i_z;
result_f_j_x = j_data.force_i_x + result_f_j_x;
result_f_j_y = j_data.force_i_y + result_f_j_y;
result_f_j_z = j_data.force_i_z + result_f_j_z;
fvec::store(fi_x_buf, result_f_i_x);
fvec::store(fi_y_buf, result_f_i_y);
fvec::store(fi_z_buf, result_f_i_z);
ivec::store(fi_i_buf, vi);
fvec::store(fj_x_buf, result_f_j_x);
fvec::store(fj_y_buf, result_f_j_y);
fvec::store(fj_z_buf, result_f_j_z);
ivec::store(fj_j_buf, vj);
fvec::store(evdwl_buf, evdwl);
int lane;
for (lane = 0; lane < fvec::VL; lane++) {
int ii = fi_i_buf[lane];
result_f[ii].x += fi_x_buf[lane];
result_f[ii].y += fi_y_buf[lane];
result_f[ii].z += fi_z_buf[lane];
result_f[ii].w += 0.5 * evdwl_buf[lane];
int jj = fj_j_buf[lane];
result_f[jj].x += fj_x_buf[lane];
result_f[jj].y += fj_y_buf[lane];
result_f[jj].z += fj_z_buf[lane];
result_f[jj].w += 0.5 * evdwl_buf[lane];
}
ka->result_eng += fvec::reduce_add(evdwl_vacc);
return;
}
exceed_limits:
for (int l = 0; l < fvec::VL; l++) {
int i = i_buf[l];
int j = j_buf[l];
ref_frebo_single_interaction(ka, i, j);
if (torflag && itype == 0 && jtype == 0)
ref_torsion_single_interaction(ka, i, j);
}
}
/*
Orders the interactions by itype and jtype and passes chunks to the above
method.
*/
static void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torflag) {
AtomAIREBOT<flt_t> * _noalias x = ka->x;
int * _noalias tag = ka->tag;
int * _noalias map = ka->map;
int i_buf[2][2][fvec::VL];
int j_buf[2][2][fvec::VL];
int n_buf[2][2] = {0};
for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
int itag = tag[i];
int itype = map[x[i].w];
flt_t x_i = x[i].x;
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
int jnum = ka->neigh_rebo.num[i];
for (int jj = 0; jj < jnum; jj++) {
int j = neighs[jj];
int jtag = tag[j];
if (itag > jtag) {
if (((itag + jtag) & 1) == 0)
continue;
} else if (itag < jtag) {
if (((itag + jtag) & 1) == 1)
continue;
} else {
if (x[j].z < z_i)
continue;
if (x[j].z == z_i && x[j].y < y_i)
continue;
if (x[j].z == z_i && x[j].y == y_i && x[j].x < x_i)
continue;
}
int jtype = map[x[j].w];
int ins = n_buf[itype][jtype];
i_buf[itype][jtype][ins] = i;
j_buf[itype][jtype][ins] = j;
n_buf[itype][jtype] += 1;
if (n_buf[itype][jtype] == fvec::VL) {
aut_frebo_batch_of_kind(ka, torflag, itype, jtype,
i_buf[itype][jtype], j_buf[itype][jtype]);
n_buf[itype][jtype] = 0;
}
}
}
for (int itype = 0; itype < 2; itype++) {
for (int jtype = 0; jtype < 2; jtype++) {
for (int l = 0; l < n_buf[itype][jtype]; l++) {
int i = i_buf[itype][jtype][l];
int j = j_buf[itype][jtype][l];
ref_frebo_single_interaction(ka, i, j);
if (torflag && itype == 0 && jtype == 0)
ref_torsion_single_interaction(ka, i, j);
}
}
}
}
/*
* Apply paths in scalar fashion, not crucial for performance.
*/
static void aut_airebo_lj_force_path(KernelArgsAIREBOT<flt_t,acc_t> * ka,
bvec mask, fvec dC, LennardJonesPathAIREBOT<flt_t> path[fvec::VL]) {
for (int i = 0; i < fvec::VL; i++) {
if (bvec::test_at(mask, i)) {
ref_lennard_jones_force_path(ka, fvec::at(dC, i), &path[i]);
}
}
}
/*
* Hash-Map for efficient calculation of C_ij.
* Can have up to ITEMS entries with associated paths, as well as
* 1024 entries. Open addressing, invalidation by using a different i.
* Only needs to be reset once per timestep.
*/
static const int OPT_TEST_PATH_SIZE = 1024;
static const int OPT_TEST_PATH_ITEMS = 128;
struct aut_airebo_lj_test_path_result_data {
LennardJonesPathAIREBOT<flt_t> testpath[OPT_TEST_PATH_ITEMS];
int i[OPT_TEST_PATH_SIZE];
int j[OPT_TEST_PATH_SIZE];
flt_t cij[OPT_TEST_PATH_SIZE];
int testpath_idx[OPT_TEST_PATH_SIZE];
};
static const unsigned int OPT_TEST_PATH_HASH = 2654435761;
static int aut_lj_tap_hash_fn(int j, int attempt) {
uint32_t result = j;
result *= (uint32_t) OPT_TEST_PATH_HASH;
result += (uint32_t) attempt;
result %= (uint32_t) OPT_TEST_PATH_SIZE;
return result;
}
static ivec aut_airebo_lj_tap_hash_fn_vec(ivec val, ivec attempt) {
const ivec golden = ivec::set1(OPT_TEST_PATH_HASH);
const ivec mask = ivec::set1(OPT_TEST_PATH_SIZE - 1);
ivec a = ivec::mullo(golden, val);
ivec b = a + attempt;
ivec c = ivec::the_and(b, mask);
return c;
}
/*
* Enter all those (potential) neighbors of i (including 2nd and 3rd degree)
* into the hash-map. There is no good way to vectorize this, and it does not
* seem time-critical.
*/
static bool aut_airebo_lj_test_all_paths(KernelArgsAIREBOT<flt_t,acc_t> * ka,
int i, struct aut_airebo_lj_test_path_result_data * result) {
AtomAIREBOT<flt_t> * x = ka->x;
int * map = ka->map;
flt_t (*rcmin)[2] = &ka->params.rcmin[0];
flt_t (*rcmax)[2] = &ka->params.rcmax[0];
flt_t rcminsq[2][2];
rcminsq[0][0] = rcmin[0][0] * rcmin[0][0];
rcminsq[0][1] = rcmin[0][1] * rcmin[0][1];
rcminsq[1][0] = rcmin[1][0] * rcmin[1][0];
rcminsq[1][1] = rcmin[1][1] * rcmin[1][1];
int * neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
int itype = map[x[i].w];
int path_insert_pos = 0;
for (int jj = 0; jj < ka->neigh_rebo.num[i]; jj++) {
int j = neighs_i[jj];
int jtype = map[x[j].w];
flt_t dijx = x[j].x - x[i].x;
flt_t dijy = x[j].y - x[i].y;
flt_t dijz = x[j].z - x[i].z;
flt_t rijsq = dijx * dijx + dijy * dijy + dijz * dijz;
flt_t wj = 1, dwj = 0;
flt_t rij = 0;
if (rijsq >= rcminsq[itype][jtype]) {
rij = overloaded::sqrt(rijsq);
wj = Sp(rij, rcmin[itype][jtype], rcmax[itype][jtype], &dwj);
}
int attempt = 0;
int start_hash_slot = aut_lj_tap_hash_fn(j, attempt);
int hash_slot = start_hash_slot;
while (result->i[hash_slot] == i && result->j[hash_slot] != j &&
attempt < OPT_TEST_PATH_SIZE) {
hash_slot = aut_lj_tap_hash_fn(j, ++attempt);
}
if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
bool init_slot = result->i[hash_slot] != i;
if (init_slot || (1 - wj < result->cij[hash_slot])) {
result->i[hash_slot] = i;
result->j[hash_slot] = j;
result->cij[hash_slot] = 1 - wj;
if (wj != 1.0) {
if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
result->testpath_idx[hash_slot] = path_insert_pos;
LennardJonesPathAIREBOT<flt_t> *path =
&result->testpath[path_insert_pos++];
path->num = 2;
path->del[0].x = dijx;
path->del[0].y = dijy;
path->del[0].z = dijz;
if (rij == 0) rij = sqrt(rijsq);
path->r[0] = rij;
path->w[0] = wj;
path->dw[0] = dwj;
path->idx[0] = i;
path->idx[1] = j;
}
}
int * neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]];
for (int kk = 0; kk < ka->neigh_rebo.num[j]; kk++) {
int k = neighs_j[kk];
if (k == i) continue;
int ktype = map[x[k].w];
flt_t djkx = x[k].x - x[j].x;
flt_t djky = x[k].y - x[j].y;
flt_t djkz = x[k].z - x[j].z;
flt_t rjksq = djkx * djkx + djky * djky + djkz * djkz;
flt_t wk = 1, dwk = 0;
flt_t rjk = 0;
if (rjksq >= rcminsq[jtype][ktype]) {
rjk = overloaded::sqrt(rjksq);
wk = Sp(rjk, rcmin[jtype][ktype], rcmax[jtype][ktype], &dwk);
}
int attempt = 0;
int start_hash_slot = aut_lj_tap_hash_fn(k, attempt);
int hash_slot = start_hash_slot;
while (result->i[hash_slot] == i && result->j[hash_slot] != k &&
attempt < OPT_TEST_PATH_SIZE) {
hash_slot = aut_lj_tap_hash_fn(k, ++attempt);
}
if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
bool init_slot = result->i[hash_slot] != i;
if (init_slot || (1 - wj * wk < result->cij[hash_slot])) {
result->i[hash_slot] = i;
result->j[hash_slot] = k;
result->cij[hash_slot] = 1 - wj * wk;
if (wj * wk != 1.0) {
if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
result->testpath_idx[hash_slot] = path_insert_pos;
LennardJonesPathAIREBOT<flt_t> *path =
&result->testpath[path_insert_pos++];
path->num = 3;
path->del[0].x = dijx;
path->del[0].y = dijy;
path->del[0].z = dijz;
if (rij == 0) rij = sqrt(rijsq);
path->r[0] = rij;
path->del[1].x = djkx;
path->del[1].y = djky;
path->del[1].z = djkz;
if (rjk == 0) rjk = sqrt(rjksq);
path->r[1] = rjk;
path->w[0] = wj;
path->dw[0] = dwj;
path->w[1] = wk;
path->dw[1] = dwk;
path->idx[0] = i;
path->idx[1] = j;
path->idx[2] = k;
}
}
int * neighs_k = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[k]];
for (int ll = 0; ll < ka->neigh_rebo.num[k]; ll++) {
int l = neighs_k[ll];
if ((l == i) || (l == j)) continue;
int ltype = map[x[l].w];
flt_t dklx = x[l].x - x[k].x;
flt_t dkly = x[l].y - x[k].y;
flt_t dklz = x[l].z - x[k].z;
flt_t rklsq = dklx * dklx + dkly * dkly + dklz * dklz;
flt_t wl = 1, dwl = 0;
flt_t rkl = 0;
if (rklsq >= rcminsq[ktype][ltype]) {
rkl = overloaded::sqrt(rklsq);
wl = Sp(rkl, rcmin[ktype][ltype], rcmax[ktype][ltype], &dwl);
}
int attempt = 0;
int start_hash_slot = aut_lj_tap_hash_fn(l, attempt);
int hash_slot = start_hash_slot;
while (result->i[hash_slot] == i && result->j[hash_slot] != l &&
attempt < OPT_TEST_PATH_SIZE) {
hash_slot = aut_lj_tap_hash_fn(l, ++attempt);
}
if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
bool init_slot = result->i[hash_slot] != i;
if (init_slot || (1 - wj * wk * wl < result->cij[hash_slot])) {
result->i[hash_slot] = i;
result->j[hash_slot] = l;
result->cij[hash_slot] = 1 - wj * wk * wl;
if (wj * wk * wl != 1.0) {
if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
result->testpath_idx[hash_slot] = path_insert_pos;
LennardJonesPathAIREBOT<flt_t> *path =
&result->testpath[path_insert_pos++];
path->num = 4;
path->del[0].x = dijx;
path->del[0].y = dijy;
path->del[0].z = dijz;
if (rij == 0) rij = sqrt(rijsq);
path->r[0] = rij;
path->del[1].x = djkx;
path->del[1].y = djky;
path->del[1].z = djkz;
if (rjk == 0) rjk = sqrt(rjksq);
path->r[1] = rjk;
path->del[2].x = dklx;
path->del[2].y = dkly;
path->del[2].z = dklz;
if (rkl == 0) rkl = sqrt(rklsq);
path->r[2] = rkl;
path->w[0] = wj;
path->dw[0] = dwj;
path->w[1] = wk;
path->dw[1] = dwk;
path->w[2] = wl;
path->dw[2] = dwl;
path->idx[0] = i;
path->idx[1] = j;
path->idx[2] = k;
path->idx[3] = l;
}
}
}
}
}
return true;
exceed_limits:
return false;
}
/*
* Attempt to look up an element in the hash-map.
*/
static fvec aut_airebo_lj_tap_test_path(KernelArgsAIREBOT<flt_t,acc_t> * ka,
struct aut_airebo_lj_test_path_result_data * test_path_result,
bvec need_search, ivec i_bc, ivec j,
LennardJonesPathAIREBOT<flt_t> path[fvec::VL]
) {
const ivec c_i1 = ivec::set1(1);
fvec cij = fvec::set1(1.0);
// first round: hash all j
// lookup i/j in hash list.
// if i matches and j matches: congrats
// if i matches and j does not: look up attempts
// if attempts > current_attempts:
// do another round of hashing
// for all those found:
// fill in the path
// -----------------------------------------------
// find all the correct hash slots, and a mask of where found.
ivec attempt = ivec::setzero();
ivec hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
ivec lookup_i = ivec::mask_gather(ivec::undefined(), need_search, hash_slot,
&test_path_result->i[0], sizeof(int));
bvec correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
ivec lookup_j = ivec::mask_gather(ivec::undefined(), correct_i, hash_slot,
&test_path_result->j[0], sizeof(int));
bvec found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
bvec another_attempt = correct_i & ~ found_items;
while (bvec::test_any_set(another_attempt)) {
attempt = ivec::mask_add(attempt, another_attempt, attempt, c_i1);
hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
ivec lookup_i_2 = ivec::mask_gather(lookup_i, another_attempt, hash_slot,
&test_path_result->i[0], sizeof(int));
lookup_i = lookup_i_2;
correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
lookup_j = ivec::mask_gather(lookup_j, another_attempt, hash_slot,
&test_path_result->j[0], sizeof(int));
found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
another_attempt = correct_i & ~ found_items;
}
cij = fvec::mask_gather(cij, found_items, hash_slot,
&test_path_result->cij[0], sizeof(flt_t));
bvec need_testpath = fvec::mask_cmplt(found_items, fvec::setzero(), cij);
if (bvec::test_any_set(need_testpath)) {
for (int i = 0; i < fvec::VL; i++) {
if (bvec::test_at(need_testpath, i)) {
int testpath_idx =
test_path_result->testpath_idx[ivec::at(hash_slot, i)];
path[i] = test_path_result->testpath[testpath_idx];
}
}
}
return cij;
}
/*
* This function calculates the Lennard-Jones interaciton for those
* elements that require a bond-order calculation.
* It is similarly structured as the aut_frebo_batch_of_kind function.
* The forces due to bondorders are calculated speculatively and later
* updated with the correct outer derivative.
*/
template<int MORSEFLAG>
static void aut_lj_with_bo(
KernelArgsAIREBOT<flt_t,acc_t> * ka,
int itype, int jtype,
ivec i, ivec j,
fvec cij, LennardJonesPathAIREBOT<flt_t> testpath[fvec::VL]
) {
{ // jump-scope for exceed_limits
AtomAIREBOT<flt_t> * _noalias x = ka->x;
ResultForceT<acc_t> * result_f = ka->result_f;
ivec c_i4 = ivec::set1(4);
fvec c_1_0 = fvec::set1(1.0);
fvec c_2_0 = fvec::set1(2.0);
fvec c_0_5 = fvec::set1(0.5);
fvec x_i, y_i, z_i;
aut_loadatoms_vec_notype(x, i, &x_i, &y_i, &z_i);
fvec x_j, y_j, z_j;
aut_loadatoms_vec_notype(x, j, &x_j, &y_j, &z_j);
fvec delx = x_i - x_j;
fvec dely = y_i - y_j;
fvec delz = z_i - z_j;
fvec rsq = delx * delx + dely * dely + delz * delz;
fvec rij = fvec::sqrt(rsq);
bvec need_path_force = fvec::cmplt(cij, c_1_0);
flt_t sigcut = ka->params.sigcut;
flt_t sigmin = ka->params.sigmin;
flt_t sigma = ka->params.sigma[itype][jtype];
flt_t rljmax = sigcut * sigma;
flt_t rljmin = sigmin * sigma;
fvec p_rljmin = fvec::set1(rljmin);
fvec p_rljmax = fvec::set1(rljmax);
fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw);
fvec p_lj1 = fvec::set1(ka->params.lj1[itype][jtype]);
fvec p_lj2 = fvec::set1(ka->params.lj2[itype][jtype]);
fvec p_lj3 = fvec::set1(ka->params.lj3[itype][jtype]);
fvec p_lj4 = fvec::set1(ka->params.lj4[itype][jtype]);
fvec r2inv = fvec::recip(rsq);
fvec vdw, dvdw;
if (MORSEFLAG) {
fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4);
vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0);
dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr);
} else {
fvec r6inv = r2inv * r2inv * r2inv;
vdw = r6inv * ( p_lj3 * r6inv - p_lj4);
fvec r7inv = r6inv * rij * r2inv;
dvdw = r7inv * ( p_lj2 - p_lj1 * r6inv);
}
fvec VLJ = vdw * slw;
fvec dVLJ = dvdw * slw + vdw * dslw;
fvec p_rcLJmin = fvec::set1(ka->params.rcLJmin[itype][jtype]);
fvec p_rcLJmax = fvec::set1(ka->params.rcLJmax[itype][jtype]);
fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr);
fvec VA = cij * VLJ * Str;
fvec fij[3], fji[3];
fij[0] = fvec::setzero(); fij[1] = fvec::setzero();
fij[2] = fvec::setzero();
fji[0] = fvec::setzero(); fji[1] = fvec::setzero();
fji[2] = fvec::setzero();
ivec vi = i;
ivec vj = j;
struct aut_frebo_data i_data, j_data;
i_data.x_i = x_i;
i_data.y_i = y_i;
i_data.z_i = z_i;
i_data.x_j = x_j;
i_data.y_j = y_j;
i_data.z_j = z_j;
j_data.x_i = x_j;
j_data.y_i = y_j;
j_data.z_i = z_j;
j_data.x_j = x_i;
j_data.y_j = y_i;
j_data.z_j = z_i;
fvec p_rcmin = fvec::set1(ka->params.rcmin[itype][jtype]);
fvec p_rcmax = fvec::set1(ka->params.rcmax[itype][jtype]);
fvec dwij;
fvec wij = aut_Sp_deriv(rij, p_rcmin, p_rcmax, &dwij);
fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
fvec Nij = nHi + nCi - wij;
fvec Nji = nHj + nCj - wij;
i_data.nHi = nHi;
i_data.nCi = nCi;
j_data.nHi = nHj;
j_data.nCi = nCj;
fvec the_r = fvec::set1(ka->params.rcmin[itype][jtype]);
fvec scale = the_r / rij;
fvec NconjtmpI;
fvec pij = aut_frebo_pij_pd_2(ka, &i_data, itype, jtype, vi, vj,
delx * scale, dely * scale, delz * scale,
the_r, wij, VA, &NconjtmpI, fij);
if (i_data.buf_len < 0) goto exceed_limits;
fvec NconjtmpJ;
fvec rjix = fvec::setzero() - delx;
fvec rjiy = fvec::setzero() - dely;
fvec rjiz = fvec::setzero() - delz;
fvec pji = aut_frebo_pij_pd_2(ka, &j_data, jtype, itype, vj, vi,
rjix * scale, rjiy * scale, rjiz * scale,
the_r, wij, VA, &NconjtmpJ, fji);
fij[0] = fij[0] - fji[0];
fij[1] = fij[1] - fji[1];
fij[2] = fij[2] - fji[2];
if (j_data.buf_len < 0) goto exceed_limits;
fvec Nijconj = c_1_0 + NconjtmpI * NconjtmpI + NconjtmpJ * NconjtmpJ;
fvec dN3[3];
fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
fvec c_TOL = fvec::set1(TOL);
fvec dN3_dh[3];
fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3_dh[0]);
bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL);
fvec sum_omega = fvec::setzero();
if (bvec::test_any_set(TijgtTOLmask)) {
sum_omega = aut_frebo_sum_omega(
ka, &i_data, &j_data, itype, jtype, vi, vj,
delx * scale, dely * scale, delz * scale, the_r, VA * Tij, fij);
sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega);
}
fvec pi_dh = Tij * sum_omega;
fvec bij = c_0_5 * ( pij + pji) + pi_rc + pi_dh;
fvec p_bLJmin = fvec::set1(ka->params.bLJmin[itype][jtype]);
fvec p_bLJmax = fvec::set1(ka->params.bLJmax[itype][jtype]);
fvec dStb, Stb = aut_Sp2_deriv(bij, p_bLJmin, p_bLJmax, &dStb);
bvec need_bo_deriv = fvec::cmpneq(dStb, fvec::setzero());
// fix up j_data, i_data, fij:
// multiply each by dStb
if (bvec::test_any_set(need_bo_deriv)) {
i_data.force_i_x = dStb * i_data.force_i_x;
i_data.force_i_y = dStb * i_data.force_i_y;
i_data.force_i_z = dStb * i_data.force_i_z;
i_data.force_j_x = dStb * i_data.force_j_x;
i_data.force_j_y = dStb * i_data.force_j_y;
i_data.force_j_z = dStb * i_data.force_j_z;
j_data.force_i_x = dStb * j_data.force_i_x;
j_data.force_i_y = dStb * j_data.force_i_y;
j_data.force_i_z = dStb * j_data.force_i_z;
j_data.force_j_x = dStb * j_data.force_j_x;
j_data.force_j_y = dStb * j_data.force_j_y;
j_data.force_j_z = dStb * j_data.force_j_z;
for (int k = 0; k < i_data.buf_len; k++) {
i_data.force_k_x_buf[k] = dStb * i_data.force_k_x_buf[k];
i_data.force_k_y_buf[k] = dStb * i_data.force_k_y_buf[k];
i_data.force_k_z_buf[k] = dStb * i_data.force_k_z_buf[k];
}
for (int k = 0; k < j_data.buf_len; k++) {
j_data.force_k_x_buf[k] = dStb * j_data.force_k_x_buf[k];
j_data.force_k_y_buf[k] = dStb * j_data.force_k_y_buf[k];
j_data.force_k_z_buf[k] = dStb * j_data.force_k_z_buf[k];
}
fvec fijc[3];
fijc[0] = dStb * fij[0];
fijc[1] = dStb * fij[1];
fijc[2] = dStb * fij[2];
fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx *
fijc[1] + delz * delx * fijc[2]) / rsq);
fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely *
fijc[1] + delz * dely * fijc[2]) / rsq);
fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz *
fijc[1] + delz * delz * fijc[2]) / rsq);
aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, dStb * VA,
dN3[0], dN3[2], NconjtmpI);
aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, dStb * VA,
dN3[1], dN3[2], NconjtmpJ);
if (bvec::test_any_set(TijgtTOLmask)) {
aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj,
dStb * VA * sum_omega, dN3_dh[0], dN3_dh[2],
NconjtmpI);
aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi,
dStb * VA * sum_omega, dN3_dh[1], dN3_dh[2],
NconjtmpJ);
}
aut_frebo_data_writeback(ka, &i_data);
aut_frebo_data_writeback(ka, &j_data);
} else {
fij[0] = fvec::setzero();
fij[1] = fvec::setzero();
fij[2] = fvec::setzero();
}
fvec fpdVLJ = cij * dVLJ * ( c_1_0 + Str * ( Stb - c_1_0));
fvec fpdStr = dStr * cij * ( Stb * VLJ - VLJ);
fvec fpair = r2inv * rij * ( fvec::setzero() - ( fpdVLJ + fpdStr));
fvec evdwl = VA * Stb + cij * VLJ * ( c_1_0 - Str);
fvec result_f_i_x = fpair * delx + fij[0];
fvec result_f_i_y = fpair * dely + fij[1];
fvec result_f_i_z = fpair * delz + fij[2];
fvec result_f_j_x = fvec::setzero() - result_f_i_x;
fvec result_f_j_y = fvec::setzero() - result_f_i_y;
fvec result_f_j_z = fvec::setzero() - result_f_i_z;
flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
if (bvec::test_any_set(need_bo_deriv)) {
result_f_i_x = i_data.force_i_x + result_f_i_x;
result_f_i_y = i_data.force_i_y + result_f_i_y;
result_f_i_z = i_data.force_i_z + result_f_i_z;
result_f_j_x = i_data.force_j_x + result_f_j_x;
result_f_j_y = i_data.force_j_y + result_f_j_y;
result_f_j_z = i_data.force_j_z + result_f_j_z;
result_f_i_x = j_data.force_j_x + result_f_i_x;
result_f_i_y = j_data.force_j_y + result_f_i_y;
result_f_i_z = j_data.force_j_z + result_f_i_z;
result_f_j_x = j_data.force_i_x + result_f_j_x;
result_f_j_y = j_data.force_i_y + result_f_j_y;
result_f_j_z = j_data.force_i_z + result_f_j_z;
}
fvec::store(fi_x_buf, result_f_i_x);
fvec::store(fi_y_buf, result_f_i_y);
fvec::store(fi_z_buf, result_f_i_z);
ivec::store(fi_i_buf, vi);
fvec::store(fj_x_buf, result_f_j_x);
fvec::store(fj_y_buf, result_f_j_y);
fvec::store(fj_z_buf, result_f_j_z);
ivec::store(fj_j_buf, vj);
fvec::store(evdwl_buf, evdwl);
int lane;
for (lane = 0; lane < fvec::VL; lane++) {
int ii = fi_i_buf[lane];
result_f[ii].x += fi_x_buf[lane];
result_f[ii].y += fi_y_buf[lane];
result_f[ii].z += fi_z_buf[lane];
result_f[ii].w += 0.5 * evdwl_buf[lane];
int jj = fj_j_buf[lane];
result_f[jj].x += fj_x_buf[lane];
result_f[jj].y += fj_y_buf[lane];
result_f[jj].z += fj_z_buf[lane];
result_f[jj].w += 0.5 * evdwl_buf[lane];
}
ka->result_eng += fvec::reduce_add(evdwl);
if (bvec::test_any_set(need_path_force)) {
fvec dC = VLJ * ( Str * Stb + c_1_0 - Str);
aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
}
return;
}
exceed_limits:
for (int l = 0; l < fvec::VL; l++) {
ref_lennard_jones_single_interaction(ka, ivec::at(i, l), ivec::at(j, l),
MORSEFLAG);
}
return;
}
/*
* Calculate the lennard-jones interaction.
* Uses the above hash-map, and outlines the calculation if the bondorder is
* needed.
* Agressively compresses to get the most values calculated.
*/
template<int MORSEFLAG>
static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
AtomAIREBOT<flt_t> * x = ka->x;
int * tag = ka->tag;
int * map = ka->map;
ResultForceT<acc_t> * result_f = ka->result_f;
ivec c_i1 = ivec::set1(1);
ivec c_i4 = ivec::set1(4);
fvec c_1_0 = fvec::set1(1.0);
fvec c_2_0 = fvec::set1(2.0);
fvec c_0_0 = fvec::set1(0.0);
int map_i_scalar = 0;
{
int i;
for (i = 1; i < ka->num_types; i++) {
if (ka->map[i])
map_i_scalar |= (1 << i);
}
}
ivec map_i = ivec::set1(map_i_scalar);
fvec result_eng = fvec::setzero();
struct aut_airebo_lj_test_path_result_data test_path_result;
for (int i = 0; i < OPT_TEST_PATH_SIZE; i++) {
test_path_result.i[i] = -1;
}
ivec i_bo[2][2];
ivec j_bo[2][2];
fvec cij_bo[2][2];
LennardJonesPathAIREBOT<flt_t> testpath_bo[2][2][fvec::VL];
int num_bo[2][2] = {0};
for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
ivec itag_bc = ivec::set1(tag[i]);
int itype = map[x[i].w];
fvec x_i = fvec::set1(x[i].x);
fvec y_i = fvec::set1(x[i].y);
fvec z_i = fvec::set1(x[i].z);
ivec i_bc = ivec::set1(i);
fvec cutljsq0 = fvec::set1(ka->params.cutljsq[itype][0]);
fvec cutljsq1 = fvec::set1(ka->params.cutljsq[itype][1]);
fvec p_rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
fvec p_rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
flt_t sigcut = ka->params.sigcut;
flt_t sigmin = ka->params.sigmin;
flt_t sigma0 = ka->params.sigma[itype][0];
flt_t rljmax0 = sigcut * sigma0;
flt_t rljmin0 = sigmin * sigma0;
flt_t sigma1 = ka->params.sigma[itype][1];
flt_t rljmax1 = sigcut * sigma1;
flt_t rljmin1 = sigmin * sigma1;
fvec p_rljmax0 = fvec::set1(rljmax0);
fvec p_rljmax1 = fvec::set1(rljmax1);
fvec p_rljmin0 = fvec::set1(rljmin0);
fvec p_rljmin1 = fvec::set1(rljmin1);
fvec p_rcLJmax0 = fvec::set1(ka->params.rcLJmax[itype][0]);
fvec p_rcLJmax1 = fvec::set1(ka->params.rcLJmax[itype][1]);
fvec p_rcLJmin0 = fvec::set1(ka->params.rcLJmin[itype][0]);
fvec p_rcLJmin1 = fvec::set1(ka->params.rcLJmin[itype][1]);
fvec p_lj10 = fvec::set1(ka->params.lj1[itype][0]);
fvec p_lj11 = fvec::set1(ka->params.lj1[itype][1]);
fvec p_lj20 = fvec::set1(ka->params.lj2[itype][0]);
fvec p_lj21 = fvec::set1(ka->params.lj2[itype][1]);
fvec p_lj30 = fvec::set1(ka->params.lj3[itype][0]);
fvec p_lj31 = fvec::set1(ka->params.lj3[itype][1]);
fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]);
fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]);
int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
int jnum = ka->neigh_lmp.num_half[i];
bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result);
if (! tap_success) {
for (int jj = 0; jj < jnum; jj++) {
ref_lennard_jones_single_interaction(ka, i, neighs[jj], MORSEFLAG);
}
continue;
}
ivec j_2;
fvec delx_2, dely_2, delz_2, rsq_2;
bvec jtype_mask_2;
int num_2 = 0;
fvec result_f_i_x = fvec::setzero();
fvec result_f_i_y = fvec::setzero();
fvec result_f_i_z = fvec::setzero();
int jj = 0;
bool rest_j = jj < jnum;
bool rest_2 = fvec::fast_compress();
#pragma forceinline recursive
while (rest_j || rest_2) {
fvec delx, dely, delz, rsq;
bvec jtype_mask, within_cutoff;
ivec j;
if (rest_j) {
bvec mask_0 = bvec::full();
//0xFF >> (8 - (jnum - jj));
if (jj + (fvec::VL - 1) >= jnum) mask_0 = bvec::only(jnum - jj);
j = ivec::maskz_loadu(mask_0, &neighs[jj]);
fvec x_j, y_j, z_j;
aut_loadatoms_vec(x, j, &x_j, &y_j, &z_j, &jtype_mask, map, map_i,
c_i1);
fvec::gather_prefetch0(ivec::mullo(c_i4,
ivec::maskz_loadu(bvec::full(), &neighs[jj + fvec::VL])), x);
_mm_prefetch((const char*)&neighs[jj + 2 * fvec::VL], _MM_HINT_T0);
delx = x_i - x_j;
dely = y_i - y_j;
delz = z_i - z_j;
rsq = delx * delx + dely * dely + delz * delz;
fvec cutoff_sq = fvec::mask_blend(jtype_mask, cutljsq0, cutljsq1);
within_cutoff = fvec::mask_cmplt(mask_0, rsq, cutoff_sq);
if (fvec::fast_compress()) {
j = ivec::masku_compress(within_cutoff, j);
delx = fvec::masku_compress(within_cutoff, delx);
dely = fvec::masku_compress(within_cutoff, dely);
delz = fvec::masku_compress(within_cutoff, delz);
rsq = fvec::masku_compress(within_cutoff, rsq);
jtype_mask = bvec::masku_compress(within_cutoff, jtype_mask);
//within_cutoff = 0xFF >> (8 - _cc_popcnt(within_cutoff));
bvec mask_2 = bvec::after(num_2);//0xFF << num_2;
j_2 = ivec::mask_expand(j_2, mask_2, j);
delx_2 = fvec::mask_expand(delx_2, mask_2, delx);
dely_2 = fvec::mask_expand(dely_2, mask_2, dely);
delz_2 = fvec::mask_expand(delz_2, mask_2, delz);
rsq_2 = fvec::mask_expand(rsq_2, mask_2, rsq);
jtype_mask_2 = bvec::mask_expand(jtype_mask_2, mask_2, jtype_mask);
num_2 = num_2 + bvec::popcnt(within_cutoff);
if (num_2 < fvec::VL) {
jj += fvec::VL;
rest_j = jj < jnum;
continue;
}
num_2 -= fvec::VL;
//(0xFF >> (8 - num_2)) << (_cc_popcnt(within_cutoff) - num_2);
mask_2 = bvec::onlyafter(num_2, bvec::popcnt(within_cutoff) - num_2);
{
ivec tmp_j = j_2;
j_2 = ivec::masku_compress(mask_2, j);
j = tmp_j;
fvec tmp_delx = delx_2;
delx_2 = fvec::masku_compress(mask_2, delx);
delx = tmp_delx;
fvec tmp_dely = dely_2;
dely_2 = fvec::masku_compress(mask_2, dely);
dely = tmp_dely;
fvec tmp_delz = delz_2;
delz_2 = fvec::masku_compress(mask_2, delz);
delz = tmp_delz;
fvec tmp_rsq = rsq_2;
rsq_2 = fvec::masku_compress(mask_2, rsq);
rsq = tmp_rsq;
bvec tmp_jtype_mask = jtype_mask_2;
jtype_mask_2 = bvec::masku_compress(mask_2, jtype_mask);
jtype_mask = tmp_jtype_mask;
within_cutoff = bvec::full();
}
}
} else if (rest_2) {
rest_2 = false;
j = j_2;
delx = delx_2;
dely = dely_2;
delz = delz_2;
rsq = rsq_2;
jtype_mask = jtype_mask_2;
within_cutoff = bvec::only(num_2);
num_2 = 0;
}
bvec current_mask = within_cutoff;
if (bvec::test_all_unset(current_mask)) {
jj += fvec::VL;
rest_j = jj < jnum;
continue;
}
fvec rij = fvec::sqrt(rsq);
LennardJonesPathAIREBOT<flt_t> testpath[fvec::VL];
fvec cij = c_1_0;
fvec p_cut3rebo = fvec::set1(ka->params.cut3rebo);
bvec need_search = fvec::mask_cmplt(current_mask, rij, p_cut3rebo);
if (bvec::test_any_set(need_search)) {
fvec p_rcmax = fvec::mask_blend(jtype_mask, p_rcmax0, p_rcmax1);
#pragma noinline
cij = aut_airebo_lj_tap_test_path(ka, &test_path_result, need_search,
i_bc, j, testpath);
}
current_mask = fvec::mask_cmplt(current_mask, c_0_0, cij);
if (bvec::test_all_unset(current_mask)) {
jj += fvec::VL;
rest_j = jj < jnum;
continue;
}
bvec need_path_force = fvec::mask_cmplt(current_mask, cij, c_1_0);
fvec p_rljmax = fvec::mask_blend(jtype_mask, p_rljmax0, p_rljmax1);
fvec p_rljmin = fvec::mask_blend(jtype_mask, p_rljmin0, p_rljmin1);
fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw);
fvec p_lj1 = fvec::mask_blend(jtype_mask, p_lj10, p_lj11);
fvec p_lj2 = fvec::mask_blend(jtype_mask, p_lj20, p_lj21);
fvec p_lj3 = fvec::mask_blend(jtype_mask, p_lj30, p_lj31);
fvec p_lj4 = fvec::mask_blend(jtype_mask, p_lj40, p_lj41);
fvec vdw, dvdw;
fvec r2inv = fvec::recip(rsq);
if (MORSEFLAG) {
fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4);
vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0);
dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr);
} else {
fvec r6inv = r2inv * r2inv * r2inv;
vdw = r6inv * ( p_lj3 * r6inv - p_lj4);
fvec r7inv = r6inv * rij * r2inv;
dvdw = r7inv * ( p_lj2 - p_lj1 * r6inv);
}
fvec VLJ = vdw * slw;
fvec dVLJ = dvdw * slw + vdw * dslw;
fvec p_rcLJmin = fvec::mask_blend(jtype_mask, p_rcLJmin0, p_rcLJmin1);
fvec p_rcLJmax = fvec::mask_blend(jtype_mask, p_rcLJmax0, p_rcLJmax1);
fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr);
fvec VA = cij * VLJ * Str;
bvec need_bondorder = fvec::mask_cmplt(current_mask, c_0_0, Str);
fvec Stb = fvec::setzero();
fvec fij[3];
fij[0] = fvec::setzero();
fij[1] = fvec::setzero();
fij[2] = fvec::setzero();
if (bvec::test_any_set(need_bondorder)) {
for (int jtype = 0; jtype < 2; jtype++) {
bvec need_bo_with_jtype = need_bondorder;
if (jtype) need_bo_with_jtype = need_bo_with_jtype & jtype_mask;
else need_bo_with_jtype = need_bo_with_jtype & ~ jtype_mask;
ivec jtmp = ivec::masku_compress(need_bo_with_jtype, j);
ivec itmp = ivec::masku_compress(need_bo_with_jtype, ivec::set1(i));
fvec cijtmp = fvec::masku_compress(need_bo_with_jtype, cij);
bvec insert_mask = bvec::after(num_bo[itype][jtype]);
i_bo[itype][jtype] = ivec::mask_expand(i_bo[itype][jtype],
insert_mask, itmp);
j_bo[itype][jtype] = ivec::mask_expand(j_bo[itype][jtype],
insert_mask, jtmp);
cij_bo[itype][jtype] = fvec::mask_expand(cij_bo[itype][jtype],
insert_mask, cijtmp);
bvec need_path_force_with_jtype = need_bo_with_jtype &
need_path_force;
int testpath_end = fvec::VL;
if (bvec::test_any_set(need_path_force_with_jtype)) {
int pos = num_bo[itype][jtype];
for (int l = 0; l < fvec::VL; l++) {
if (pos >= fvec::VL) {
testpath_end = l;
break;
}
if (bvec::test_at(need_path_force_with_jtype, l)) {
testpath_bo[itype][jtype][pos] = testpath[l];
}
if (bvec::test_at(need_bo_with_jtype, l)) {
pos += 1;
}
}
}
num_bo[itype][jtype] = num_bo[itype][jtype] +
bvec::popcnt(need_bo_with_jtype);
if (num_bo[itype][jtype] >= fvec::VL) {
#pragma noinline
aut_lj_with_bo<MORSEFLAG>(ka, itype, jtype, i_bo[itype][jtype],
j_bo[itype][jtype], cij_bo[itype][jtype],
testpath_bo[itype][jtype]);
num_bo[itype][jtype] -= fvec::VL;
insert_mask = bvec::onlyafter(num_bo[itype][jtype],
bvec::popcnt(need_bo_with_jtype) -
num_bo[itype][jtype]);
i_bo[itype][jtype] = ivec::masku_compress(insert_mask, itmp);
j_bo[itype][jtype] = ivec::masku_compress(insert_mask, jtmp);
cij_bo[itype][jtype] = fvec::masku_compress(insert_mask, cijtmp);
if (bvec::test_any_set(need_path_force_with_jtype)) {
int pos = 0;
for (int l = testpath_end; l < fvec::VL; l++) {
if (bvec::test_at(need_path_force_with_jtype, l)) {
testpath_bo[itype][jtype][pos] = testpath[l];
}
if (bvec::test_at(need_bo_with_jtype, l)) {
pos += 1;
}
}
}
}
}
current_mask = current_mask & ~ need_bondorder;
need_path_force = need_path_force & ~ need_bondorder;
}
fvec fpdVLJ = cij * dVLJ * ( c_1_0 + Str * ( Stb - c_1_0));
fvec fpdStr = dStr * cij * ( Stb * VLJ - VLJ);
fvec fpair = r2inv * rij * ( fvec::setzero() - ( fpdVLJ + fpdStr));
fvec evdwl = VA * Stb + cij * VLJ * ( c_1_0 - Str);
fvec fix = fpair * delx + fij[0];
fvec fiy = fpair * dely + fij[1];
fvec fiz = fpair * delz + fij[2];
result_f_i_x = fvec::mask_add(result_f_i_x, current_mask, result_f_i_x,
fix);
result_f_i_y = fvec::mask_add(result_f_i_y, current_mask, result_f_i_y,
fiy);
result_f_i_z = fvec::mask_add(result_f_i_z, current_mask, result_f_i_z,
fiz);
result_eng = fvec::mask_add(result_eng, current_mask, result_eng, evdwl);
ivec j_dbl_idx = ivec::mullo(j, c_i4);
avec fjx = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx,
&ka->result_f[0].x, sizeof(acc_t));
avec fjy = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx,
&ka->result_f[0].y, sizeof(acc_t));
avec fjz = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx,
&ka->result_f[0].z, sizeof(acc_t));
fjx = fjx - fix;
fjy = fjy - fiy;
fjz = fjz - fiz;
avec::mask_i32loscatter(&ka->result_f[0].x, current_mask, j_dbl_idx, fjx,
sizeof(acc_t));
avec::mask_i32loscatter(&ka->result_f[0].y, current_mask, j_dbl_idx, fjy,
sizeof(acc_t));
avec::mask_i32loscatter(&ka->result_f[0].z, current_mask, j_dbl_idx, fjz,
sizeof(acc_t));
if (bvec::test_any_set(need_path_force)) {
fvec dC = VLJ * ( Str * Stb + c_1_0 - Str);
#pragma noinline
aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
}
jj += fvec::VL;
rest_j = jj < jnum;
}
ka->result_f[i].x += fvec::reduce_add(result_f_i_x);
ka->result_f[i].y += fvec::reduce_add(result_f_i_y);
ka->result_f[i].z += fvec::reduce_add(result_f_i_z);
}
for (int itype = 0; itype < 2; itype++) {
for (int jtype = 0; jtype < 2; jtype++) {
for (int l = 0; l < num_bo[itype][jtype]; l++) {
ref_lennard_jones_single_interaction(ka,ivec::at(i_bo[itype][jtype],l),
ivec::at(j_bo[itype][jtype], l),
MORSEFLAG);
}
}
}
ka->result_eng += fvec::reduce_add(result_eng);
}
};
template<typename flt_t, typename acc_t>
void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag) {
#ifdef LMP_INTEL_AIREBO_REF
ref_lennard_jones(ka, morseflag);
#else
if (morseflag) {
aut_wrap<flt_t,acc_t>::template aut_lennard_jones<1>(ka);
} else {
aut_wrap<flt_t,acc_t>::template aut_lennard_jones<0>(ka);
}
#endif
}
template<typename flt_t, typename acc_t>
void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
#ifdef LMP_INTEL_AIREBO_REF
ref_rebo_neigh(ka);
#else
aut_wrap<flt_t,acc_t>::aut_rebo_neigh(ka);
#endif
}
template<typename flt_t, typename acc_t>
void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag) {
#ifdef LMP_INTEL_AIREBO_REF
ref_frebo(ka, torsion_flag);
#else
aut_wrap<flt_t,acc_t>::aut_frebo(ka, torsion_flag);
#endif
}
#ifdef __INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
}

Event Timeline