pair_tersoff_intel.cpp
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "pair_tersoff_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
// Currently the Intel compiler is required for this pair style.
// For convenience, the base class routines are called if it is not used.
#ifndef __INTEL_COMPILER
using namespace LAMMPS_NS;
PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
}
void PairTersoffIntel::compute(int eflag, int vflag)
{
PairTersoff::compute(eflag, vflag);
}
void PairTersoffIntel::init_style()
{
if (comm->me == 0) {
error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
"Using MANYBODY version.");
}
PairTersoff::init_style();
}
#else
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#include "intel_intrinsics.h"
#include "math_const.h"
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include "group.h"
#include "kspace.h"
#include "modify.h"
#include "suffix.h"
using namespace LAMMPS_NS;
using namespace MathConst;
/* ---------------------------------------------------------------------- */
PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
}
/* ---------------------------------------------------------------------- */
// Dispatch the requested precision
void PairTersoffIntel::compute(int eflag, int vflag)
{
if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
} else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
} else {
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
fix->balance_stamp();
vflag_fdotr = 0;
}
// Dispatch based on the extent of the computation:
// do we need to calculate energy and/or virial?
template <class flt_t, class acc_t>
void PairTersoffIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag,vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
nthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
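// In each case below, the first eval() call handles the offloaded portion
// (0 .. offload_end) and the second one the host portion (host_start .. inum).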
if (eflag) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
}
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif
// The complete Tersoff computation kernel is encapsulated here.
// Everything is static; the class just serves as a unit of organization.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> {
// instantiate the vector library and import the types
typedef typename lmp_intel::vector_routines<flt_t, acc_t, mic> v;
typedef typename v::fvec fvec;
typedef typename v::ivec ivec;
typedef typename v::bvec bvec;
typedef typename v::avec avec;
typedef typename v::iarr iarr;
typedef typename v::farr farr;
typedef typename v::aarr aarr;
typedef typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t c_inner_t;
typedef typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t c_outer_t;
// For descriptions of these methods, please have a look at the original code;
// here they are inlined and vectorized.
// attractive() also provides an option to compute zeta.
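// In brief: zeta_vector() computes one k-neighbor's contribution to the
// bond-order argument zeta_ij; force_zeta_vector() evaluates the pair force,
// (optionally) the pair energy, and the prefactor that later scales the
// three-body terms; attractive_vector() computes the three-body force
// contributions on i, j and k.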
static fvec zeta_vector(
const c_inner_t * param,
ivec xjw, bvec mask,
fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz
);
static void force_zeta_vector(
const c_outer_t * param,
ivec xjw,
bvec mask,
fvec vrijsq, fvec vzeta_ij,
fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
bvec vmask_repulsive
);
template<bool ZETA>
static void attractive_vector(
const c_inner_t * param,
ivec xjw,
bvec mask,
fvec vprefactor,
fvec vrijsq, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz,
fvec *fix, fvec *fiy, fvec *fiz,
fvec *fjx, fvec *fjy, fvec *fjz,
fvec *fkx, fvec *fky, fvec *fkz,
fvec *zeta
);
// perform the actual computation
template<bool EVFLAG, bool EFLAG>
static void kernel(
int iito, int iifrom, int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const numneighhalf,
const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
);
// perform one step of the calculation; the i-j pairs of atoms are passed in (is, js)
template<int EVFLAG, int EFLAG>
static void kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const cnumneigh,
const int * _noalias const firstneigh,
int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5,
int compress_idx, iarr is, iarr js, bvec vmask_repulsive
);
// perform one step of the calculation; as opposed to the previous method,
// i is now fixed and a number of js is passed in
template<int EVFLAG, int EFLAG>
static void kernel_step_const_i(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
int compress_idx, int i, iarr js, bvec vmask_repulsive
);
};
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
/* ---------------------------------------------------------------------- */
// Dispatch to the correct kernel instantiation and perform all the work
// necessary for offloading. In this routine we enter the Phi.
// This method is nearly identical to what happens in the other /intel styles.
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairTersoffIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
tagint * _noalias tag = this->atom->tag;
flt_t * _noalias const q = buffers->get_q(offload);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const numneighhalf = buffers->get_atombin();
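// The kernel treats the first numneighhalf[i] entries of atom i's neighbor
// list as the "half list" portion and adds the repulsive (pairwise) terms
// only for those, so each pair is counted once.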
const int * _noalias const firstneigh = buffers->firstneigh(list);
typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
typedef typename ForceConst<flt_t>::c_cutoff_t c_cutoff_t;
const c_outer_t * _noalias const c_outer = fc.c_outer[0];
const c_inner_t * _noalias const c_inner = fc.c_inner[0][0];
const c_cutoff_t * _noalias const c_inner_cutoff = fc.c_cutoff_inner[0][0];
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
#ifdef _LMP_INTEL_OFFLOAD
int *overflow = fix->get_off_overflow_flag();
double *timer_compute = fix->off_watch_pair();
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
#pragma offload target(mic:_cop) if(offload) \
in(c_inner, c_outer :length(0) alloc_if(0) free_if(0)) \
in(c_inner_cutoff :length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
in(f_stride,nlocal,minlocal,separate_flag,offload) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(f_start)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
*timer_compute = MIC_Wtime();
#endif
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
{
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
#define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
// Pick the variable-i algorithm only under specific conditions;
// fall back to the scalar algorithm for very short vectors.
int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
bool pack_i = VL >= 8 &&
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
bool use_scalar = VL < 4;
if (use_scalar) {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
} else if (pack_i) {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
} else {
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
}
if (EVFLAG) {
if (EFLAG) oevdwl += sevdwl;
if (vflag == 1) {
ov0 += sv0;
ov1 += sv1;
ov2 += sv2;
ov3 += sv3;
ov4 += sv4;
ov5 += sv5;
}
}
}
#ifndef _LMP_INTEL_OFFLOAD
if (vflag == 2)
#endif
{
#if defined(_OPENMP)
#pragma omp barrier
#endif
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
nlocal, minlocal, nthreads, f_start, f_stride,
x, offload);
}
} // end of omp parallel region
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = 0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
}
#ifdef _LMP_INTEL_OFFLOAD
#ifdef __MIC__
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
#endif
} // end of offload region
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EVFLAG)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
// As in any other /intel pair style
void PairTersoffIntel::init_style()
{
if (atom->tag_enable == 0)
error->all(FLERR,"Pair style Tersoff requires atom IDs");
if (force->newton_pair == 0)
error->all(FLERR,"Pair style Tersoff requires newton pair on");
// need a full neighbor list
int irequest = neighbor->request(this);
neighbor->requests[irequest]->half = 0;
neighbor->requests[irequest]->full = 1;
neighbor->requests[irequest]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
pack_force_const(force_const_single, fix->get_mixed_buffers());
fix->get_mixed_buffers()->need_tag(1);
} else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
fix->get_double_buffers()->need_tag(1);
pack_force_const(force_const_double, fix->get_double_buffers());
} else {
pack_force_const(force_const_single, fix->get_single_buffers());
fix->get_single_buffers()->need_tag(1);
}
}
// As in any other /intel pair style
template <class flt_t, class acc_t>
void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat the cutsq calculation because this is done after the call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
}
}
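// Flatten the per-element Tersoff parameters into the flat, type-indexed
// tables used by the vector kernels: c_inner / c_inner_loop hold the
// three-body (i,j,k) parameters, c_outer / c_first_loop / c_second_loop the
// pairwise (i,j) parameters, and the c_cutoff_* tables the squared cutoffs.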
for (int i = 1; i < tp1; i++) {
for (int j = 1; j < tp1; j++) {
fc.c_inner_loop[i][j][0].d2 = 1.0;
fc.c_inner_loop[i][0][j].d2 = 1.0;
fc.c_inner_loop[0][i][j].d2 = 1.0;
for (int k = 1; k < tp1; k++) {
Param * param = &params[elem2param[map[i]][map[j]][map[k]]];
fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);
fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
}
Param * param = &params[elem2param[map[i]][map[j]][map[j]]];
fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_first_loop[i][j].bigr = static_cast<flt_t>(param->bigr);
fc.c_first_loop[i][j].bigd = static_cast<flt_t>(param->bigd);
fc.c_first_loop[i][j].lam1 = static_cast<flt_t>(param->lam1);
fc.c_first_loop[i][j].biga = static_cast<flt_t>(param->biga);
fc.c_second_loop[i][j].lam2 = static_cast<flt_t>(param->lam2);
fc.c_second_loop[i][j].beta = static_cast<flt_t>(param->beta);
fc.c_second_loop[i][j].bigb = static_cast<flt_t>(param->bigb);
fc.c_second_loop[i][j].powern = static_cast<flt_t>(param->powern);
fc.c_second_loop[i][j].c1 = static_cast<flt_t>(param->c1);
fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
fc.c_outer[i][j].lam1 = static_cast<flt_t>(param->lam1);
fc.c_outer[i][j].biga = static_cast<flt_t>(param->biga);
fc.c_outer[i][j].lam2 = static_cast<flt_t>(param->lam2);
fc.c_outer[i][j].beta = static_cast<flt_t>(param->beta);
fc.c_outer[i][j].bigb = static_cast<flt_t>(param->bigb);
fc.c_outer[i][j].powern = static_cast<flt_t>(param->powern);
fc.c_outer[i][j].c1 = static_cast<flt_t>(param->c1);
fc.c_outer[i][j].c2 = static_cast<flt_t>(param->c2);
fc.c_outer[i][j].c3 = static_cast<flt_t>(param->c3);
fc.c_outer[i][j].c4 = static_cast<flt_t>(param->c4);
}
fc.c_outer[i][0].cutsq = 0.;
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
typename ForceConst<flt_t>::c_first_loop_t * c_first_loop = fc.c_first_loop[0];
typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_outer = fc.c_cutoff_outer[0];
typename ForceConst<flt_t>::c_outer_t * c_outer = fc.c_outer[0];
typename ForceConst<flt_t>::c_second_loop_t * c_second_loop = fc.c_second_loop[0];
typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int ntypes_pad = ntypes + VL - ntypes % VL;
int tp1sq = tp1 * tp1;
int tp1cb = tp1 * tp1 * tp1;
int tp1cb_pad = tp1 * tp1 * ntypes_pad;
#pragma offload_transfer target(mic:_cop) \
in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
#endif
}
/* ---------------------------------------------------------------------- */
// As in any other /intel pair style
template <class flt_t>
void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
Memory *memory,
const int cop) {
if ( (ntypes != _ntypes) ) {
if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
c_first_loop_t * oc_first_loop = c_first_loop[0];
c_second_loop_t * oc_second_loop = c_second_loop[0];
c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
c_inner_t * oc_inner = c_inner[0][0];
c_outer_t * oc_outer = c_outer[0];
if (c_first_loop != NULL && c_second_loop != NULL &&
c_inner_loop != NULL && _cop >= 0) {
#pragma offload_transfer target(mic:cop) \
nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
}
#endif
_memory->destroy(c_first_loop);
_memory->destroy(c_second_loop);
_memory->destroy(c_inner_loop);
_memory->destroy(c_cutoff_outer);
_memory->destroy(c_cutoff_inner);
_memory->destroy(c_inner);
_memory->destroy(c_outer);
}
if (ntypes > 0) {
_cop = cop;
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes_pad = ntypes + VL - ntypes % VL;
memory->create(c_first_loop,ntypes,ntypes,"fc.c_first_loop");
memory->create(c_second_loop,ntypes,ntypes,"fc.c_second_loop");
memory->create(c_cutoff_outer,ntypes,ntypes,"fc.c_cutoff_outer");
memory->create(c_inner_loop,ntypes,ntypes,ntypes,"fc.c_inner_loop");
memory->create(c_cutoff_inner,ntypes,ntypes,ntypes_pad,"fc.c_cutoff_inner");
memory->create(c_inner,ntypes,ntypes,ntypes,"fc.c_inner");
memory->create(c_outer,ntypes,ntypes,"fc.c_outer");
#ifdef _LMP_INTEL_OFFLOAD
c_first_loop_t * oc_first_loop = c_first_loop[0];
c_second_loop_t * oc_second_loop = c_second_loop[0];
c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
c_inner_t * oc_inner = c_inner[0][0];
c_outer_t * oc_outer = c_outer[0];
int tp1sq = ntypes * ntypes;
int tp1cb = ntypes * ntypes * ntypes;
int tp1cb_pad = ntypes * ntypes * ntypes_pad;
if (oc_first_loop != NULL && oc_second_loop != NULL &&
oc_inner_loop != NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \
nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_second_loop: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_cutoff_outer: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_outer: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oc_inner_loop: length(tp1cb) alloc_if(1) free_if(0)) \
nocopy(oc_inner: length(tp1cb) alloc_if(1) free_if(0)) \
nocopy(oc_cutoff_inner: length(tp1cb_pad) alloc_if(1) free_if(0))
}
#endif
}
}
_ntypes=ntypes;
_memory=memory;
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
// The number of k-iterations up to which we cache force contributions
static const int N_CACHE = 8;
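// While zeta is being accumulated, the force contributions of the first
// N_CACHE k-iterations are kept in registers; once the prefactor is known,
// they are scaled and written out, and only the remaining k-iterations need
// to be revisited in a second pass.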
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl,
avec *vsv0,
avec *vsv1,
avec *vsv2,
avec* vsv3,
avec *vsv4,
avec *vsv5,
int compress_idx,
iarr is,
iarr js,
bvec vmask_repulsive
) {
ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
ivec v_i1(1);
fvec v_2(0.);
fvec v_0_5(0.5);
ivec v_i0(0);
ivec v_i_ntypes(ntypes);
ivec v_i_NEIGHMASK(NEIGHMASK);
farr fx, fy, fz, fw;
int cache_idx = 0;
fvec vfkx_cache[N_CACHE];
fvec vfky_cache[N_CACHE];
fvec vfkz_cache[N_CACHE];
ivec vks_cache[N_CACHE];
bvec vmask_cache[N_CACHE];
ivec vkks_final_cache;
bvec vmask_final_cache;
iarr ts;
// compute all the quantities that depend only on i and j
// TODO: We could extract this from the driver routine
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
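// vis, vjs and (below) vks hold byte offsets (index * 4 * sizeof(fscal)) into
// the packed atom array, so they can be fed to the gather routines directly;
// they are divided by the same factor again where forces are scattered out.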
bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
ivec vw_i = v_i0;
v::gather_x(vis, vmask, x, &vx_i, &vy_i, &vz_i, &vw_i);
fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
ivec vw_j = v_i0;
v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
ivec vis_orig = v::int_load_vl(is);
ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
fvec vzeta = v::zero();
fvec vfxtmp = v::zero(), vfytmp = v::zero(), vfztmp = v::zero();
fvec vfjxtmp = v::zero(), vfjytmp = v::zero(), vfjztmp = v::zero();
// This piece of code facilitates the traversal of the k loop assuming
// nothing about i. As such, it uses masking to avoid superfluous loads
// and fast-forwards each lane until work is available.
// This is useful because we cannot make assumptions as to where in the
// neighbor list the atoms within the cutoff might be.
// We also implement the caching here, i.e. we collect force contributions
// due to zeta.
// This means that you will see four loops:
// 1. the loop that does the zeta calculation and caches the force contributions
// 2. the loop that processes the remaining zeta calculations
// 3. the loop that updates the forces based on the cached force contributions
// 4. the loop that computes the force contributions for the remainder
{
ivec vkks = v_i0;
bvec vactive_mask = vmask;
bvec veff_old_mask(0);
ivec vks, vw_k;
fvec vx_k, vy_k, vz_k, vcutsq;
while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = (vx_k - vx_i);
fvec vdy_ik = (vy_k - vy_i);
fvec vdz_ik = (vz_k - vz_i);
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
fvec vzeta_contrib;
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<true>(c_inner,vc_idx,veff_mask,fvec(1.),
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
&vzeta_contrib);
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
vfkx_cache[cache_idx] = vfkx;
vfky_cache[cache_idx] = vfky;
vfkz_cache[cache_idx] = vfkz;
vks_cache[cache_idx] = vks;
vmask_cache[cache_idx] = veff_mask;
cache_idx += 1;
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
vkks = vkks + v_i1;
veff_old_mask = bvec(0);
} else {
vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
veff_old_mask = veff_mask;
}
vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
}
vkks_final_cache = vkks;
vmask_final_cache = vactive_mask;
while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = (vx_k - vx_i);
fvec vdy_ik = (vy_k - vy_i);
fvec vdz_ik = (vz_k - vz_i);
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
fvec vzeta_contrib;
vzeta_contrib = zeta_vector(c_inner,vc_idx,veff_mask,vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
vkks = vkks + v_i1;
veff_old_mask = bvec(0);
} else {
vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
veff_old_mask = veff_mask;
}
vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
}
}
fvec vfpair, vevdwl, vprefactor, vfwtmp, vfjwtmp;
force_zeta_vector(c_outer, vc_idx_ij, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
vfxtmp = vfxtmp * vprefactor + vdx_ij * vfpair;
vfytmp = vfytmp * vprefactor + vdy_ij * vfpair;
vfztmp = vfztmp * vprefactor + vdz_ij * vfpair;
vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
if (EVFLAG) {
if (EFLAG) {
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
if (eatom) {
v::store(fw, (v_0_5 * vevdwl));
}
}
if (vflag == 1) {
*vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
*vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
*vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
*vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
*vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
*vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
}
}
{
while (cache_idx-- > 0) {
fvec vfkx = vprefactor * vfkx_cache[cache_idx];
fvec vfky = vprefactor * vfky_cache[cache_idx];
fvec vfkz = vprefactor * vfkz_cache[cache_idx];
ivec vks = vks_cache[cache_idx];
bvec veff_mask = vmask_cache[cache_idx];
v::store(fx, vfkx);
v::store(fy, vfky);
v::store(fz, vfkz);
v::int_store(ts, vks);
for (int t = 0; t < v::VL; t++) {
if (v::mask_test_at(veff_mask, t)) {
int t_ = ts[t] / (4 * sizeof(typename v::fscal));
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
}
}
}
ivec vkks = vkks_final_cache;
bvec vactive_mask = vmask_final_cache;
bvec veff_old_mask(0);
ivec vks, vw_k;
fvec vx_k, vy_k, vz_k, vcutsq;
while (! v::mask_testz(vactive_mask)) {
bvec vnew_mask = vactive_mask & ~ veff_old_mask;
vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, vks);
bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<false>(c_inner,vc_idx,veff_mask,vprefactor,
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
0);
vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
v::store(fx, vfkx);
v::store(fy, vfky);
v::store(fz, vfkz);
v::int_store(ts, vks);
for (int t = 0; t < v::VL; t++) {
if (v::mask_test_at(veff_mask, t)) {
int t_ = ts[t] / (4 * sizeof(typename v::fscal));
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
}
}
vkks = vkks + v_i1;
veff_old_mask = bvec(0);
} else {
vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
veff_old_mask = veff_mask;
}
vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
} // while (vactive_mask != 0)
} // section k
// We cannot make any assumptions regarding conflicts,
// so we sequentialize this part.
// TODO: Once AVX-512 is around, check out VPCONFLICT
v::store(fx, vfjxtmp);
v::store(fy, vfjytmp);
v::store(fz, vfjztmp);
for (int t = 0; t < compress_idx; t++) {
int t_ = js[t];
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
if (EVFLAG && EFLAG && eatom) {
f[t_].w += fw[t];
}
}
v::store(fx, vfxtmp);
v::store(fy, vfytmp);
v::store(fz, vfztmp);
for (int t = 0; t < compress_idx; t++) {
int t_ = is[t];
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
if (EVFLAG && EFLAG && eatom) {
f[t_].w += fw[t];
}
}
}
// Specialized kernel step for fixed i. This means that we don't have to use
// the convoluted iteration scheme above, as the loop variables are uniform.
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EVFLAG, int EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl,
avec *vsv0,
avec *vsv1,
avec *vsv2,
avec* vsv3,
avec *vsv4,
avec *vsv5,
int compress_idx,
int i,
iarr js,
bvec vmask_repulsive
) {
typedef typename v::fvec fvec;
typedef typename v::ivec ivec;
typedef typename v::bvec bvec;
typedef typename v::farr farr;
typedef typename v::iarr iarr;
typedef typename v::avec avec;
typedef typename v::aarr aarr;
ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
ivec v_i1(1), v_i0(0), v_i_ntypes(ntypes), v_i_NEIGHMASK(NEIGHMASK);
fvec v_0_5(0.5);
int cache_idx = 0;
fvec vfkx_cache[N_CACHE];
fvec vfky_cache[N_CACHE];
fvec vfkz_cache[N_CACHE];
int k_cache[N_CACHE];
bvec vmask_cache[N_CACHE];
int kk_final_cache;
aarr fx, fy, fz, fw;
iarr ts;
bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
int w_i = x[i].w;
ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
ivec vw_j = v_i0;
v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
int cnumneigh_i = cnumneigh[i];
int numneigh_i = numneigh[i];
ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
avec vzeta = v::acc_zero();
avec vfxtmp = v::acc_zero(), vfytmp = v::acc_zero(), vfztmp = v::acc_zero();
avec vfjxtmp = v::acc_zero(), vfjytmp = v::acc_zero(), vfjztmp = v::acc_zero();
// Same structure as kernel_step, just simpler since all vector lanes
// iterate over the same k
int kk = 0;
for (; kk < numneigh_i && cache_idx < N_CACHE; kk++) {
int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
fvec vx_k(x[k].x);
fvec vy_k(x[k].y);
fvec vz_k(x[k].z);
int w_k = x[k].w;
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) {
fvec vzeta_contrib;
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
&vzeta_contrib);
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
vmask_cache[cache_idx] = veff_mask;
k_cache[cache_idx] = k;
cache_idx += 1;
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
}
}
kk_final_cache = kk;
for (; kk < numneigh_i; kk++) {
int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
fvec vx_k(x[k].x);
fvec vy_k(x[k].y);
fvec vz_k(x[k].z);
int w_k = x[k].w;
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) {
fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
}
}
fvec vfpair, vevdwl, vprefactor, vfwtmp;
force_zeta_vector(&c_outer[ntypes * w_i], vc_idx_j, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
avec vaprefactor(vprefactor);
vfxtmp = vfxtmp * vaprefactor + avec(vdx_ij * vfpair);
vfytmp = vfytmp * vaprefactor + avec(vdy_ij * vfpair);
vfztmp = vfztmp * vaprefactor + avec(vdz_ij * vfpair);
vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
if (EVFLAG) {
if (EFLAG) {
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
if (eatom) {
vfwtmp = v_0_5 * vevdwl;
v::store(fw, vfwtmp);
}
}
if (vflag == 1) {
*vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
*vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
*vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
*vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
*vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
*vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
}
}
while (cache_idx-- > 0) {
fvec vfkx = vprefactor * vfkx_cache[cache_idx];
fvec vfky = vprefactor * vfky_cache[cache_idx];
fvec vfkz = vprefactor * vfkz_cache[cache_idx];
int k = k_cache[cache_idx];
bvec veff_mask = vmask_cache[cache_idx];
f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
}
for (int kk = kk_final_cache; kk < numneigh_i; kk++) {
int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
fvec vx_k(x[k].x);
fvec vy_k(x[k].y);
fvec vz_k(x[k].z);
int w_k = x[k].w;
fvec vdx_ik = vx_k - vx_i;
fvec vdy_ik = vy_k - vy_i;
fvec vdz_ik = vz_k - vz_i;
fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
if (! v::mask_testz(veff_mask)) {
fvec vfix, vfiy, vfiz;
fvec vfjx, vfjy, vfjz;
fvec vfkx, vfky, vfkz;
attractive_vector<false>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,vprefactor,
vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
&vfix,&vfiy,&vfiz,
&vfjx,&vfjy,&vfjz,
&vfkx,&vfky,&vfkz,
0);
vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
}
}
// TODO: This could be a scatter
v::acc_store(fx, vfjxtmp);
v::acc_store(fy, vfjytmp);
v::acc_store(fz, vfjztmp);
for (int t = 0; t < compress_idx; t++) {
int t_ = js[t];
f[t_].x += fx[t];
f[t_].y += fy[t];
f[t_].z += fz[t];
if (EVFLAG && EFLAG && eatom) {
f[t_].w += fw[t];
}
}
f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
if (EVFLAG && EFLAG && eatom) {
f[i].w += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
}
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool EVFLAG, bool EFLAG>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
int iito, int iifrom, int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const numneighhalf,
const int * _noalias const cnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
) {
int compress_idx = 0;
int ii, jj;
iarr is, js;
avec vsevdwl = v::acc_zero();
avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
ivec vj, v_NEIGHMASK(NEIGHMASK);
bvec vmask_repulsive(0);
iarr repulsive_flag = {0};
// If you want to get the very most out of this, uncomment the pragma below.
// Consider getting a coffee or doing something else while it compiles.
// Also good for heating.
//#pragma forceinline recursive
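// The (i, j) pairs within the outer cutoff are compressed into the is/js
// arrays; whenever a full vector of pairs has been collected (or, in the
// fixed-i variant, the j loop for the current i ends), one vectorized
// kernel step is issued for them.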
for (ii = iifrom; ii < iito; ii++) {
// Right now this loop is scalar, to allow the compiler to do
// its prefetching magic.
int i = ii;
int w_i = x[i].w;
flt_t x_i = x[i].x;
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int jlist_off_i = cnumneigh[i];
int jnum = numneigh[ii];
for (jj = 0; jj < jnum; jj++) {
int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
int w_j = x[j].w;
flt_t dx_ij = x[j].x - x_i;
flt_t dy_ij = x[j].y - y_i;
flt_t dz_ij = x[j].z - z_i;
flt_t rsq = dx_ij*dx_ij + dy_ij*dy_ij + dz_ij*dz_ij;
flt_t cutsq = c_outer[w_i * ntypes + w_j].cutsq;
if (rsq < cutsq) {
is[compress_idx] = ii;
js[compress_idx] = j;
if (jj < numneighhalf[i])
repulsive_flag[compress_idx] = 1;
compress_idx += 1;
}
if (pack_i) {
if (compress_idx == v::VL) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step<EVFLAG,EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
is, js, vmask_repulsive
);
compress_idx = 0;
v::int_clear_arr(repulsive_flag);
}
} else {
if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step_const_i<EVFLAG,EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
i, js, vmask_repulsive
);
compress_idx = 0;
v::int_clear_arr(repulsive_flag);
}
}
}
}
if (compress_idx > 0) {
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
is, js, vmask_repulsive
);
}
if (EVFLAG) {
if (EFLAG) {
*evdwl += v::acc_reduce_add(vsevdwl);
}
if (vflag == 1) {
*ov0 += v::acc_reduce_add(vsv0);
*ov1 += v::acc_reduce_add(vsv1);
*ov2 += v::acc_reduce_add(vsv2);
*ov3 += v::acc_reduce_add(vsv3);
*ov4 += v::acc_reduce_add(vsv4);
*ov5 += v::acc_reduce_add(vsv5);
}
}
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
const c_inner_t * param,
ivec xjw, bvec mask,
fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz
) {
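// Computes one k-neighbor's zeta contribution,
//   fc(r_ik) * g(theta_ijk) * exp(arg),
// where arg = lam3 * (r_ij - r_ik), cubed if powermint == 3, and
// g(theta) = gamma * (1 + c^2/d^2 - c^2/(d^2 + (h - cos(theta))^2)),
// algebraically rearranged so that only a single reciprocal is needed.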
fvec v_1_0(1.0);
fvec v_0_5(0.5);
fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = v::zero(), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = v::zero();
// TODO: Specialize on the number of species
v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
fvec vrik = sqrt(rsq2);
fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
fvec vhcth = vph - vcostheta;
fvec vgijk_a = vhcth * vhcth;
fvec vgijk = vpgamma * (v_1_0 + vpc2 * vgijk_a * v::recip(vpd2 * (vpd2 + vgijk_a)));
fvec varg1 = vplam3 * (vrij - vrik);
fvec varg3 = varg1 * varg1 * varg1;
bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
fvec varg = v::blend(mask_ex, varg1, varg3);
fvec vex_delr = v::min(fvec(1.e30), exp(varg));
bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
fvec vfc = v_1_0;
// It is quite important to check the mask here:
// some simulations never/rarely invoke this branch.
if (! v::mask_testz(vmask_need_sine)) {
vfc = v::blend(vmask_need_sine, vfc,
v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
}
return vgijk * vex_delr * vfc;
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
const c_outer_t * param,
ivec xjw,
bvec mask,
fvec vrij, fvec vzeta_ij,
fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
bvec vmask_repulsive
) {
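// Vectorized analogue of the scalar repulsive() and force_zeta() routines:
// computes fpair (the pair force magnitude divided by r_ij), the prefactor
// -0.5 * fa * dbij/dzeta that scales the three-body forces, and optionally
// the pair energy. The repulsive part is added only where vmask_repulsive
// is set; the vmc1..vmc5 branches mirror the scalar ters_bij()/ters_bij_d().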
fvec v_0_0(0.0);
fvec v_0_5(0.5);
fvec v_m0_5(-0.5);
fvec v_1_0(1.0);
fvec v_m1_0(-1.0);
fvec v_2_0(2.0);
fvec vpbigr = v::zero(), vpbigd = v::zero(), vplam1 = v::zero(), vpbiga = v::zero(), vplam2 = v::zero(), vpbeta = v::zero(), vpbigb = v::zero(), vppowern = v::zero();
v::gather_8(xjw, mask, &param[0].bigr, &vpbigr, &vpbigd, &vplam1, &vpbiga, &vplam2, &vpbeta, &vpbigb, &vppowern);
fvec vfccos;
// This is pretty much a literal translation.
bvec vmask_need_sine = v::cmpnle(vrij, vpbigr - vpbigd) & mask;
fvec vfc = v_1_0;
fvec vfc_d = v_0_0;
if (! v::mask_testz(vmask_need_sine)) {
fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
vfc = v::blend(vmask_need_sine, vfc,
v_0_5 * (v_1_0 - v::sincos(&vfccos, vtmp * (vrij - vpbigr))));
vfc_d = v::blend(vmask_need_sine, vfc_d, v_m0_5 * vtmp * vfccos);
}
fvec vpminus_lam2 = - vplam2;
fvec vpminus_bigb = -vpbigb;
fvec vexp = exp(vpminus_lam2 * vrij);
fvec vfa = vpminus_bigb * vexp * vfc;
fvec vfa_d = vpminus_lam2 * vfa + vpminus_bigb * vexp * vfc_d;
fvec vpc1 = v::zero(), vpc2 = v::zero(), vpc3 = v::zero(), vpc4 = v::zero();
v::gather_4(xjw, mask, &param[0].c1, &vpc1, &vpc2, &vpc3, &vpc4);
fvec vpminus_powern = - vppowern;
fvec vbij(0.), vbij_d(0.);
fvec vtmp = vpbeta * vzeta_ij;
bvec vmc1 = v::cmple(vpc1, vtmp) & mask;
if (! v::mask_testz(vmc1)) {
vbij = v::invsqrt(vtmp);
vbij_d = vpbeta * v_m0_5 * vbij * v::recip(vtmp);
}
bvec vmc2 = v::cmple(vpc2, vtmp) & ~ vmc1 & mask;
if (! v::mask_testz(vmc2)) {
fvec vpowminus_powern = pow(vtmp, vpminus_powern);
fvec vinvsqrt = v::invsqrt(vtmp);
fvec vrcp2powern = v::recip(v_2_0 * vppowern);
fvec va = (v_1_0 - vpowminus_powern * vrcp2powern) * vinvsqrt;
fvec va_d = vpbeta * v_m0_5 * vinvsqrt * v::recip(vtmp) *
(v_1_0 + v_m0_5 * vpowminus_powern * (v_1_0 + vrcp2powern));
vbij = v::blend(vmc2, vbij, va);
vbij_d = v::blend(vmc2, vbij_d, va_d);
}
bvec vmc3 = v::cmplt(vtmp, vpc4) & ~vmc2 & ~vmc1 & mask;
if (! v::mask_testz(vmc3)) {
vbij = v::blend(vmc3, vbij, v_1_0);
vbij_d = v::blend(vmc3, vbij_d, v_0_0);
}
bvec vmc4 = v::cmple(vtmp, vpc3) & ~vmc3 & ~vmc2 & ~ vmc1 & mask;
if (! v::mask_testz(vmc4)) {
fvec vpowm1 = pow(vtmp, vppowern - v_1_0);
fvec vrcp2powern = v::recip(v_2_0 * vppowern);
fvec va = v_1_0 - vtmp * vrcp2powern * vpowm1;
fvec va_d = v_m0_5 * vpbeta * vpowm1;
vbij = v::blend(vmc4, vbij, va);
vbij_d = v::blend(vmc4, vbij_d, va_d);
}
bvec vmc5 = mask & ~vmc1 & ~vmc2 & ~vmc3 & ~vmc4;
if (! v::mask_testz(vmc5)) {
fvec vtmp_n = pow(vtmp, vppowern);
fvec vpow2 = pow(v_1_0 + vtmp_n, v_m1_0 - v::recip(v_2_0 * vppowern));
fvec va = (v_1_0 + vtmp_n) * vpow2;
fvec va_d = v_m0_5 * vpow2 * vtmp_n * v::recip(vzeta_ij);
vbij = v::blend(vmc5, vbij, va);
vbij_d = v::blend(vmc5, vbij_d, va_d);
}
fvec vtmp_exp = exp(-vplam1 * vrij);
fvec vrep_fforce = vpbiga * vtmp_exp * (vfc_d - vfc * vplam1);
fvec vfz_fforce = v_0_5 * vbij * vfa_d;
*vfpair = v::mask_add(vfz_fforce, vmask_repulsive, vfz_fforce, vrep_fforce) * v::recip(vrij);
*vprefactor = v_m0_5 * vfa * vbij_d;
if (EVDWL) {
fvec vrep_eng = vfc * vpbiga * vtmp_exp;
fvec vfz_eng = v_0_5 * vfa * vbij;
*vevdwl = v::mask_add(vfz_eng, vmask_repulsive, vfz_eng, vrep_eng);
}
}
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<bool ZETA>
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
const c_inner_t * param,
ivec xjw,
bvec mask,
fvec vprefactor,
fvec vrij, fvec rsq2,
fvec vdijx, fvec vdijy, fvec vdijz,
fvec dikx, fvec diky, fvec dikz,
fvec *fix, fvec *fiy, fvec *fiz,
fvec *fjx, fvec *fjy, fvec *fjz,
fvec *fkx, fvec *fky, fvec *fkz,
fvec *zeta
) {
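// Vectorized analogue of the scalar attractive() / ters_zetaterm_d() path:
// given the prefactor, it accumulates the three-body force contributions on
// atoms i, j and k from the derivatives of fc, g(theta) and the exponential
// term; with ZETA set it also returns fc * g * exp as the zeta contribution.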
fvec v_1_0 = fvec(1.0);
fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = fvec(1.0), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = fvec(1.0);
v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
fvec vrijinv = v::recip(vrij);
fvec vrij_hatx = vrijinv * vdijx;
fvec vrij_haty = vrijinv * vdijy;
fvec vrij_hatz = vrijinv * vdijz;
fvec rikinv = invsqrt(rsq2);
fvec rik_hatx = rikinv * dikx;
fvec rik_haty = rikinv * diky;
fvec rik_hatz = rikinv * dikz;
fvec vrik = sqrt(rsq2);
fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
fvec vhcth = vph - vcostheta;
fvec vdenominator = v::recip(vpd2 + vhcth * vhcth);
fvec vgijk = vpgamma * (v_1_0 + vpc2 * v::recip(vpd2) - vpc2 * vdenominator);
fvec vnumerator = fvec(-2.) * vpc2 * vhcth;
fvec vgijk_d = vpgamma * vnumerator * vdenominator * vdenominator;
fvec varg1 = vplam3 * (vrij - vrik);
fvec varg3 = varg1 * varg1 * varg1;
bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
fvec varg = v::blend(mask_ex, varg1, varg3);
fvec vex_delr = min(fvec(1.e30), exp(varg));
fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
fvec vfccos;
fvec vfc = v_1_0;
fvec vfc_d = v::zero();
if (! v::mask_testz(vmask_need_sine)) {
fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
vfc = v::blend(vmask_need_sine, vfc,
fvec(0.5) * (v_1_0 - v::sincos(&vfccos, vtmp * (vrik - vpbigr))));
vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
}
fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
if (ZETA) *zeta = vfc * vgijk * vex_delr;
fvec vminus_costheta = - vcostheta;
fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
*fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
*fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
*fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
*fjx = vprefactor * (vzeta_d_gijk * vdcosdrjx + vzeta_d_ex_delr * vrij_hatx);
*fjy = vprefactor * (vzeta_d_gijk * vdcosdrjy + vzeta_d_ex_delr * vrij_haty);
*fjz = vprefactor * (vzeta_d_gijk * vdcosdrjz + vzeta_d_ex_delr * vrij_hatz);
*fkx = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatx + vzeta_d_gijk * vdcosdrkx);
*fky = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_haty + vzeta_d_gijk * vdcosdrky);
*fkz = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatz + vzeta_d_gijk * vdcosdrkz);
}
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#endif
