pair_gayberne_intel.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Jun 19, 07:28

pair_gayberne_intel.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	This software is distributed under the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing author: W. Michael Brown (Intel)
	------------------------------------------------------------------------- */

	#include "pair_gayberne_intel.h"
	#include "math_extra_intel.h"

	#ifdef _LMP_INTEL_OFFLOAD
	#pragma offload_attribute(push,target(mic))
	#endif
	#include <cmath>
	#ifdef _LMP_INTEL_OFFLOAD
	#pragma offload_attribute(pop)
	#endif

	#include "atom.h"
	#include "comm.h"
	#include "atom_vec_ellipsoid.h"
	#include "force.h"
	#include "memory.h"
	#include "modify.h"
	#include "neighbor.h"
	#include "neigh_list.h"
	#include "neigh_request.h"

	#include "suffix.h"
	using namespace LAMMPS_NS;

	#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
	#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
	#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3

	/* ---------------------------------------------------------------------- */

	PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) :
	PairGayBerne(lmp)
	{
	suffix_flag \|= Suffix::INTEL;
	respa_enable = 0;
	}

	/* ---------------------------------------------------------------------- */

	void PairGayBerneIntel::compute(int eflag, int vflag)
	{
	if (fix->precision()==FixIntel::PREC_MODE_MIXED)
	compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
	force_const_single);
	else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
	compute<double,double>(eflag, vflag, fix->get_double_buffers(),
	force_const_double);
	else
	compute<float,float>(eflag, vflag, fix->get_single_buffers(),
	force_const_single);

	fix->balance_stamp();
	vflag_fdotr = 0;
	}

	template <class flt_t, class acc_t>
	void PairGayBerneIntel::compute(int eflag, int vflag,
	IntelBuffers<flt_t,acc_t> *buffers,
	const ForceConst<flt_t> &fc)
	{
	if (eflag \|\| vflag) {
	ev_setup(eflag, vflag);
	} else evflag = vflag_fdotr = 0;

	const int inum = list->inum;
	const int nall = atom->nlocal + atom->nghost;
	const int nthreads = comm->nthreads;
	const int host_start = fix->host_start_pair();
	const int offload_end = fix->offload_end_pair();
	const int ago = neighbor->ago;

	if (fix->separate_buffers() == 0) {
	fix->start_watch(TIME_PACK);
	const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
	const int * const ellipsoid = atom->ellipsoid;
	QUAT_T * _noalias const quat = buffers->get_quat();

	int packthreads;
	if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
	else packthreads = 1;
	#if defined(_OPENMP)
	#pragma omp parallel if(packthreads > 1)
	#endif
	{
	int ifrom, ito, tid;
	IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
	sizeof(ATOM_T));
	if (ago != 0) buffers->thr_pack(ifrom,ito,ago);

	for (int i = ifrom; i < ito; i++) {
	int qi = ellipsoid[i];
	if (qi > -1) {
	quat[i].w = bonus[qi].quat[0];
	quat[i].i = bonus[qi].quat[1];
	quat[i].j = bonus[qi].quat[2];
	quat[i].k = bonus[qi].quat[3];
	}
	}
	}
	quat[nall].w = (flt_t)1.0;
	quat[nall].i = (flt_t)0.0;
	quat[nall].j = (flt_t)0.0;
	quat[nall].k = (flt_t)0.0;
	fix->stop_watch(TIME_PACK);
	}

	int ovflag = 0;
	if (vflag_fdotr) ovflag = 2;
	else if (vflag) ovflag = 1;
	if (eflag) {
	if (force->newton_pair) {
	eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
	eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
	} else {
	eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
	eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
	}
	} else {
	if (force->newton_pair) {
	eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
	eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
	} else {
	eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
	eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
	}
	}
	}

	template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
	void PairGayBerneIntel::eval(const int offload, const int vflag,
	IntelBuffers<flt_t,acc_t> *buffers,
	const ForceConst<flt_t> &fc,
	const int astart, const int aend)
	{
	const int inum = aend - astart;
	if (inum == 0) return;
	int nlocal, nall, minlocal;
	fix->get_buffern(offload, nlocal, nall, minlocal);

	const int ago = neighbor->ago;
	ATOM_T * _noalias const x = buffers->get_x(offload);
	QUAT_T * _noalias const quat = buffers->get_quat(offload);
	const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
	const int *ellipsoid = atom->ellipsoid;

	#ifdef _LMP_INTEL_OFFLOAD
	if (fix->separate_buffers()) {
	fix->start_watch(TIME_PACK);
	if (offload) {
	#pragma omp parallel
	{
	int ifrom, ito, tid;
	int nthreads = comm->nthreads;
	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
	nthreads, sizeof(ATOM_T));
	if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
	for (int i = ifrom; i < ito; i++) {
	int qi = ellipsoid[i];
	if (qi > -1) {
	quat[i].w = bonus[qi].quat[0];
	quat[i].i = bonus[qi].quat[1];
	quat[i].j = bonus[qi].quat[2];
	quat[i].k = bonus[qi].quat[3];
	}
	}
	int nghost = nall - nlocal;
	if (nghost) {
	IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
	nthreads, sizeof(ATOM_T));
	int offset = 0;
	ifrom += nlocal;
	ito += nlocal;
	if (ago != 0) {
	offset = fix->offload_min_ghost() - nlocal;
	buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
	}
	for (int i = ifrom; i < ito; i++) {
	int qi = ellipsoid[i + offset];
	if (qi > -1) {
	quat[i].w = bonus[qi].quat[0];
	quat[i].i = bonus[qi].quat[1];
	quat[i].j = bonus[qi].quat[2];
	quat[i].k = bonus[qi].quat[3];
	}
	}
	}
	}
	} else {
	if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
	for (int i = fix->host_min_local(); i < nlocal; i++) {
	int qi = ellipsoid[i];
	if (qi > -1) {
	quat[i].w = bonus[qi].quat[0];
	quat[i].i = bonus[qi].quat[1];
	quat[i].j = bonus[qi].quat[2];
	quat[i].k = bonus[qi].quat[3];
	}
	}
	int offset = fix->host_min_ghost() - nlocal;
	if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
	for (int i = nlocal; i < nall; i++) {
	int qi = ellipsoid[i + offset];
	if (qi > -1) {
	quat[i].w = bonus[qi].quat[0];
	quat[i].i = bonus[qi].quat[1];
	quat[i].j = bonus[qi].quat[2];
	quat[i].k = bonus[qi].quat[3];
	}
	}
	}
	fix->stop_watch(TIME_PACK);
	}
	#endif

	// const int * _noalias const ilist = list->ilist;
	const int * _noalias const numneigh = list->numneigh;
	const int * _noalias const cnumneigh = buffers->cnumneigh(list);
	const int * _noalias const firstneigh = buffers->firstneigh(list);
	const flt_t * _noalias const special_lj = fc.special_lj;

	const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
	const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
	const FC_PACKED3_T * _noalias const ic = fc.ic;
	const flt_t mu = fc.mu;
	const flt_t gamma = fc.gamma;
	const flt_t upsilon = fc.upsilon;

	flt_t * const rsq_formi = fc.rsq_form[0];
	flt_t * const delx_formi = fc.delx_form[0];
	flt_t * const dely_formi = fc.dely_form[0];
	flt_t * const delz_formi = fc.delz_form[0];
	int * const jtype_formi = fc.jtype_form[0];
	int * const jlist_formi = fc.jlist_form[0];

	const int ntypes = atom->ntypes + 1;
	const int eatom = this->eflag_atom;

	// Determine how much data to transfer
	int x_size, q_size, f_stride, ev_size, separate_flag;
	IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
	buffers, offload, fix, separate_flag,
	x_size, q_size, ev_size, f_stride);

	int tc;
	FORCE_T * _noalias f_start;
	acc_t * _noalias ev_global;
	IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
	const int max_nbors = _max_nbors;
	const int nthreads = tc;

	int pad = 1;
	if (offload) {
	if (INTEL_MIC_NBOR_PAD > 1)
	pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
	} else {
	if (INTEL_NBOR_PAD > 1)
	pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
	}
	const int pad_width = pad;

	#ifdef _LMP_INTEL_OFFLOAD
	int *overflow = fix->get_off_overflow_flag();
	double *timer_compute = fix->off_watch_pair();

	if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
	#pragma offload target(mic:_cop) if(offload) \
	in(special_lj:length(0) alloc_if(0) free_if(0)) \
	in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
	in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
	in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
	in(firstneigh:length(0) alloc_if(0) free_if(0)) \
	in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
	in(numneigh:length(0) alloc_if(0) free_if(0)) \
	in(x:length(x_size) alloc_if(0) free_if(0)) \
	in(quat:length(nall+1) alloc_if(0) free_if(0)) \
	in(overflow:length(0) alloc_if(0) free_if(0)) \
	in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
	in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
	out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
	out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
	out(timer_compute:length(1) alloc_if(0) free_if(0)) \
	signal(f_start)
	#endif
	{
	#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
	*timer_compute=MIC_Wtime();
	#endif

	#ifdef _LMP_INTEL_OFFLOAD
	if (separate_flag) {
	if (separate_flag < 3) {
	int all_local = nlocal;
	int ghost_min = overflow[LMP_GHOST_MIN];
	nlocal = overflow[LMP_LOCAL_MAX] + 1;
	int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
	if (nghost < 0) nghost = 0;
	nall = nlocal + nghost;
	separate_flag--;
	int flength;
	if (NEWTON_PAIR) flength = nall;
	else flength = nlocal;
	IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
	separate_flag);
	if (nghost) {
	if (nlocal < all_local \|\| ghost_min > all_local) {
	memmove(x + nlocal, x + ghost_min,
	(nall - nlocal) * sizeof(ATOM_T));
	memmove(quat + nlocal, quat + ghost_min,
	(nall - nlocal) * sizeof(QUAT_T));
	}
	}
	}
	x[nall].x = (flt_t)INTEL_BIGP;
	x[nall].y = (flt_t)INTEL_BIGP;
	x[nall].z = (flt_t)INTEL_BIGP;
	x[nall].w = 1;
	quat[nall].w = (flt_t)1.0;
	quat[nall].i = (flt_t)0.0;
	quat[nall].j = (flt_t)0.0;
	quat[nall].k = (flt_t)0.0;
	}
	#endif

	acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
	if (EFLAG) oevdwl = (acc_t)0.0;
	if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
	if (NEWTON_PAIR == 0) f_start[1].w = 0;

	// loop over neighbors of my atoms
	#if defined(_OPENMP)
	#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
	#endif
	{
	int iifrom, iip, iito, tid;
	IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
	iifrom += astart;
	iito += astart;

	int foff;
	if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2;
	else foff = minlocal*-2;
	FORCE_T * _noalias const f = f_start + foff;
	if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));

	flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
	flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
	flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
	flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
	int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
	int * _noalias const jlist_form = jlist_formi + tid * max_nbors;

	int ierror = 0;
	for (int i = iifrom; i < iito; i += iip) {
	// const int i = ilist[ii];
	const int itype = x[i].w;
	const int ptr_off = itype * ntypes;
	const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
	const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;

	const int * _noalias const jlist = firstneigh + cnumneigh[i];
	const int jnum = numneigh[i];

	const flt_t xtmp = x[i].x;
	const flt_t ytmp = x[i].y;
	const flt_t ztmp = x[i].z;

	flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
	flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
	flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;

	if (ijci[itype].form == ELLIPSE_ELLIPSE) {
	flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
	ME_quat_to_mat_trans(quat[i],a1);
	ME_diag_times3(ic[itype].well,a1,temp);
	ME_transpose_times3(a1,temp,b1);
	ME_diag_times3(ic[itype].shape2,a1,temp);
	ME_transpose_times3(a1,temp,g1);
	}

	acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
	acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
	fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;

	if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
	if (NEWTON_PAIR == 0)
	if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;

	bool multiple_forms = false;
	int packed_j = 0;
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma ivdep
	#endif
	for (int jj = 0; jj < jnum; jj++) {
	int jm = jlist[jj];
	int j = jm & NEIGHMASK;
	const int jtype = x[j].w;

	if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
	flt_t delx = x[j].x-xtmp;
	flt_t dely = x[j].y-ytmp;
	flt_t delz = x[j].z-ztmp;
	flt_t rsq = delx * delx + dely * dely + delz * delz;

	if (rsq < ijci[jtype].cutsq) {
	rsq_form[packed_j] = rsq;
	delx_form[packed_j] = delx;
	dely_form[packed_j] = dely;
	delz_form[packed_j] = delz;
	jtype_form[packed_j] = jtype;
	jlist_form[packed_j] = jm;
	packed_j++;
	}
	} else
	multiple_forms = true;
	}
	const int edge = (packed_j % pad_width);
	if (edge) {
	const int packed_end = packed_j + (pad_width - edge);
	#if defined(LMP_SIMD_COMPILER)
	#pragma loop_count min=1, max=15, avg=8
	#endif
	for ( ; packed_j < packed_end; packed_j++)
	jlist_form[packed_j] = nall;
	}

	// -------------------------------------------------------------

	#ifdef INTEL_V512
	__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
	__assume(packed_j % 8 == 0);
	__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
	#endif
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
	sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
	#endif
	for (int jj = 0; jj < packed_j; jj++) {
	flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
	flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
	flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
	flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
	flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
	flt_t rtor_0, rtor_1, rtor_2;

	const int sbindex = jlist_form[jj] >> SBBITS & 3;
	const int j = jlist_form[jj] & NEIGHMASK;
	flt_t factor_lj = special_lj[sbindex];
	const int jtype = jtype_form[jj];
	const flt_t sigma = ijci[jtype].sigma;
	const flt_t epsilon = ijci[jtype].epsilon;
	const flt_t shape2_0 = ic[jtype].shape2[0];
	const flt_t shape2_1 = ic[jtype].shape2[1];
	const flt_t shape2_2 = ic[jtype].shape2[2];
	flt_t one_eng, evdwl;

	ME_quat_to_mat_trans(quat[j], a2);
	ME_diag_times3(ic[jtype].well, a2, temp);
	ME_transpose_times3(a2, temp, b2);
	ME_diag_times3a(shape2, a2, temp);
	ME_transpose_times3(a2, temp, g2);

	flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
	flt_t temp1, temp2, temp3;

	flt_t r12hat_0, r12hat_1, r12hat_2;
	ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
	flt_t r = sqrt(rsq_form[jj]);

	// compute distance of closest approach

	flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
	ME_plus3(g1, g2, g12);
	flt_t kappa_0, kappa_1, kappa_2;
	ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
	kappa, ierror);

	// tempv = G12^-1*r12hat

	flt_t inv_r = (flt_t)1.0 / r;
	tempv_0 = kappa_0 * inv_r;
	tempv_1 = kappa_1 * inv_r;
	tempv_2 = kappa_2 * inv_r;
	flt_t sigma12 = ME_dot3(r12hat, tempv);
	sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
	flt_t h12 = r - sigma12;

	// energy
	// compute u_r

	flt_t varrho = sigma / (h12 + gamma * sigma);
	flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
	flt_t varrho12 = varrho6 * varrho6;
	flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);

	// compute eta_12

	flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
	flt_t det_g12 = ME_det3(g12);
	eta = std::pow(eta / det_g12, upsilon);

	// compute chi_12

	flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
	flt_t iota_0, iota_1, iota_2;
	ME_plus3(b1, b2, b12);
	ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
	iota, ierror);

	// tempv = G12^-1*r12hat

	tempv_0 = iota_0 * inv_r;
	tempv_1 = iota_1 * inv_r;
	tempv_2 = iota_2 * inv_r;
	flt_t chi = ME_dot3(r12hat, tempv);
	chi = std::pow(chi * (flt_t)2.0, mu);

	// force
	// compute dUr/dr

	temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
	sigma;
	temp1 = temp1 * (flt_t)24.0 * epsilon;
	flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
	flt_t dUr_0, dUr_1, dUr_2;
	temp2 = ME_dot3(kappa, r12hat);
	flt_t uslj_rsq = u_slj / rsq_form[jj];
	dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
	dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
	dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);

	// compute dChi_12/dr

	flt_t dchi_0, dchi_1, dchi_2;
	temp1 = ME_dot3(iota, r12hat);
	temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
	std::pow(chi, (mu - (flt_t)1.0) / mu);
	dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
	dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
	dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);

	temp1 = -eta * u_r;
	temp2 = eta * chi;
	fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
	fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
	fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;

	// torque for particle 1 and 2
	// compute dUr

	tempv_0 = -uslj_rsq * kappa_0;
	tempv_1 = -uslj_rsq * kappa_1;
	tempv_2 = -uslj_rsq * kappa_2;
	ME_vecmat(kappa, g1, tempv2);
	ME_cross3(tempv, tempv2, dUr);
	flt_t dUr2_0, dUr2_1, dUr2_2;

	if (NEWTON_PAIR) {
	ME_vecmat(kappa, g2, tempv2);
	ME_cross3(tempv, tempv2, dUr2);
	}

	// compute d_chi

	ME_vecmat(iota, b1, tempv);
	ME_cross3(tempv, iota, dchi);
	temp1 = (flt_t)-4.0 / rsq_form[jj];
	dchi_0 *= temp1;
	dchi_1 *= temp1;
	dchi_2 *= temp1;
	flt_t dchi2_0, dchi2_1, dchi2_2;

	if (NEWTON_PAIR) {
	ME_vecmat(iota, b2, tempv);
	ME_cross3(tempv, iota, dchi2);
	dchi2_0 *= temp1;
	dchi2_1 *= temp1;
	dchi2_2 *= temp1;
	}

	// compute d_eta

	flt_t deta_0, deta_1, deta_2;
	deta_0 = deta_1 = deta_2 = (flt_t)0.0;
	ME_compute_eta_torque(g12, a1, shape2, temp);
	temp1 = -eta * upsilon;

	tempv_0 = temp1 * temp_0;
	tempv_1 = temp1 * temp_1;
	tempv_2 = temp1 * temp_2;
	ME_mv0_cross3(a1, tempv, tempv2);
	deta_0 += tempv2_0;
	deta_1 += tempv2_1;
	deta_2 += tempv2_2;

	tempv_0 = temp1 * temp_3;
	tempv_1 = temp1 * temp_4;
	tempv_2 = temp1 * temp_5;
	ME_mv1_cross3(a1, tempv, tempv2);
	deta_0 += tempv2_0;
	deta_1 += tempv2_1;
	deta_2 += tempv2_2;

	tempv_0 = temp1 * temp_6;
	tempv_1 = temp1 * temp_7;
	tempv_2 = temp1 * temp_8;
	ME_mv2_cross3(a1, tempv, tempv2);
	deta_0 += tempv2_0;
	deta_1 += tempv2_1;
	deta_2 += tempv2_2;

	// compute d_eta for particle 2

	flt_t deta2_0, deta2_1, deta2_2;
	if (NEWTON_PAIR) {
	deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
	ME_compute_eta_torque(g12, a2, shape2, temp);

	tempv_0 = temp1 * temp_0;
	tempv_1 = temp1 * temp_1;
	tempv_2 = temp1 * temp_2;
	ME_mv0_cross3(a2, tempv, tempv2);
	deta2_0 += tempv2_0;
	deta2_1 += tempv2_1;
	deta2_2 += tempv2_2;

	tempv_0 = temp1 * temp_3;
	tempv_1 = temp1 * temp_4;
	tempv_2 = temp1 * temp_5;
	ME_mv1_cross3(a2, tempv, tempv2);
	deta2_0 += tempv2_0;
	deta2_1 += tempv2_1;
	deta2_2 += tempv2_2;

	tempv_0 = temp1 * temp_6;
	tempv_1 = temp1 * temp_7;
	tempv_2 = temp1 * temp_8;
	ME_mv2_cross3(a2, tempv, tempv2);
	deta2_0 += tempv2_0;
	deta2_1 += tempv2_1;
	deta2_2 += tempv2_2;
	}

	// torque

	temp1 = u_r * eta;
	temp2 = u_r * chi;
	temp3 = chi * eta;

	ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
	(flt_t)-1.0;
	ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
	(flt_t)-1.0;
	ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
	(flt_t)-1.0;

	if (NEWTON_PAIR) {
	rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
	(flt_t)-1.0;
	rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
	(flt_t)-1.0;
	rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
	(flt_t)-1.0;
	}

	one_eng = temp1 * chi;
	#ifndef INTEL_VMASK
	if (jlist_form[jj] == nall) {
	one_eng = (flt_t)0.0;
	fforce_0 = 0.0;
	fforce_1 = 0.0;
	fforce_2 = 0.0;
	ttor_0 = 0.0;
	ttor_1 = 0.0;
	ttor_2 = 0.0;
	rtor_0 = 0.0;
	rtor_1 = 0.0;
	rtor_2 = 0.0;
	}
	#endif

	fforce_0 *= factor_lj;
	fforce_1 *= factor_lj;
	fforce_2 *= factor_lj;
	ttor_0 *= factor_lj;
	ttor_1 *= factor_lj;
	ttor_2 *= factor_lj;

	#ifdef INTEL_VMASK
	if (jlist_form[jj] < nall) {
	#endif
	fxtmp += fforce_0;
	fytmp += fforce_1;
	fztmp += fforce_2;
	t1tmp += ttor_0;
	t2tmp += ttor_1;
	t3tmp += ttor_2;

	if (NEWTON_PAIR) {
	rtor_0 *= factor_lj;
	rtor_1 *= factor_lj;
	rtor_2 *= factor_lj;
	int jp = j * 2;
	f[jp].x -= fforce_0;
	f[jp].y -= fforce_1;
	f[jp].z -= fforce_2;
	jp++;
	f[jp].x += rtor_0;
	f[jp].y += rtor_1;
	f[jp].z += rtor_2;
	}

	if (EFLAG) {
	evdwl = factor_lj * one_eng;
	sevdwl += evdwl;
	if (eatom) {
	fwtmp += (flt_t)0.5 * evdwl;
	if (NEWTON_PAIR)
	f[j2].w += (flt_t)0.5 evdwl;
	}
	}

	if (NEWTON_PAIR == 0) {
	if (vflag == 1) {
	sv0 += delx_form[jj] * fforce_0;
	sv1 += dely_form[jj] * fforce_1;
	sv2 += delz_form[jj] * fforce_2;
	sv3 += delx_form[jj] * fforce_1;
	sv4 += delx_form[jj] * fforce_2;
	sv5 += dely_form[jj] * fforce_2;
	}
	} // EVFLAG
	#ifdef INTEL_VMASK
	}
	#endif
	} // for jj

	// -------------------------------------------------------------

	if (multiple_forms)
	ierror = 2;

	int ip = i * 2;
	if (NEWTON_PAIR) {
	f[ip].x += fxtmp;
	f[ip].y += fytmp;
	f[ip].z += fztmp;
	ip++;
	f[ip].x += t1tmp;
	f[ip].y += t2tmp;
	f[ip].z += t3tmp;
	} else {
	f[ip].x = fxtmp;
	f[ip].y = fytmp;
	f[ip].z = fztmp;
	ip++;
	f[ip].x = t1tmp;
	f[ip].y = t2tmp;
	f[ip].z = t3tmp;
	}

	if (EFLAG) {
	oevdwl += sevdwl;
	if (eatom) f[i * 2].w += fwtmp;
	}
	if (NEWTON_PAIR == 0) {
	if (vflag == 1) {
	ov0 += sv0;
	ov1 += sv1;
	ov2 += sv2;
	ov3 += sv3;
	ov4 += sv4;
	ov5 += sv5;
	}
	}
	} // for i
	int o_range;
	if (NEWTON_PAIR) {
	o_range = nall;
	if (offload == 0) o_range -= minlocal;
	IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
	sizeof(FORCE_T));
	const int sto = iito * 8;
	const int fst4 = f_stride * 4;
	#if defined(_OPENMP)
	#pragma omp barrier
	#endif
	acc_t *f_scalar = &f_start[0].x;
	acc_t *f_scalar2 = f_scalar + fst4;
	for (int t = 1; t < nthreads; t++) {
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma simd
	#endif
	for (int n = iifrom * 8; n < sto; n++)
	f_scalar[n] += f_scalar2[n];
	f_scalar2 += fst4;
	}

	if (vflag==2) {
	const ATOM_T * _noalias const xo = x + minlocal;
	#if defined(LMP_SIMD_COMPILER)
	#pragma novector
	#endif
	for (int n = iifrom; n < iito; n++) {
	const int nt2 = n * 2;
	ov0 += f_start[nt2].x * xo[n].x;
	ov1 += f_start[nt2].y * xo[n].y;
	ov2 += f_start[nt2].z * xo[n].z;
	ov3 += f_start[nt2].y * xo[n].x;
	ov4 += f_start[nt2].z * xo[n].x;
	ov5 += f_start[nt2].z * xo[n].y;
	}
	}
	}

	if (ierror)
	f_start[1].w = ierror;
	} // omp

	if (EFLAG) {
	if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
	ev_global[0] = oevdwl;
	ev_global[1] = (acc_t)0.0;
	}
	if (vflag) {
	if (NEWTON_PAIR == 0) {
	ov0 *= (acc_t)-0.5;
	ov1 *= (acc_t)-0.5;
	ov2 *= (acc_t)-0.5;
	ov3 *= (acc_t)-0.5;
	ov4 *= (acc_t)-0.5;
	ov5 *= (acc_t)-0.5;
	}
	ev_global[2] = ov0;
	ev_global[3] = ov1;
	ev_global[4] = ov2;
	ev_global[5] = ov3;
	ev_global[6] = ov4;
	ev_global[7] = ov5;
	}

	#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
	timer_compute = MIC_Wtime() - timer_compute;
	#endif
	} // offload

	if (offload)
	fix->stop_watch(TIME_OFFLOAD_LATENCY);
	else
	fix->stop_watch(TIME_HOST_PAIR);

	if (EFLAG \|\| vflag)
	fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
	else
	fix->add_result_array(f_start, 0, offload, 0, 0, 2);
	}

	/* ---------------------------------------------------------------------- */

	void PairGayBerneIntel::init_style()
	{
	PairGayBerne::init_style();
	if (force->newton_pair == 0) {
	neighbor->requests[neighbor->nrequest-1]->half = 0;
	neighbor->requests[neighbor->nrequest-1]->full = 1;
	}
	neighbor->requests[neighbor->nrequest-1]->intel = 1;

	int ifix = modify->find_fix("package_intel");
	if (ifix < 0)
	error->all(FLERR,
	"The 'package intel' command is required for /intel styles");
	fix = static_cast<FixIntel *>(modify->fix[ifix]);

	fix->pair_init_check();
	#ifdef _LMP_INTEL_OFFLOAD
	if (force->newton_pair) fix->set_offload_noghost(1);
	_cop = fix->coprocessor_number();
	#endif

	if (fix->precision() == FixIntel::PREC_MODE_MIXED)
	pack_force_const(force_const_single, fix->get_mixed_buffers());
	else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
	pack_force_const(force_const_double, fix->get_double_buffers());
	else
	pack_force_const(force_const_single, fix->get_single_buffers());
	}

	/* ---------------------------------------------------------------------- */

	template <class flt_t, class acc_t>
	void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
	IntelBuffers<flt_t,acc_t> *buffers)
	{
	int tp1 = atom->ntypes + 1;
	_max_nbors = buffers->get_max_nbors();
	int mthreads = comm->nthreads;
	if (mthreads < buffers->get_off_threads())
	mthreads = buffers->get_off_threads();
	fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
	buffers->set_ntypes(tp1);
	flt_t **cutneighsq = buffers->get_cutneighsq();

	// Repeat cutsq calculation because done after call to init_style
	double cut, cutneigh;
	for (int i = 1; i <= atom->ntypes; i++) {
	for (int j = i; j <= atom->ntypes; j++) {
	if (setflag[i][j] != 0 \|\| (setflag[i][i] != 0 && setflag[j][j] != 0)) {
	cut = init_one(i,j);
	cutneigh = cut + neighbor->skin;
	cutsq[i][j] = cutsq[j][i] = cut*cut;
	cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
	}
	}
	}

	for (int i = 0; i < 4; i++) {
	fc.special_lj[i] = force->special_lj[i];
	fc.special_lj[0] = 1.0;
	}
	fc.gamma = gamma;
	fc.upsilon = upsilon;
	fc.mu = mu;

	for (int i = 0; i < tp1; i++) {
	for (int j = 0; j < tp1; j++) {
	fc.ijc[i][j].lj1 = lj1[i][j];
	fc.ijc[i][j].lj2 = lj2[i][j];
	fc.ijc[i][j].cutsq = cutsq[i][j];
	fc.ijc[i][j].offset = offset[i][j];
	fc.ijc[i][j].sigma = sigma[i][j];
	fc.ijc[i][j].epsilon = epsilon[i][j];
	fc.ijc[i][j].form = form[i][j];
	fc.ijc[i][j].lshape = lshape[i] * lshape[j];
	fc.lj34[i][j].lj3 = lj3[i][j];
	fc.lj34[i][j].lj4 = lj4[i][j];
	}
	for (int j = 0; j < 4; j++) {
	fc.ic[i].shape2[j] = shape2[i][j];
	fc.ic[i].well[j] = well[i][j];
	}
	}

	#ifdef _LMP_INTEL_OFFLOAD
	if (_cop < 0) return;
	flt_t * special_lj = fc.special_lj;
	FC_PACKED1_T *oijc = fc.ijc[0];
	FC_PACKED2_T *olj34 = fc.lj34[0];
	FC_PACKED3_T *oic = fc.ic;
	flt_t * ocutneighsq = cutneighsq[0];
	int tp1sq = tp1 * tp1;
	if (oijc != NULL && oic != NULL) {
	#pragma offload_transfer target(mic:_cop) \
	in(special_lj: length(4) alloc_if(0) free_if(0)) \
	in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
	in(oic: length(tp1) alloc_if(0) free_if(0)) \
	in(ocutneighsq: length(tp1sq))
	}
	#endif
	}

	/* ---------------------------------------------------------------------- */

	template <class flt_t>
	void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
	const int one_length,
	const int nthreads,
	Memory *memory,
	const int cop) {
	if (ntypes != _ntypes) {
	if (_ntypes > 0) {
	fc_packed3 *oic = ic;

	#ifdef _LMP_INTEL_OFFLOAD
	flt_t * ospecial_lj = special_lj;
	fc_packed1 *oijc = ijc[0];
	fc_packed2 *olj34 = lj34[0];
	flt_t * orsq_form = rsq_form[0];
	flt_t * odelx_form = delx_form[0];
	flt_t * odely_form = dely_form[0];
	flt_t * odelz_form = delz_form[0];
	int * ojtype_form = jtype_form[0];
	int * ojlist_form = jlist_form[0];

	if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
	orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
	odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
	_cop >= 0) {
	#pragma offload_transfer target(mic:_cop) \
	nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
	nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
	nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
	}
	#endif

	_memory->destroy(oic);
	_memory->destroy(ijc);
	_memory->destroy(lj34);
	_memory->destroy(rsq_form);
	_memory->destroy(delx_form);
	_memory->destroy(dely_form);
	_memory->destroy(delz_form);
	_memory->destroy(jtype_form);
	_memory->destroy(jlist_form);
	}

	if (ntypes > 0) {
	_cop = cop;
	memory->create(ijc, ntypes, ntypes, "fc.ijc");
	memory->create(lj34, ntypes, ntypes, "fc.lj34");
	memory->create(ic, ntypes, "fc.ic");
	memory->create(rsq_form, nthreads, one_length, "rsq_form");
	memory->create(delx_form, nthreads, one_length, "delx_form");
	memory->create(dely_form, nthreads, one_length, "dely_form");
	memory->create(delz_form, nthreads, one_length, "delz_form");
	memory->create(jtype_form, nthreads, one_length, "jtype_form");
	memory->create(jlist_form, nthreads, one_length, "jlist_form");

	for (int zn = 0; zn < nthreads; zn++)
	for (int zo = 0; zo < one_length; zo++) {
	rsq_form[zn][zo] = 10.0;
	delx_form[zn][zo] = 10.0;
	dely_form[zn][zo] = 10.0;
	delz_form[zn][zo] = 10.0;
	jtype_form[zn][zo] = 1;
	jlist_form[zn][zo] = 0;
	}

	#ifdef _LMP_INTEL_OFFLOAD
	flt_t * ospecial_lj = special_lj;
	fc_packed1 *oijc = ijc[0];
	fc_packed2 *olj34 = lj34[0];
	fc_packed3 *oic = ic;
	flt_t * orsq_form = rsq_form[0];
	flt_t * odelx_form = delx_form[0];
	flt_t * odely_form = dely_form[0];
	flt_t * odelz_form = delz_form[0];
	int * ojtype_form = jtype_form[0];
	int * ojlist_form = jlist_form[0];
	int off_onel = one_length * nthreads;

	int tp1sq = ntypes*ntypes;
	if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
	oic != NULL && orsq_form != NULL && odelx_form != NULL &&
	odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
	ojlist_form !=NULL && cop >= 0) {
	#pragma offload_transfer target(mic:cop) \
	nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
	nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
	nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
	in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
	in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
	in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
	in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
	in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
	in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
	}
	#endif
	}
	}
	_ntypes = ntypes;
	_memory = memory;
	}

pair_gayberne_intel.cppNo OneTemporaryActions

File Metadata

pair_gayberne_intel.cppView Options

Event Timeline

pair_gayberne_intel.cpp
No OneTemporary
Actions

pair_gayberne_intel.cpp
View Options