Page MenuHomec4science

pair_gayberne_intel.cpp
No OneTemporary

File Metadata

Created
Mon, Nov 11, 07:57

pair_gayberne_intel.cpp

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "pair_gayberne_intel.h"
#include "math_extra_intel.h"
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#include <cmath>
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include "atom.h"
#include "comm.h"
#include "atom_vec_ellipsoid.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
/* ---------------------------------------------------------------------- */
PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) :
PairGayBerne(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
}
/* ---------------------------------------------------------------------- */
void PairGayBerneIntel::compute(int eflag, int vflag)
{
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairGayBerneIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nall = atom->nlocal + atom->nghost;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
const int * const ellipsoid = atom->ellipsoid;
QUAT_T * _noalias const quat = buffers->get_quat();
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T));
if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
}
quat[nall].w = (flt_t)1.0;
quat[nall].i = (flt_t)0.0;
quat[nall].j = (flt_t)0.0;
quat[nall].k = (flt_t)0.0;
fix->stop_watch(TIME_PACK);
}
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
}
}
}
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairGayBerneIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
ATOM_T * _noalias const x = buffers->get_x(offload);
QUAT_T * _noalias const quat = buffers->get_quat(offload);
const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
const int *ellipsoid = atom->ellipsoid;
#ifdef _LMP_INTEL_OFFLOAD
if (fix->separate_buffers()) {
fix->start_watch(TIME_PACK);
if (offload) {
#pragma omp parallel default(none) \
shared(buffers,nlocal,nall,bonus,ellipsoid)
{
int ifrom, ito, tid;
int nthreads = comm->nthreads;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
nthreads, sizeof(ATOM_T));
if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
int nghost = nall - nlocal;
if (nghost) {
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
nthreads, sizeof(ATOM_T));
int offset = 0;
ifrom += nlocal;
ito += nlocal;
if (ago != 0) {
offset = fix->offload_min_ghost() - nlocal;
buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
}
for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i + offset];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
}
}
} else {
if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
for (int i = fix->host_min_local(); i < nlocal; i++) {
int qi = ellipsoid[i];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
int offset = fix->host_min_ghost() - nlocal;
if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
for (int i = nlocal; i < nall; i++) {
int qi = ellipsoid[i + offset];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
}
fix->stop_watch(TIME_PACK);
}
#endif
// const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
const FC_PACKED3_T * _noalias const ic = fc.ic;
const flt_t mu = fc.mu;
const flt_t gamma = fc.gamma;
const flt_t upsilon = fc.upsilon;
flt_t * const rsq_formi = fc.rsq_form[0];
flt_t * const delx_formi = fc.delx_form[0];
flt_t * const dely_formi = fc.dely_form[0];
flt_t * const delz_formi = fc.delz_form[0];
int * const jtype_formi = fc.jtype_form[0];
int * const jlist_formi = fc.jlist_form[0];
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int max_nbors = _max_nbors;
const int nthreads = tc;
int pad = 1;
if (offload) {
if (INTEL_MIC_NBOR_PAD > 1)
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
} else {
if (INTEL_NBOR_PAD > 1)
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
}
const int pad_width = pad;
#ifdef _LMP_INTEL_OFFLOAD
int *overflow = fix->get_off_overflow_flag();
double *timer_compute = fix->off_watch_pair();
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
#pragma offload target(mic:_cop) if(offload) \
in(special_lj:length(0) alloc_if(0) free_if(0)) \
in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(quat:length(nall+1) alloc_if(0) free_if(0)) \
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(f_start)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute=MIC_Wtime();
#endif
#ifdef _LMP_INTEL_OFFLOAD
if (separate_flag) {
if (separate_flag < 3) {
int all_local = nlocal;
int ghost_min = overflow[LMP_GHOST_MIN];
nlocal = overflow[LMP_LOCAL_MAX] + 1;
int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
if (nghost < 0) nghost = 0;
nall = nlocal + nghost;
separate_flag--;
int flength;
if (NEWTON_PAIR) flength = nall;
else flength = nlocal;
IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
separate_flag);
if (nghost) {
if (nlocal < all_local || ghost_min > all_local) {
memmove(x + nlocal, x + ghost_min,
(nall - nlocal) * sizeof(ATOM_T));
memmove(quat + nlocal, quat + ghost_min,
(nall - nlocal) * sizeof(QUAT_T));
}
}
}
x[nall].x = (flt_t)INTEL_BIGP;
x[nall].y = (flt_t)INTEL_BIGP;
x[nall].z = (flt_t)INTEL_BIGP;
quat[nall].w = (flt_t)1.0;
quat[nall].i = (flt_t)0.0;
quat[nall].j = (flt_t)0.0;
quat[nall].k = (flt_t)0.0;
}
#endif
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = (acc_t)0.0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
int ierror = 0;
for (int i = iifrom; i < iito; ++i) {
// const int i = ilist[ii];
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
if (ijci[itype].form == ELLIPSE_ELLIPSE) {
flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
ME_quat_to_mat_trans(quat[i],a1);
ME_diag_times3(ic[itype].well,a1,temp);
ME_transpose_times3(a1,temp,b1);
ME_diag_times3(ic[itype].shape2,a1,temp);
ME_transpose_times3(a1,temp,g1);
}
acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
if (EVFLAG) {
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
bool multiple_forms = false;
int packed_j = 0;
for (int jj = 0; jj < jnum; jj++) {
int jm = jlist[jj];
int j = jm & NEIGHMASK;
const int jtype = x[j].w;
if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
flt_t delx = x[j].x-xtmp;
flt_t dely = x[j].y-ytmp;
flt_t delz = x[j].z-ztmp;
flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < ijci[jtype].cutsq) {
rsq_form[packed_j] = rsq;
delx_form[packed_j] = delx;
dely_form[packed_j] = dely;
delz_form[packed_j] = delz;
jtype_form[packed_j] = jtype;
jlist_form[packed_j] = jm;
packed_j++;
}
} else
multiple_forms = true;
}
const int edge = (packed_j % pad_width);
if (edge) {
const int packed_end = packed_j + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min=1, max=15, avg=8
#endif
for ( ; packed_j < packed_end; packed_j++)
jlist_form[packed_j] = nall;
}
// -------------------------------------------------------------
#ifdef INTEL_V512
__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
__assume(packed_j % 8 == 0);
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
flt_t rtor_0, rtor_1, rtor_2;
const int sbindex = jlist_form[jj] >> SBBITS & 3;
const int j = jlist_form[jj] & NEIGHMASK;
flt_t factor_lj = special_lj[sbindex];
const int jtype = jtype_form[jj];
const flt_t sigma = ijci[jtype].sigma;
const flt_t epsilon = ijci[jtype].epsilon;
const flt_t shape2_0 = ic[jtype].shape2[0];
const flt_t shape2_1 = ic[jtype].shape2[1];
const flt_t shape2_2 = ic[jtype].shape2[2];
flt_t one_eng, evdwl;
ME_quat_to_mat_trans(quat[j], a2);
ME_diag_times3(ic[jtype].well, a2, temp);
ME_transpose_times3(a2, temp, b2);
ME_diag_times3a(shape2, a2, temp);
ME_transpose_times3(a2, temp, g2);
flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
flt_t temp1, temp2, temp3;
flt_t r12hat_0, r12hat_1, r12hat_2;
ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
flt_t r = sqrt(rsq_form[jj]);
// compute distance of closest approach
flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
ME_plus3(g1, g2, g12);
flt_t kappa_0, kappa_1, kappa_2;
ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
kappa, ierror);
// tempv = G12^-1*r12hat
flt_t inv_r = (flt_t)1.0 / r;
tempv_0 = kappa_0 * inv_r;
tempv_1 = kappa_1 * inv_r;
tempv_2 = kappa_2 * inv_r;
flt_t sigma12 = ME_dot3(r12hat, tempv);
sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
flt_t h12 = r - sigma12;
// energy
// compute u_r
flt_t varrho = sigma / (h12 + gamma * sigma);
flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
flt_t varrho12 = varrho6 * varrho6;
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
// compute eta_12
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
flt_t det_g12 = ME_det3(g12);
eta = std::pow(eta / det_g12, upsilon);
// compute chi_12
flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
flt_t iota_0, iota_1, iota_2;
ME_plus3(b1, b2, b12);
ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
iota, ierror);
// tempv = G12^-1*r12hat
tempv_0 = iota_0 * inv_r;
tempv_1 = iota_1 * inv_r;
tempv_2 = iota_2 * inv_r;
flt_t chi = ME_dot3(r12hat, tempv);
chi = std::pow(chi * (flt_t)2.0, mu);
// force
// compute dUr/dr
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
sigma;
temp1 = temp1 * (flt_t)24.0 * epsilon;
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
flt_t dUr_0, dUr_1, dUr_2;
temp2 = ME_dot3(kappa, r12hat);
flt_t uslj_rsq = u_slj / rsq_form[jj];
dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
// compute dChi_12/dr
flt_t dchi_0, dchi_1, dchi_2;
temp1 = ME_dot3(iota, r12hat);
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
std::pow(chi, (mu - (flt_t)1.0) / mu);
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
temp1 = -eta * u_r;
temp2 = eta * chi;
fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
// torque for particle 1 and 2
// compute dUr
tempv_0 = -uslj_rsq * kappa_0;
tempv_1 = -uslj_rsq * kappa_1;
tempv_2 = -uslj_rsq * kappa_2;
ME_vecmat(kappa, g1, tempv2);
ME_cross3(tempv, tempv2, dUr);
flt_t dUr2_0, dUr2_1, dUr2_2;
if (NEWTON_PAIR || j < nlocal) {
ME_vecmat(kappa, g2, tempv2);
ME_cross3(tempv, tempv2, dUr2);
}
// compute d_chi
ME_vecmat(iota, b1, tempv);
ME_cross3(tempv, iota, dchi);
temp1 = (flt_t)-4.0 / rsq_form[jj];
dchi_0 *= temp1;
dchi_1 *= temp1;
dchi_2 *= temp1;
flt_t dchi2_0, dchi2_1, dchi2_2;
if (NEWTON_PAIR || j < nlocal) {
ME_vecmat(iota, b2, tempv);
ME_cross3(tempv, iota, dchi2);
dchi2_0 *= temp1;
dchi2_1 *= temp1;
dchi2_2 *= temp1;
}
// compute d_eta
flt_t deta_0, deta_1, deta_2;
deta_0 = deta_1 = deta_2 = (flt_t)0.0;
ME_compute_eta_torque(g12, a1, shape2, temp);
temp1 = -eta * upsilon;
tempv_0 = temp1 * temp_0;
tempv_1 = temp1 * temp_1;
tempv_2 = temp1 * temp_2;
ME_mv0_cross3(a1, tempv, tempv2);
deta_0 += tempv2_0;
deta_1 += tempv2_1;
deta_2 += tempv2_2;
tempv_0 = temp1 * temp_3;
tempv_1 = temp1 * temp_4;
tempv_2 = temp1 * temp_5;
ME_mv1_cross3(a1, tempv, tempv2);
deta_0 += tempv2_0;
deta_1 += tempv2_1;
deta_2 += tempv2_2;
tempv_0 = temp1 * temp_6;
tempv_1 = temp1 * temp_7;
tempv_2 = temp1 * temp_8;
ME_mv2_cross3(a1, tempv, tempv2);
deta_0 += tempv2_0;
deta_1 += tempv2_1;
deta_2 += tempv2_2;
// compute d_eta for particle 2
flt_t deta2_0, deta2_1, deta2_2;
if (NEWTON_PAIR || j < nlocal) {
deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
ME_compute_eta_torque(g12, a2, shape2, temp);
tempv_0 = temp1 * temp_0;
tempv_1 = temp1 * temp_1;
tempv_2 = temp1 * temp_2;
ME_mv0_cross3(a2, tempv, tempv2);
deta2_0 += tempv2_0;
deta2_1 += tempv2_1;
deta2_2 += tempv2_2;
tempv_0 = temp1 * temp_3;
tempv_1 = temp1 * temp_4;
tempv_2 = temp1 * temp_5;
ME_mv1_cross3(a2, tempv, tempv2);
deta2_0 += tempv2_0;
deta2_1 += tempv2_1;
deta2_2 += tempv2_2;
tempv_0 = temp1 * temp_6;
tempv_1 = temp1 * temp_7;
tempv_2 = temp1 * temp_8;
ME_mv2_cross3(a2, tempv, tempv2);
deta2_0 += tempv2_0;
deta2_1 += tempv2_1;
deta2_2 += tempv2_2;
}
// torque
temp1 = u_r * eta;
temp2 = u_r * chi;
temp3 = chi * eta;
ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
(flt_t)-1.0;
ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
(flt_t)-1.0;
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
(flt_t)-1.0;
if (NEWTON_PAIR || j < nlocal) {
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
(flt_t)-1.0;
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
(flt_t)-1.0;
rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
(flt_t)-1.0;
}
one_eng = temp1 * chi;
#ifndef INTEL_VMASK
if (jlist_form[jj] == nall) {
one_eng = (flt_t)0.0;
fforce_0 = 0.0;
fforce_1 = 0.0;
fforce_2 = 0.0;
ttor_0 = 0.0;
ttor_1 = 0.0;
ttor_2 = 0.0;
rtor_0 = 0.0;
rtor_1 = 0.0;
rtor_2 = 0.0;
}
#endif
fforce_0 *= factor_lj;
fforce_1 *= factor_lj;
fforce_2 *= factor_lj;
ttor_0 *= factor_lj;
ttor_1 *= factor_lj;
ttor_2 *= factor_lj;
#ifdef INTEL_VMASK
if (jlist_form[jj] < nall) {
#endif
fxtmp += fforce_0;
fytmp += fforce_1;
fztmp += fforce_2;
t1tmp += ttor_0;
t2tmp += ttor_1;
t3tmp += ttor_2;
if (NEWTON_PAIR || j < nlocal) {
rtor_0 *= factor_lj;
rtor_1 *= factor_lj;
rtor_2 *= factor_lj;
int jp = j * 2;
f[jp].x -= fforce_0;
f[jp].y -= fforce_1;
f[jp].z -= fforce_2;
jp++;
f[jp].x += rtor_0;
f[jp].y += rtor_1;
f[jp].z += rtor_2;
}
if (EVFLAG) {
flt_t ev_pre = (flt_t)0.0;
if (NEWTON_PAIR || i < nlocal)
ev_pre += (flt_t)0.5;
if (NEWTON_PAIR || j < nlocal)
ev_pre += (flt_t)0.5;
if (EFLAG) {
evdwl = factor_lj * one_eng;
sevdwl += ev_pre * evdwl;
if (eatom) {
if (NEWTON_PAIR || i < nlocal)
fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR || j < nlocal)
f[j*2].w += (flt_t)0.5 * evdwl;
}
}
if (vflag == 1) {
ev_pre *= (flt_t)-1.0;
sv0 += ev_pre * delx_form[jj] * fforce_0;
sv1 += ev_pre * dely_form[jj] * fforce_1;
sv2 += ev_pre * delz_form[jj] * fforce_2;
sv3 += ev_pre * delx_form[jj] * fforce_1;
sv4 += ev_pre * delx_form[jj] * fforce_2;
sv5 += ev_pre * dely_form[jj] * fforce_2;
}
} // EVFLAG
#ifdef INTEL_VMASK
}
#endif
} // for jj
// -------------------------------------------------------------
if (multiple_forms)
ierror = 2;
int ip = i * 2;
f[ip].x += fxtmp;
f[ip].y += fytmp;
f[ip].z += fztmp;
ip++;
f[ip].x += t1tmp;
f[ip].y += t2tmp;
f[ip].z += t3tmp;
if (EVFLAG) {
if (EFLAG) {
if (eatom) f[i * 2].w += fwtmp;
oevdwl += sevdwl;
}
if (vflag == 1) {
ov0 += sv0;
ov1 += sv1;
ov2 += sv2;
ov3 += sv3;
ov4 += sv4;
ov5 += sv5;
}
}
} // for i
int o_range;
if (NEWTON_PAIR)
o_range = nall;
else
o_range = nlocal;
if (offload == 0) o_range -= minlocal;
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
sizeof(FORCE_T));
const int two_iito = iito * 2;
acc_t *facc = &(f_start[0].x);
const int sto = two_iito * 4;
const int fst4 = f_stride * 4;
#if defined(_OPENMP)
#pragma omp barrier
#endif
int t_off = f_stride;
if (EFLAG && eatom) {
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom * 2; n < two_iito; n++) {
f_start[n].x += f_start[n + t_off].x;
f_start[n].y += f_start[n + t_off].y;
f_start[n].z += f_start[n + t_off].z;
f_start[n].w += f_start[n + t_off].w;
}
t_off += f_stride;
}
} else {
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom * 2; n < two_iito; n++) {
f_start[n].x += f_start[n + t_off].x;
f_start[n].y += f_start[n + t_off].y;
f_start[n].z += f_start[n + t_off].z;
}
t_off += f_stride;
}
}
if (EVFLAG) {
if (vflag==2) {
const ATOM_T * _noalias const xo = x + minlocal;
#if defined(LMP_SIMD_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom; n < iito; n++) {
const int nt2 = n * 2;
ov0 += f_start[nt2].x * xo[n].x;
ov1 += f_start[nt2].y * xo[n].y;
ov2 += f_start[nt2].z * xo[n].z;
ov3 += f_start[nt2].y * xo[n].x;
ov4 += f_start[nt2].z * xo[n].x;
ov5 += f_start[nt2].z * xo[n].y;
}
}
}
if (ierror)
f_start[1].w = ierror;
} // omp
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EVFLAG)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
else
fix->add_result_array(f_start, 0, offload, 0, 0, 2);
}
/* ---------------------------------------------------------------------- */
void PairGayBerneIntel::init_style()
{
PairGayBerne::init_style();
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (force->newton_pair) fix->set_offload_noghost(1);
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
int tp1 = atom->ntypes + 1;
_max_nbors = buffers->get_max_nbors();
int mthreads = comm->nthreads;
if (mthreads < buffers->get_off_threads())
mthreads = buffers->get_off_threads();
fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
fc.gamma = gamma;
fc.upsilon = upsilon;
fc.mu = mu;
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.ijc[i][j].lj1 = lj1[i][j];
fc.ijc[i][j].lj2 = lj2[i][j];
fc.ijc[i][j].cutsq = cutsq[i][j];
fc.ijc[i][j].offset = offset[i][j];
fc.ijc[i][j].sigma = sigma[i][j];
fc.ijc[i][j].epsilon = epsilon[i][j];
fc.ijc[i][j].form = form[i][j];
fc.ijc[i][j].lshape = lshape[i] * lshape[j];
fc.lj34[i][j].lj3 = lj3[i][j];
fc.lj34[i][j].lj4 = lj4[i][j];
}
for (int j = 0; j < 4; j++) {
fc.ic[i].shape2[j] = shape2[i][j];
fc.ic[i].well[j] = well[i][j];
}
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
flt_t * special_lj = fc.special_lj;
FC_PACKED1_T *oijc = fc.ijc[0];
FC_PACKED2_T *olj34 = fc.lj34[0];
FC_PACKED3_T *oic = fc.ic;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
if (oijc != NULL && oic != NULL) {
#pragma offload_transfer target(mic:_cop) \
in(special_lj: length(4) alloc_if(0) free_if(0)) \
in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
in(oic: length(tp1) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq))
}
#endif
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int one_length,
const int nthreads,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
fc_packed3 *oic = ic;
#ifdef _LMP_INTEL_OFFLOAD
flt_t * ospecial_lj = special_lj;
fc_packed1 *oijc = ijc[0];
fc_packed2 *olj34 = lj34[0];
flt_t * orsq_form = rsq_form[0];
flt_t * odelx_form = delx_form[0];
flt_t * odely_form = dely_form[0];
flt_t * odelz_form = delz_form[0];
int * ojtype_form = jtype_form[0];
int * ojlist_form = jlist_form[0];
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
_cop >= 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
}
#endif
_memory->destroy(oic);
_memory->destroy(ijc);
_memory->destroy(lj34);
_memory->destroy(rsq_form);
_memory->destroy(delx_form);
_memory->destroy(dely_form);
_memory->destroy(delz_form);
_memory->destroy(jtype_form);
_memory->destroy(jlist_form);
}
if (ntypes > 0) {
_cop = cop;
memory->create(ijc, ntypes, ntypes, "fc.ijc");
memory->create(lj34, ntypes, ntypes, "fc.lj34");
memory->create(ic, ntypes, "fc.ic");
memory->create(rsq_form, nthreads, one_length, "rsq_form");
memory->create(delx_form, nthreads, one_length, "delx_form");
memory->create(dely_form, nthreads, one_length, "dely_form");
memory->create(delz_form, nthreads, one_length, "delz_form");
memory->create(jtype_form, nthreads, one_length, "jtype_form");
memory->create(jlist_form, nthreads, one_length, "jlist_form");
for (int zn = 0; zn < nthreads; zn++)
for (int zo = 0; zo < one_length; zo++) {
rsq_form[zn][zo] = 10.0;
delx_form[zn][zo] = 10.0;
dely_form[zn][zo] = 10.0;
delz_form[zn][zo] = 10.0;
jtype_form[zn][zo] = 1;
jlist_form[zn][zo] = 0;
}
#ifdef _LMP_INTEL_OFFLOAD
flt_t * ospecial_lj = special_lj;
fc_packed1 *oijc = ijc[0];
fc_packed2 *olj34 = lj34[0];
fc_packed3 *oic = ic;
flt_t * orsq_form = rsq_form[0];
flt_t * odelx_form = delx_form[0];
flt_t * odely_form = dely_form[0];
flt_t * odelz_form = delz_form[0];
int * ojtype_form = jtype_form[0];
int * ojlist_form = jlist_form[0];
int off_onel = one_length * nthreads;
int tp1sq = ntypes*ntypes;
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
oic != NULL && orsq_form != NULL && odelx_form != NULL &&
odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
ojlist_form !=NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
}
#endif
}
}
_ntypes = ntypes;
_memory = memory;
}

Event Timeline