Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91458656
pair_gayberne_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Nov 11, 07:57
Size
35 KB
Mime Type
text/x-c++
Expires
Wed, Nov 13, 07:57 (2 d)
Engine
blob
Format
Raw Data
Handle
22266070
Attached To
rLAMMPS lammps
pair_gayberne_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "pair_gayberne_intel.h"
#include "math_extra_intel.h"
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
#include <cmath>
#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(pop)
#endif
#include "atom.h"
#include "comm.h"
#include "atom_vec_ellipsoid.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
/* ---------------------------------------------------------------------- */
PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) :
PairGayBerne(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
}
/* ---------------------------------------------------------------------- */
void PairGayBerneIntel::compute(int eflag, int vflag)
{
if (fix->precision()==FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairGayBerneIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nall = atom->nlocal + atom->nghost;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
const int * const ellipsoid = atom->ellipsoid;
QUAT_T * _noalias const quat = buffers->get_quat();
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
sizeof(ATOM_T));
if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
}
quat[nall].w = (flt_t)1.0;
quat[nall].i = (flt_t)0.0;
quat[nall].j = (flt_t)0.0;
quat[nall].k = (flt_t)0.0;
fix->stop_watch(TIME_PACK);
}
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
}
}
}
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairGayBerneIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
ATOM_T * _noalias const x = buffers->get_x(offload);
QUAT_T * _noalias const quat = buffers->get_quat(offload);
const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
const int *ellipsoid = atom->ellipsoid;
#ifdef _LMP_INTEL_OFFLOAD
if (fix->separate_buffers()) {
fix->start_watch(TIME_PACK);
if (offload) {
#pragma omp parallel default(none) \
shared(buffers,nlocal,nall,bonus,ellipsoid)
{
int ifrom, ito, tid;
int nthreads = comm->nthreads;
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
nthreads, sizeof(ATOM_T));
if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
int nghost = nall - nlocal;
if (nghost) {
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
nthreads, sizeof(ATOM_T));
int offset = 0;
ifrom += nlocal;
ito += nlocal;
if (ago != 0) {
offset = fix->offload_min_ghost() - nlocal;
buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
}
for (int i = ifrom; i < ito; i++) {
int qi = ellipsoid[i + offset];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
}
}
} else {
if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
for (int i = fix->host_min_local(); i < nlocal; i++) {
int qi = ellipsoid[i];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
int offset = fix->host_min_ghost() - nlocal;
if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
for (int i = nlocal; i < nall; i++) {
int qi = ellipsoid[i + offset];
if (qi > -1) {
quat[i].w = bonus[qi].quat[0];
quat[i].i = bonus[qi].quat[1];
quat[i].j = bonus[qi].quat[2];
quat[i].k = bonus[qi].quat[3];
}
}
}
fix->stop_watch(TIME_PACK);
}
#endif
// const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
const FC_PACKED3_T * _noalias const ic = fc.ic;
const flt_t mu = fc.mu;
const flt_t gamma = fc.gamma;
const flt_t upsilon = fc.upsilon;
flt_t * const rsq_formi = fc.rsq_form[0];
flt_t * const delx_formi = fc.delx_form[0];
flt_t * const dely_formi = fc.dely_form[0];
flt_t * const delz_formi = fc.delz_form[0];
int * const jtype_formi = fc.jtype_form[0];
int * const jlist_formi = fc.jlist_form[0];
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int max_nbors = _max_nbors;
const int nthreads = tc;
int pad = 1;
if (offload) {
if (INTEL_MIC_NBOR_PAD > 1)
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
} else {
if (INTEL_NBOR_PAD > 1)
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
}
const int pad_width = pad;
#ifdef _LMP_INTEL_OFFLOAD
int *overflow = fix->get_off_overflow_flag();
double *timer_compute = fix->off_watch_pair();
if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
#pragma offload target(mic:_cop) if(offload) \
in(special_lj:length(0) alloc_if(0) free_if(0)) \
in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(quat:length(nall+1) alloc_if(0) free_if(0)) \
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(f_start)
#endif
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute=MIC_Wtime();
#endif
#ifdef _LMP_INTEL_OFFLOAD
if (separate_flag) {
if (separate_flag < 3) {
int all_local = nlocal;
int ghost_min = overflow[LMP_GHOST_MIN];
nlocal = overflow[LMP_LOCAL_MAX] + 1;
int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
if (nghost < 0) nghost = 0;
nall = nlocal + nghost;
separate_flag--;
int flength;
if (NEWTON_PAIR) flength = nall;
else flength = nlocal;
IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
separate_flag);
if (nghost) {
if (nlocal < all_local || ghost_min > all_local) {
memmove(x + nlocal, x + ghost_min,
(nall - nlocal) * sizeof(ATOM_T));
memmove(quat + nlocal, quat + ghost_min,
(nall - nlocal) * sizeof(QUAT_T));
}
}
}
x[nall].x = (flt_t)INTEL_BIGP;
x[nall].y = (flt_t)INTEL_BIGP;
x[nall].z = (flt_t)INTEL_BIGP;
quat[nall].w = (flt_t)1.0;
quat[nall].i = (flt_t)0.0;
quat[nall].j = (flt_t)0.0;
quat[nall].k = (flt_t)0.0;
}
#endif
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = (acc_t)0.0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
int ierror = 0;
for (int i = iifrom; i < iito; ++i) {
// const int i = ilist[ii];
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
if (ijci[itype].form == ELLIPSE_ELLIPSE) {
flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
ME_quat_to_mat_trans(quat[i],a1);
ME_diag_times3(ic[itype].well,a1,temp);
ME_transpose_times3(a1,temp,b1);
ME_diag_times3(ic[itype].shape2,a1,temp);
ME_transpose_times3(a1,temp,g1);
}
acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
if (EVFLAG) {
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
bool multiple_forms = false;
int packed_j = 0;
for (int jj = 0; jj < jnum; jj++) {
int jm = jlist[jj];
int j = jm & NEIGHMASK;
const int jtype = x[j].w;
if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
flt_t delx = x[j].x-xtmp;
flt_t dely = x[j].y-ytmp;
flt_t delz = x[j].z-ztmp;
flt_t rsq = delx * delx + dely * dely + delz * delz;
if (rsq < ijci[jtype].cutsq) {
rsq_form[packed_j] = rsq;
delx_form[packed_j] = delx;
dely_form[packed_j] = dely;
delz_form[packed_j] = delz;
jtype_form[packed_j] = jtype;
jlist_form[packed_j] = jm;
packed_j++;
}
} else
multiple_forms = true;
}
const int edge = (packed_j % pad_width);
if (edge) {
const int packed_end = packed_j + (pad_width - edge);
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min=1, max=15, avg=8
#endif
for ( ; packed_j < packed_end; packed_j++)
jlist_form[packed_j] = nall;
}
// -------------------------------------------------------------
#ifdef INTEL_V512
__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
__assume(packed_j % 8 == 0);
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
flt_t rtor_0, rtor_1, rtor_2;
const int sbindex = jlist_form[jj] >> SBBITS & 3;
const int j = jlist_form[jj] & NEIGHMASK;
flt_t factor_lj = special_lj[sbindex];
const int jtype = jtype_form[jj];
const flt_t sigma = ijci[jtype].sigma;
const flt_t epsilon = ijci[jtype].epsilon;
const flt_t shape2_0 = ic[jtype].shape2[0];
const flt_t shape2_1 = ic[jtype].shape2[1];
const flt_t shape2_2 = ic[jtype].shape2[2];
flt_t one_eng, evdwl;
ME_quat_to_mat_trans(quat[j], a2);
ME_diag_times3(ic[jtype].well, a2, temp);
ME_transpose_times3(a2, temp, b2);
ME_diag_times3a(shape2, a2, temp);
ME_transpose_times3(a2, temp, g2);
flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
flt_t temp1, temp2, temp3;
flt_t r12hat_0, r12hat_1, r12hat_2;
ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
flt_t r = sqrt(rsq_form[jj]);
// compute distance of closest approach
flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
ME_plus3(g1, g2, g12);
flt_t kappa_0, kappa_1, kappa_2;
ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
kappa, ierror);
// tempv = G12^-1*r12hat
flt_t inv_r = (flt_t)1.0 / r;
tempv_0 = kappa_0 * inv_r;
tempv_1 = kappa_1 * inv_r;
tempv_2 = kappa_2 * inv_r;
flt_t sigma12 = ME_dot3(r12hat, tempv);
sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
flt_t h12 = r - sigma12;
// energy
// compute u_r
flt_t varrho = sigma / (h12 + gamma * sigma);
flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
flt_t varrho12 = varrho6 * varrho6;
flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
// compute eta_12
flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
flt_t det_g12 = ME_det3(g12);
eta = std::pow(eta / det_g12, upsilon);
// compute chi_12
flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
flt_t iota_0, iota_1, iota_2;
ME_plus3(b1, b2, b12);
ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
iota, ierror);
// tempv = G12^-1*r12hat
tempv_0 = iota_0 * inv_r;
tempv_1 = iota_1 * inv_r;
tempv_2 = iota_2 * inv_r;
flt_t chi = ME_dot3(r12hat, tempv);
chi = std::pow(chi * (flt_t)2.0, mu);
// force
// compute dUr/dr
temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
sigma;
temp1 = temp1 * (flt_t)24.0 * epsilon;
flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
flt_t dUr_0, dUr_1, dUr_2;
temp2 = ME_dot3(kappa, r12hat);
flt_t uslj_rsq = u_slj / rsq_form[jj];
dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
// compute dChi_12/dr
flt_t dchi_0, dchi_1, dchi_2;
temp1 = ME_dot3(iota, r12hat);
temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
std::pow(chi, (mu - (flt_t)1.0) / mu);
dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
temp1 = -eta * u_r;
temp2 = eta * chi;
fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
// torque for particle 1 and 2
// compute dUr
tempv_0 = -uslj_rsq * kappa_0;
tempv_1 = -uslj_rsq * kappa_1;
tempv_2 = -uslj_rsq * kappa_2;
ME_vecmat(kappa, g1, tempv2);
ME_cross3(tempv, tempv2, dUr);
flt_t dUr2_0, dUr2_1, dUr2_2;
if (NEWTON_PAIR || j < nlocal) {
ME_vecmat(kappa, g2, tempv2);
ME_cross3(tempv, tempv2, dUr2);
}
// compute d_chi
ME_vecmat(iota, b1, tempv);
ME_cross3(tempv, iota, dchi);
temp1 = (flt_t)-4.0 / rsq_form[jj];
dchi_0 *= temp1;
dchi_1 *= temp1;
dchi_2 *= temp1;
flt_t dchi2_0, dchi2_1, dchi2_2;
if (NEWTON_PAIR || j < nlocal) {
ME_vecmat(iota, b2, tempv);
ME_cross3(tempv, iota, dchi2);
dchi2_0 *= temp1;
dchi2_1 *= temp1;
dchi2_2 *= temp1;
}
// compute d_eta
flt_t deta_0, deta_1, deta_2;
deta_0 = deta_1 = deta_2 = (flt_t)0.0;
ME_compute_eta_torque(g12, a1, shape2, temp);
temp1 = -eta * upsilon;
tempv_0 = temp1 * temp_0;
tempv_1 = temp1 * temp_1;
tempv_2 = temp1 * temp_2;
ME_mv0_cross3(a1, tempv, tempv2);
deta_0 += tempv2_0;
deta_1 += tempv2_1;
deta_2 += tempv2_2;
tempv_0 = temp1 * temp_3;
tempv_1 = temp1 * temp_4;
tempv_2 = temp1 * temp_5;
ME_mv1_cross3(a1, tempv, tempv2);
deta_0 += tempv2_0;
deta_1 += tempv2_1;
deta_2 += tempv2_2;
tempv_0 = temp1 * temp_6;
tempv_1 = temp1 * temp_7;
tempv_2 = temp1 * temp_8;
ME_mv2_cross3(a1, tempv, tempv2);
deta_0 += tempv2_0;
deta_1 += tempv2_1;
deta_2 += tempv2_2;
// compute d_eta for particle 2
flt_t deta2_0, deta2_1, deta2_2;
if (NEWTON_PAIR || j < nlocal) {
deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
ME_compute_eta_torque(g12, a2, shape2, temp);
tempv_0 = temp1 * temp_0;
tempv_1 = temp1 * temp_1;
tempv_2 = temp1 * temp_2;
ME_mv0_cross3(a2, tempv, tempv2);
deta2_0 += tempv2_0;
deta2_1 += tempv2_1;
deta2_2 += tempv2_2;
tempv_0 = temp1 * temp_3;
tempv_1 = temp1 * temp_4;
tempv_2 = temp1 * temp_5;
ME_mv1_cross3(a2, tempv, tempv2);
deta2_0 += tempv2_0;
deta2_1 += tempv2_1;
deta2_2 += tempv2_2;
tempv_0 = temp1 * temp_6;
tempv_1 = temp1 * temp_7;
tempv_2 = temp1 * temp_8;
ME_mv2_cross3(a2, tempv, tempv2);
deta2_0 += tempv2_0;
deta2_1 += tempv2_1;
deta2_2 += tempv2_2;
}
// torque
temp1 = u_r * eta;
temp2 = u_r * chi;
temp3 = chi * eta;
ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
(flt_t)-1.0;
ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
(flt_t)-1.0;
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
(flt_t)-1.0;
if (NEWTON_PAIR || j < nlocal) {
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
(flt_t)-1.0;
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
(flt_t)-1.0;
rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
(flt_t)-1.0;
}
one_eng = temp1 * chi;
#ifndef INTEL_VMASK
if (jlist_form[jj] == nall) {
one_eng = (flt_t)0.0;
fforce_0 = 0.0;
fforce_1 = 0.0;
fforce_2 = 0.0;
ttor_0 = 0.0;
ttor_1 = 0.0;
ttor_2 = 0.0;
rtor_0 = 0.0;
rtor_1 = 0.0;
rtor_2 = 0.0;
}
#endif
fforce_0 *= factor_lj;
fforce_1 *= factor_lj;
fforce_2 *= factor_lj;
ttor_0 *= factor_lj;
ttor_1 *= factor_lj;
ttor_2 *= factor_lj;
#ifdef INTEL_VMASK
if (jlist_form[jj] < nall) {
#endif
fxtmp += fforce_0;
fytmp += fforce_1;
fztmp += fforce_2;
t1tmp += ttor_0;
t2tmp += ttor_1;
t3tmp += ttor_2;
if (NEWTON_PAIR || j < nlocal) {
rtor_0 *= factor_lj;
rtor_1 *= factor_lj;
rtor_2 *= factor_lj;
int jp = j * 2;
f[jp].x -= fforce_0;
f[jp].y -= fforce_1;
f[jp].z -= fforce_2;
jp++;
f[jp].x += rtor_0;
f[jp].y += rtor_1;
f[jp].z += rtor_2;
}
if (EVFLAG) {
flt_t ev_pre = (flt_t)0.0;
if (NEWTON_PAIR || i < nlocal)
ev_pre += (flt_t)0.5;
if (NEWTON_PAIR || j < nlocal)
ev_pre += (flt_t)0.5;
if (EFLAG) {
evdwl = factor_lj * one_eng;
sevdwl += ev_pre * evdwl;
if (eatom) {
if (NEWTON_PAIR || i < nlocal)
fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR || j < nlocal)
f[j*2].w += (flt_t)0.5 * evdwl;
}
}
if (vflag == 1) {
ev_pre *= (flt_t)-1.0;
sv0 += ev_pre * delx_form[jj] * fforce_0;
sv1 += ev_pre * dely_form[jj] * fforce_1;
sv2 += ev_pre * delz_form[jj] * fforce_2;
sv3 += ev_pre * delx_form[jj] * fforce_1;
sv4 += ev_pre * delx_form[jj] * fforce_2;
sv5 += ev_pre * dely_form[jj] * fforce_2;
}
} // EVFLAG
#ifdef INTEL_VMASK
}
#endif
} // for jj
// -------------------------------------------------------------
if (multiple_forms)
ierror = 2;
int ip = i * 2;
f[ip].x += fxtmp;
f[ip].y += fytmp;
f[ip].z += fztmp;
ip++;
f[ip].x += t1tmp;
f[ip].y += t2tmp;
f[ip].z += t3tmp;
if (EVFLAG) {
if (EFLAG) {
if (eatom) f[i * 2].w += fwtmp;
oevdwl += sevdwl;
}
if (vflag == 1) {
ov0 += sv0;
ov1 += sv1;
ov2 += sv2;
ov3 += sv3;
ov4 += sv4;
ov5 += sv5;
}
}
} // for i
int o_range;
if (NEWTON_PAIR)
o_range = nall;
else
o_range = nlocal;
if (offload == 0) o_range -= minlocal;
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
sizeof(FORCE_T));
const int two_iito = iito * 2;
acc_t *facc = &(f_start[0].x);
const int sto = two_iito * 4;
const int fst4 = f_stride * 4;
#if defined(_OPENMP)
#pragma omp barrier
#endif
int t_off = f_stride;
if (EFLAG && eatom) {
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom * 2; n < two_iito; n++) {
f_start[n].x += f_start[n + t_off].x;
f_start[n].y += f_start[n + t_off].y;
f_start[n].z += f_start[n + t_off].z;
f_start[n].w += f_start[n + t_off].w;
}
t_off += f_stride;
}
} else {
for (int t = 1; t < nthreads; t++) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom * 2; n < two_iito; n++) {
f_start[n].x += f_start[n + t_off].x;
f_start[n].y += f_start[n + t_off].y;
f_start[n].z += f_start[n + t_off].z;
}
t_off += f_stride;
}
}
if (EVFLAG) {
if (vflag==2) {
const ATOM_T * _noalias const xo = x + minlocal;
#if defined(LMP_SIMD_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom; n < iito; n++) {
const int nt2 = n * 2;
ov0 += f_start[nt2].x * xo[n].x;
ov1 += f_start[nt2].y * xo[n].y;
ov2 += f_start[nt2].z * xo[n].z;
ov3 += f_start[nt2].y * xo[n].x;
ov4 += f_start[nt2].z * xo[n].x;
ov5 += f_start[nt2].z * xo[n].y;
}
}
}
if (ierror)
f_start[1].w = ierror;
} // omp
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EVFLAG)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
else
fix->add_result_array(f_start, 0, offload, 0, 0, 2);
}
/* ---------------------------------------------------------------------- */
void PairGayBerneIntel::init_style()
{
PairGayBerne::init_style();
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (force->newton_pair) fix->set_offload_noghost(1);
_cop = fix->coprocessor_number();
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
int tp1 = atom->ntypes + 1;
_max_nbors = buffers->get_max_nbors();
int mthreads = comm->nthreads;
if (mthreads < buffers->get_off_threads())
mthreads = buffers->get_off_threads();
fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
fc.gamma = gamma;
fc.upsilon = upsilon;
fc.mu = mu;
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.ijc[i][j].lj1 = lj1[i][j];
fc.ijc[i][j].lj2 = lj2[i][j];
fc.ijc[i][j].cutsq = cutsq[i][j];
fc.ijc[i][j].offset = offset[i][j];
fc.ijc[i][j].sigma = sigma[i][j];
fc.ijc[i][j].epsilon = epsilon[i][j];
fc.ijc[i][j].form = form[i][j];
fc.ijc[i][j].lshape = lshape[i] * lshape[j];
fc.lj34[i][j].lj3 = lj3[i][j];
fc.lj34[i][j].lj4 = lj4[i][j];
}
for (int j = 0; j < 4; j++) {
fc.ic[i].shape2[j] = shape2[i][j];
fc.ic[i].well[j] = well[i][j];
}
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
flt_t * special_lj = fc.special_lj;
FC_PACKED1_T *oijc = fc.ijc[0];
FC_PACKED2_T *olj34 = fc.lj34[0];
FC_PACKED3_T *oic = fc.ic;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
if (oijc != NULL && oic != NULL) {
#pragma offload_transfer target(mic:_cop) \
in(special_lj: length(4) alloc_if(0) free_if(0)) \
in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
in(oic: length(tp1) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq))
}
#endif
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int one_length,
const int nthreads,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
fc_packed3 *oic = ic;
#ifdef _LMP_INTEL_OFFLOAD
flt_t * ospecial_lj = special_lj;
fc_packed1 *oijc = ijc[0];
fc_packed2 *olj34 = lj34[0];
flt_t * orsq_form = rsq_form[0];
flt_t * odelx_form = delx_form[0];
flt_t * odely_form = dely_form[0];
flt_t * odelz_form = delz_form[0];
int * ojtype_form = jtype_form[0];
int * ojlist_form = jlist_form[0];
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
_cop >= 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
}
#endif
_memory->destroy(oic);
_memory->destroy(ijc);
_memory->destroy(lj34);
_memory->destroy(rsq_form);
_memory->destroy(delx_form);
_memory->destroy(dely_form);
_memory->destroy(delz_form);
_memory->destroy(jtype_form);
_memory->destroy(jlist_form);
}
if (ntypes > 0) {
_cop = cop;
memory->create(ijc, ntypes, ntypes, "fc.ijc");
memory->create(lj34, ntypes, ntypes, "fc.lj34");
memory->create(ic, ntypes, "fc.ic");
memory->create(rsq_form, nthreads, one_length, "rsq_form");
memory->create(delx_form, nthreads, one_length, "delx_form");
memory->create(dely_form, nthreads, one_length, "dely_form");
memory->create(delz_form, nthreads, one_length, "delz_form");
memory->create(jtype_form, nthreads, one_length, "jtype_form");
memory->create(jlist_form, nthreads, one_length, "jlist_form");
for (int zn = 0; zn < nthreads; zn++)
for (int zo = 0; zo < one_length; zo++) {
rsq_form[zn][zo] = 10.0;
delx_form[zn][zo] = 10.0;
dely_form[zn][zo] = 10.0;
delz_form[zn][zo] = 10.0;
jtype_form[zn][zo] = 1;
jlist_form[zn][zo] = 0;
}
#ifdef _LMP_INTEL_OFFLOAD
flt_t * ospecial_lj = special_lj;
fc_packed1 *oijc = ijc[0];
fc_packed2 *olj34 = lj34[0];
fc_packed3 *oic = ic;
flt_t * orsq_form = rsq_form[0];
flt_t * odelx_form = delx_form[0];
flt_t * odely_form = dely_form[0];
flt_t * odelz_form = delz_form[0];
int * ojtype_form = jtype_form[0];
int * ojlist_form = jlist_form[0];
int off_onel = one_length * nthreads;
int tp1sq = ntypes*ntypes;
if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
oic != NULL && orsq_form != NULL && odelx_form != NULL &&
odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
ojlist_form !=NULL && cop >= 0) {
#pragma offload_transfer target(mic:cop) \
nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
}
#endif
}
}
_ntypes = ntypes;
_memory = memory;
}
Event Timeline
Log In to Comment