Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F88628986
pair_lj_cut_intel.cpp
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Oct 19, 20:38
Size
14 KB
Mime Type
text/x-c++
Expires
Mon, Oct 21, 20:38 (2 d)
Engine
blob
Format
Raw Data
Handle
21800647
Attached To
rLAMMPS lammps
pair_lj_cut_intel.cpp
View Options
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <math.h>
#include "pair_lj_cut_intel.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
/* ---------------------------------------------------------------------- */
PairLJCutIntel::PairLJCutIntel(LAMMPS *lmp) :
PairLJCut(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
cut_respa = NULL;
}
/* ---------------------------------------------------------------------- */
void PairLJCutIntel::compute(int eflag, int vflag)
{
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairLJCutIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
nthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
if (_onetype) {
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (force->newton_pair) {
eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
} else {
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
}
}
} else {
if (evflag || vflag_fdotr) {
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (eflag) {
if (force->newton_pair) {
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (force->newton_pair) {
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
} else {
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
}
}
}
}
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
class acc_t>
void PairLJCutIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
}
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,nlocal,nall,minlocal) \
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
if (ONETYPE) {
cutsq = ljc12o[3].cutsq;
lj1 = ljc12o[3].lj1;
lj2 = ljc12o[3].lj2;
lj3 = lj34[3].lj3;
lj4 = lj34[3].lj4;
offset = ljc12o[3].offset;
}
for (int i = iifrom; i < iito; ++i) {
int itype, ptr_off;
const FC_PACKED1_T * _noalias ljc12oi;
const FC_PACKED2_T * _noalias lj34i;
if (!ONETYPE) {
itype = x[i].w;
ptr_off = itype * ntypes;
ljc12oi = ljc12o + ptr_off;
lj34i = lj34 + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
fxtmp = fytmp = fztmp = (acc_t)0;
if (EVFLAG) {
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;
int j, jtype, sbindex;
if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
cutsq = ljc12oi[jtype].cutsq;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;
#ifdef INTEL_VMASK
if (rsq < cutsq) {
#endif
flt_t factor_lj;
if (!ONETYPE) factor_lj = special_lj[sbindex];
flt_t r2inv = 1.0 / rsq;
flt_t r6inv = r2inv * r2inv * r2inv;
#ifndef INTEL_VMASK
if (rsq > cutsq) r6inv = (flt_t)0.0;
#endif
if (!ONETYPE) {
lj1 = ljc12oi[jtype].lj1;
lj2 = ljc12oi[jtype].lj2;
}
forcelj = r6inv * (lj1 * r6inv - lj2);
flt_t fpair;
if (!ONETYPE)
fpair = factor_lj * forcelj * r2inv;
else
fpair = forcelj * r2inv;
fxtmp += delx * fpair;
fytmp += dely * fpair;
fztmp += delz * fpair;
if (NEWTON_PAIR || j < nlocal) {
f[j].x -= delx * fpair;
f[j].y -= dely * fpair;
f[j].z -= delz * fpair;
}
if (EVFLAG) {
flt_t ev_pre = (flt_t)0;
if (NEWTON_PAIR || i<nlocal)
ev_pre += (flt_t)0.5;
if (NEWTON_PAIR || j<nlocal)
ev_pre += (flt_t)0.5;
if (EFLAG) {
if (!ONETYPE) {
lj3 = lj34i[jtype].lj3;
lj4 = lj34i[jtype].lj4;
offset = ljc12oi[jtype].offset;
}
evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
if (!ONETYPE) evdwl *= factor_lj;
sevdwl += ev_pre*evdwl;
if (eatom) {
if (NEWTON_PAIR || i < nlocal)
fwtmp += 0.5 * evdwl;
if (NEWTON_PAIR || j < nlocal)
f[j].w += 0.5 * evdwl;
}
}
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
delx, dely, delz);
}
#ifdef INTEL_VMASK
} // if rsq
#endif
} // for jj
f[i].x += fxtmp;
f[i].y += fytmp;
f[i].z += fztmp;
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
} // for ii
#ifndef _LMP_INTEL_OFFLOAD
if (vflag == 2)
#endif
{
#if defined(_OPENMP)
#pragma omp barrier
#endif
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
nlocal, minlocal, nthreads, f_start, f_stride,
x, offload);
}
} // end omp
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EVFLAG)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ---------------------------------------------------------------------- */
void PairLJCutIntel::init_style()
{
PairLJCut::init_style();
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (fix->offload_balance() != 0.0)
error->all(FLERR,
"Offload for lj/cut/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.ljc12o[i][j].lj1 = lj1[i][j];
fc.ljc12o[i][j].lj2 = lj2[i][j];
fc.lj34[i][j].lj3 = lj3[i][j];
fc.lj34[i][j].lj4 = lj4[i][j];
fc.ljc12o[i][j].cutsq = cutsq[i][j];
fc.ljc12o[i][j].offset = offset[i][j];
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
fc_packed1 *oljc12o = ljc12o[0];
fc_packed2 *olj34 = lj34[0];
_memory->destroy(oljc12o);
_memory->destroy(olj34);
}
if (ntypes > 0) {
_cop = cop;
memory->create(ljc12o,ntypes,ntypes,"fc.c12o");
memory->create(lj34,ntypes,ntypes,"fc.lj34");
}
}
_ntypes = ntypes;
_memory = memory;
}
Event Timeline
Log In to Comment