pppm_disp_omp.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Oct 9, 10:01

pppm_disp_omp.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing author: Axel Kohlmeyer (Temple U), Rolf Isele-Holder (RWTH Aachen University)
	------------------------------------------------------------------------- */

	#include "pppm_disp_omp.h"
	#include "atom.h"
	#include "comm.h"
	#include "domain.h"
	#include "force.h"
	#include "memory.h"
	#include "math_const.h"

	#include <string.h>
	#include <math.h>

	#include "suffix.h"
	using namespace LAMMPS_NS;
	using namespace MathConst;

	#ifdef FFT_SINGLE
	#define ZEROF 0.0f
	#define ONEF 1.0f
	#else
	#define ZEROF 0.0
	#define ONEF 1.0
	#endif

	#define OFFSET 16384


	/* ---------------------------------------------------------------------- */

	PPPMDispOMP::PPPMDispOMP(LAMMPS lmp, int narg, char *arg) :
	PPPMDisp(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
	{
	triclinic_support = 0;
	suffix_flag \|= Suffix::OMP;
	}

	/* ----------------------------------------------------------------------
	allocate memory that depends on # of K-vectors and order
	------------------------------------------------------------------------- */

	void PPPMDispOMP::allocate()
	{
	PPPMDisp::allocate();

	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	const int tid = omp_get_thread_num();
	#else
	const int tid = 0;
	#endif

	if (function[0]) {
	ThrData *thr = fix->get_thr(tid);
	thr->init_pppm(order,memory);
	}
	if (function[1] + function[2]) {
	ThrData * thr = fix->get_thr(tid);
	thr->init_pppm_disp(order_6,memory);
	}
	}
	}

	/* ----------------------------------------------------------------------
	free memory that depends on # of K-vectors and order
	------------------------------------------------------------------------- */

	void PPPMDispOMP::deallocate()
	{
	PPPMDisp::deallocate();

	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	const int tid = omp_get_thread_num();
	#else
	const int tid = 0;
	#endif
	if (function[0]) {
	ThrData * thr = fix->get_thr(tid);
	thr->init_pppm(-order,memory);
	}
	if (function[1] + function[2]) {
	ThrData * thr = fix->get_thr(tid);
	thr->init_pppm_disp(-order_6,memory);
	}
	}
	}

	/* ----------------------------------------------------------------------
	Compute the modified (hockney-eastwood) coulomb green function
	------------------------------------------------------------------------- */

	void PPPMDispOMP::compute_gf()
	{
	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{

	double *prd;
	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double xprd = prd[0];
	double yprd = prd[1];
	double zprd = prd[2];
	double zprd_slab = zprd*slab_volfactor;

	double unitkx = (2.0*MY_PI/xprd);
	double unitky = (2.0*MY_PI/yprd);
	double unitkz = (2.0*MY_PI/zprd_slab);

	int tid,nn,nnfrom,nnto,k,l,m;
	int kper,lper,mper;
	double snx,sny,snz,snx2,sny2,snz2;
	double sqk;
	double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
	double numerator,denominator;

	const int nnx = nxhi_fft-nxlo_fft+1;
	const int nny = nyhi_fft-nylo_fft+1;

	loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads);

	for (m = nzlo_fft; m <= nzhi_fft; m++) {
	mper = m - nz_pppm(2m/nz_pppm);
	qz = unitkz*mper;
	snz = sin(0.5qzzprd_slab/nz_pppm);
	snz2 = snz*snz;
	sz = exp(-0.25*pow(qz/g_ewald,2.0));
	wz = 1.0;
	argz = 0.5qzzprd_slab/nz_pppm;
	if (argz != 0.0) wz = pow(sin(argz)/argz,order);
	wz *= wz;

	for (l = nylo_fft; l <= nyhi_fft; l++) {
	lper = l - ny_pppm(2l/ny_pppm);
	qy = unitky*lper;
	sny = sin(0.5qyyprd/ny_pppm);
	sny2 = sny*sny;
	sy = exp(-0.25*pow(qy/g_ewald,2.0));
	wy = 1.0;
	argy = 0.5qyyprd/ny_pppm;
	if (argy != 0.0) wy = pow(sin(argy)/argy,order);
	wy *= wy;

	for (k = nxlo_fft; k <= nxhi_fft; k++) {

	/* only compute the part designated to this thread */
	nn = k-nxlo_fft + nnx(l-nylo_fft + nny(m-nzlo_fft));
	if ((nn < nnfrom) \|\| (nn >=nnto)) continue;

	kper = k - nx_pppm(2k/nx_pppm);
	qx = unitkx*kper;
	snx = sin(0.5qxxprd/nx_pppm);
	snx2 = snx*snx;
	sx = exp(-0.25*pow(qx/g_ewald,2.0));
	wx = 1.0;
	argx = 0.5qxxprd/nx_pppm;
	if (argx != 0.0) wx = pow(sin(argx)/argx,order);
	wx *= wx;

	sqk = pow(qx,2.0) + pow(qy,2.0) + pow(qz,2.0);

	if (sqk != 0.0) {
	numerator = 4.0*MY_PI/sqk;
	denominator = gf_denom(snx2,sny2,snz2, gf_b, order);
	greensfn[nn] = numeratorsxsyszwxwywz/denominator;
	} else greensfn[nn] = 0.0;
	}
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	Compyute the modified (hockney-eastwood) dispersion green function
	------------------------------------------------------------------------- */

	void PPPMDispOMP::compute_gf_6()
	{
	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{
	double *prd;
	int k,l,m,nn;

	// volume-dependent factors
	// adjust z dimension for 2d slab PPPM
	// z dimension for 3d PPPM is zprd since slab_volfactor = 1.0

	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double xprd = prd[0];
	double yprd = prd[1];
	double zprd = prd[2];
	double zprd_slab = zprd*slab_volfactor;

	double unitkx = (2.0*MY_PI/xprd);
	double unitky = (2.0*MY_PI/yprd);
	double unitkz = (2.0*MY_PI/zprd_slab);

	int kper,lper,mper;
	double sqk;
	double snx,sny,snz,snx2,sny2,snz2;
	double argx,argy,argz,wx,wy,wz,sx,sy,sz;
	double qx,qy,qz;
	double rtsqk, term;
	double numerator,denominator;
	double inv2ew = 2*g_ewald_6;
	inv2ew = 1/inv2ew;
	double rtpi = sqrt(MY_PI);
	int nnfrom, nnto, tid;

	numerator = -MY_PIrtpig_ewald_6g_ewald_6g_ewald_6/(3.0);

	const int nnx = nxhi_fft_6-nxlo_fft_6+1;
	const int nny = nyhi_fft_6-nylo_fft_6+1;

	loop_setup_thr(nnfrom, nnto, tid, nfft_6, comm->nthreads);

	for (m = nzlo_fft_6; m <= nzhi_fft_6; m++) {
	mper = m - nz_pppm_6(2m/nz_pppm_6);
	qz = unitkz*mper;
	snz = sin(0.5unitkzmper*zprd_slab/nz_pppm_6);
	snz2 = snz*snz;
	sz = exp(-qzqzinv2ew*inv2ew);
	wz = 1.0;
	argz = 0.5qzzprd_slab/nz_pppm_6;
	if (argz != 0.0) wz = pow(sin(argz)/argz,order_6);
	wz *= wz;

	for (l = nylo_fft_6; l <= nyhi_fft_6; l++) {
	lper = l - ny_pppm_6(2l/ny_pppm_6);
	qy = unitky*lper;
	sny = sin(0.5unitkylper*yprd/ny_pppm_6);
	sny2 = sny*sny;
	sy = exp(-qyqyinv2ew*inv2ew);
	wy = 1.0;
	argy = 0.5qyyprd/ny_pppm_6;
	if (argy != 0.0) wy = pow(sin(argy)/argy,order_6);
	wy *= wy;

	for (k = nxlo_fft_6; k <= nxhi_fft_6; k++) {

	/* only compute the part designated to this thread */
	nn = k-nxlo_fft_6 + nnx(l-nylo_fft_6 + nny(m-nzlo_fft_6));
	if ((nn < nnfrom) \|\| (nn >=nnto)) continue;

	kper = k - nx_pppm_6(2k/nx_pppm_6);
	qx = unitkx*kper;
	snx = sin(0.5unitkxkper*xprd/nx_pppm_6);
	snx2 = snx*snx;
	sx = exp(-qxqxinv2ew*inv2ew);
	wx = 1.0;
	argx = 0.5qxxprd/nx_pppm_6;
	if (argx != 0.0) wx = pow(sin(argx)/argx,order_6);
	wx *= wx;

	sqk = pow(qx,2.0) + pow(qy,2.0) + pow(qz,2.0);

	if (sqk != 0.0) {
	denominator = gf_denom(snx2,sny2,snz2, gf_b_6, order_6);
	rtsqk = sqrt(sqk);
	term = (1-2sqkinv2ewinv2ew)sxsysz +
	2sqkrtsqkinv2ewinv2ewinv2ewrtpierfc(rtsqkinv2ew);
	greensfn_6[nn] = numeratortermwxwywz/denominator;
	} else greensfn_6[nn] = 0.0;
	}
	}
	}
	}
	}
	/* ----------------------------------------------------------------------
	run the regular toplevel compute method from plain PPPPM
	which will have individual methods replaced by our threaded
	versions and then call the obligatory force reduction.
	------------------------------------------------------------------------- */

	void PPPMDispOMP::compute(int eflag, int vflag)
	{

	PPPMDisp::compute(eflag,vflag);
	#if defined(_OPENMP)
	#pragma omp parallel default(none) shared(eflag,vflag)
	#endif
	{
	#if defined(_OPENMP)
	const int tid = omp_get_thread_num();
	#else
	const int tid = 0;
	#endif

	ThrData *thr = fix->get_thr(tid);
	reduce_thr(this, eflag, vflag, thr);
	} // end of omp parallel region
	}


	/* ----------------------------------------------------------------------
	find center grid pt for each of my particles
	check that full stencil for the particle will fit in my 3d brick
	store central grid pt indices in part2grid array
	------------------------------------------------------------------------- */

	void PPPMDispOMP::particle_map(double dxinv, double dyinv,
	double dzinv, double sft,
	int ** part2grid, int nup,
	int nlw, int nxlo_o,
	int nylo_o, int nzlo_o,
	int nxhi_o, int nyhi_o,
	int nzhi_o)
	{
	const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
	int3_t * _noalias const p2g = (int3_t *) part2grid[0];
	const double boxlox = boxlo[0];
	const double boxloy = boxlo[1];
	const double boxloz = boxlo[2];
	const int nlocal = atom->nlocal;

	const double delxinv = dxinv;
	const double delyinv = dyinv;
	const double delzinv = dzinv;
	const double shift = sft;
	const int nupper = nup;
	const int nlower = nlw;
	const int nxlo_out = nxlo_o;
	const int nylo_out = nylo_o;
	const int nzlo_out = nzlo_o;
	const int nxhi_out = nxhi_o;
	const int nyhi_out = nyhi_o;
	const int nzhi_out = nzhi_o;

	if (!isfinite(boxlo[0]) \|\| !isfinite(boxlo[1]) \|\| !isfinite(boxlo[2]))
	error->one(FLERR,"Non-numeric box dimensions. Simulation unstable.");

	int i, flag = 0;
	#if defined(_OPENMP)
	#pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static)
	#endif
	for (i = 0; i < nlocal; i++) {

	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// current particle coord can be outside global and local box
	// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

	const int nx = static_cast<int> ((x[i].x-boxlox)*delxinv+shift) - OFFSET;
	const int ny = static_cast<int> ((x[i].y-boxloy)*delyinv+shift) - OFFSET;
	const int nz = static_cast<int> ((x[i].z-boxloz)*delzinv+shift) - OFFSET;

	p2g[i].a = nx;
	p2g[i].b = ny;
	p2g[i].t = nz;

	// check that entire stencil around nx,ny,nz will fit in my 3d brick

	if (nx+nlower < nxlo_out \|\| nx+nupper > nxhi_out \|\|
	ny+nlower < nylo_out \|\| ny+nupper > nyhi_out \|\|
	nz+nlower < nzlo_out \|\| nz+nupper > nzhi_out)
	flag++;
	}

	int flag_all;
	MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
	if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM");
	}

	/* ----------------------------------------------------------------------
	create discretized "density" on section of global grid due to my particles
	density(x,y,z) = charge "density" at grid points of my 3d brick
	(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
	in global grid
	------------------------------------------------------------------------- */

	void PPPMDispOMP::make_rho_c()
	{

	// clear 3d density array

	FFT_SCALAR * _noalias const d = &(density_brick[nzlo_out][nylo_out][nxlo_out]);
	memset(d,0,ngrid*sizeof(FFT_SCALAR));

	// no local atoms => nothing else to do

	const int nlocal = atom->nlocal;
	if (nlocal == 0) return;

	const int ix = nxhi_out - nxlo_out + 1;
	const int iy = nyhi_out - nylo_out + 1;

	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{
	const double * _noalias const q = atom->q;
	const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
	const int3_t * _noalias const p2g = (int3_t *) part2grid[0];

	const double boxlox = boxlo[0];
	const double boxloy = boxlo[1];
	const double boxloz = boxlo[2];

	// determine range of grid points handled by this thread
	int i,jfrom,jto,tid;
	loop_setup_thr(jfrom,jto,tid,ngrid,comm->nthreads);

	// get per thread data
	ThrData *thr = fix->get_thr(tid);
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt

	// loop over all local atoms for all threads
	for (i = 0; i < nlocal; i++) {

	const int nx = p2g[i].a;
	const int ny = p2g[i].b;
	const int nz = p2g[i].t;

	// pre-screen whether this atom will ever come within
	// reach of the data segement this thread is updating.
	if ( ((nz+nlower-nzlo_out)ixiy >= jto)
	\|\| ((nz+nupper-nzlo_out+1)ixiy < jfrom) ) continue;

	const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
	const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
	const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;

	compute_rho1d_thr(r1d,dx,dy,dz,order,rho_coeff);

	const FFT_SCALAR z0 = delvolinv * q[i];

	for (int n = nlower; n <= nupper; ++n) {
	const int jn = (nz+n-nzlo_out)ixiy;
	const FFT_SCALAR y0 = z0*r1d[2][n];

	for (int m = nlower; m <= nupper; ++m) {
	const int jm = jn+(ny+m-nylo_out)*ix;
	const FFT_SCALAR x0 = y0*r1d[1][m];

	for (int l = nlower; l <= nupper; ++l) {
	const int jl = jm+nx+l-nxlo_out;
	// make sure each thread only updates
	// "his" elements of the density grid
	if (jl >= jto) break;
	if (jl < jfrom) continue;

	d[jl] += x0*r1d[0][l];
	}
	}
	}
	}
	}
	}


	/* ----------------------------------------------------------------------
	same as above for dispersion interaction with geometric mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::make_rho_g()
	{

	// clear 3d density array

	FFT_SCALAR * _noalias const d = &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	memset(d,0,ngrid_6*sizeof(FFT_SCALAR));

	// no local atoms => nothing else to do

	const int nlocal = atom->nlocal;
	if (nlocal == 0) return;

	const int ix = nxhi_out_6 - nxlo_out_6 + 1;
	const int iy = nyhi_out_6 - nylo_out_6 + 1;

	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{
	const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
	const int3_t * _noalias const p2g = (int3_t *) part2grid_6[0];

	const double boxlox = boxlo[0];
	const double boxloy = boxlo[1];
	const double boxloz = boxlo[2];

	// determine range of grid points handled by this thread
	int i,jfrom,jto,tid;
	loop_setup_thr(jfrom,jto,tid,ngrid_6,comm->nthreads);

	// get per thread data
	ThrData *thr = fix->get_thr(tid);
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt

	// loop over all local atoms for all threads
	for (i = 0; i < nlocal; i++) {

	const int nx = p2g[i].a;
	const int ny = p2g[i].b;
	const int nz = p2g[i].t;

	// pre-screen whether this atom will ever come within
	// reach of the data segement this thread is updating.
	if ( ((nz+nlower_6-nzlo_out_6)ixiy >= jto)
	\|\| ((nz+nupper_6-nzlo_out_6+1)ixiy < jfrom) ) continue;

	const FFT_SCALAR dx = nx+shiftone_6 - (x[i].x-boxlox)*delxinv_6;
	const FFT_SCALAR dy = ny+shiftone_6 - (x[i].y-boxloy)*delyinv_6;
	const FFT_SCALAR dz = nz+shiftone_6 - (x[i].z-boxloz)*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz,order_6,rho_coeff_6);

	const int type = atom->type[i];
	const double lj = B[type];
	const FFT_SCALAR z0 = delvolinv_6 * lj;

	for (int n = nlower_6; n <= nupper_6; ++n) {
	const int jn = (nz+n-nzlo_out_6)ixiy;
	const FFT_SCALAR y0 = z0*r1d[2][n];

	for (int m = nlower_6; m <= nupper_6; ++m) {
	const int jm = jn+(ny+m-nylo_out_6)*ix;
	const FFT_SCALAR x0 = y0*r1d[1][m];

	for (int l = nlower_6; l <= nupper_6; ++l) {
	const int jl = jm+nx+l-nxlo_out_6;
	// make sure each thread only updates
	// "his" elements of the density grid
	if (jl >= jto) break;
	if (jl < jfrom) continue;

	d[jl] += x0*r1d[0][l];
	}
	}
	}
	}
	}
	}


	/* ----------------------------------------------------------------------
	same as above for dispersion interaction with arithmetic mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::make_rho_a()
	{

	// clear 3d density array

	FFT_SCALAR * _noalias const d0 = &(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	FFT_SCALAR * _noalias const d1 = &(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	FFT_SCALAR * _noalias const d2 = &(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	FFT_SCALAR * _noalias const d3 = &(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	FFT_SCALAR * _noalias const d4 = &(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	FFT_SCALAR * _noalias const d5 = &(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]);
	FFT_SCALAR * _noalias const d6 = &(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]);

	memset(d0,0,ngrid_6*sizeof(FFT_SCALAR));
	memset(d1,0,ngrid_6*sizeof(FFT_SCALAR));
	memset(d2,0,ngrid_6*sizeof(FFT_SCALAR));
	memset(d3,0,ngrid_6*sizeof(FFT_SCALAR));
	memset(d4,0,ngrid_6*sizeof(FFT_SCALAR));
	memset(d5,0,ngrid_6*sizeof(FFT_SCALAR));
	memset(d6,0,ngrid_6*sizeof(FFT_SCALAR));

	// no local atoms => nothing else to do

	const int nlocal = atom->nlocal;
	if (nlocal == 0) return;

	const int ix = nxhi_out_6 - nxlo_out_6 + 1;
	const int iy = nyhi_out_6 - nylo_out_6 + 1;

	#if defined(_OPENMP)
	#pragma omp parallel default(none)
	#endif
	{
	const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
	const int3_t * _noalias const p2g = (int3_t *) part2grid_6[0];

	const double boxlox = boxlo[0];
	const double boxloy = boxlo[1];
	const double boxloz = boxlo[2];

	// determine range of grid points handled by this thread
	int i,jfrom,jto,tid;
	loop_setup_thr(jfrom,jto,tid,ngrid_6,comm->nthreads);

	// get per thread data
	ThrData *thr = fix->get_thr(tid);
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());

	// loop over my charges, add their contribution to nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt

	// loop over all local atoms for all threads
	for (i = 0; i < nlocal; i++) {

	const int nx = p2g[i].a;
	const int ny = p2g[i].b;
	const int nz = p2g[i].t;

	// pre-screen whether this atom will ever come within
	// reach of the data segement this thread is updating.
	if ( ((nz+nlower_6-nzlo_out_6)ixiy >= jto)
	\|\| ((nz+nupper_6-nzlo_out_6+1)ixiy < jfrom) ) continue;

	const FFT_SCALAR dx = nx+shiftone_6 - (x[i].x-boxlox)*delxinv_6;
	const FFT_SCALAR dy = ny+shiftone_6 - (x[i].y-boxloy)*delyinv_6;
	const FFT_SCALAR dz = nz+shiftone_6 - (x[i].z-boxloz)*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz,order_6,rho_coeff_6);

	const int type = atom->type[i];
	const double lj0 = B[7*type];
	const double lj1 = B[7*type+1];
	const double lj2 = B[7*type+2];
	const double lj3 = B[7*type+3];
	const double lj4 = B[7*type+4];
	const double lj5 = B[7*type+5];
	const double lj6 = B[7*type+6];

	const FFT_SCALAR z0 = delvolinv_6;

	for (int n = nlower_6; n <= nupper_6; ++n) {
	const int jn = (nz+n-nzlo_out_6)ixiy;
	const FFT_SCALAR y0 = z0*r1d[2][n];

	for (int m = nlower_6; m <= nupper_6; ++m) {
	const int jm = jn+(ny+m-nylo_out_6)*ix;
	const FFT_SCALAR x0 = y0*r1d[1][m];

	for (int l = nlower_6; l <= nupper_6; ++l) {
	const int jl = jm+nx+l-nxlo_out_6;
	// make sure each thread only updates
	// "his" elements of the density grid
	if (jl >= jto) break;
	if (jl < jfrom) continue;

	const double w = x0*r1d[0][l];

	d0[jl] += w*lj0;
	d1[jl] += w*lj1;
	d2[jl] += w*lj2;
	d3[jl] += w*lj3;
	d4[jl] += w*lj4;
	d5[jl] += w*lj5;
	d6[jl] += w*lj6;
	}
	}
	}
	}
	}
	}


	/* ----------------------------------------------------------------------
	interpolate from grid to get electric field & force on my particles
	for ik scheme
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_c_ik()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	const double * const q = atom->q;
	const double * const * const x = atom->x;
	const double qqrd2e = force->qqrd2e;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	double * const * const f = thr->get_f();
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR ekx,eky,ekz;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid[i][0];
	ny = part2grid[i][1];
	nz = part2grid[i][2];
	dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
	dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
	dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

	compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff);

	ekx = eky = ekz = ZEROF;
	for (n = nlower; n <= nupper; n++) {
	mz = n+nz;
	z0 = r1d[2][n];
	for (m = nlower; m <= nupper; m++) {
	my = m+ny;
	y0 = z0*r1d[1][m];
	for (l = nlower; l <= nupper; l++) {
	mx = l+nx;
	x0 = y0*r1d[0][l];
	ekx -= x0*vdx_brick[mz][my][mx];
	eky -= x0*vdy_brick[mz][my][mx];
	ekz -= x0*vdz_brick[mz][my][mx];
	}
	}
	}

	// convert E-field to force
	const double qfactor = qqrd2escaleq[i];
	f[i][0] += qfactor*ekx;
	f[i][1] += qfactor*eky;
	f[i][2] += qfactor*ekz;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get electric field & force on my particles
	for ad scheme
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_c_ad()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	const double * const q = atom->q;
	const double * const * const x = atom->x;
	const double qqrd2e = force->qqrd2e;
	//const double * const sf_c = sf_coeff;

	double *prd;

	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double xprd = prd[0];
	double yprd = prd[1];
	double zprd = prd[2];
	double zprd_slab = zprd*slab_volfactor;

	const double hx_inv = nx_pppm/xprd;
	const double hy_inv = ny_pppm/yprd;
	const double hz_inv = nz_pppm/zprd_slab;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	double * const * const f = thr->get_f();
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
	FFT_SCALAR * const * const dr1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz;
	FFT_SCALAR ekx,eky,ekz;
	double sf = 0.0;
	double s1,s2,s3;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid[i][0];
	ny = part2grid[i][1];
	nz = part2grid[i][2];
	dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
	dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
	dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

	compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff);
	compute_drho1d_thr(dr1d,dx,dy,dz, order, drho_coeff);

	ekx = eky = ekz = ZEROF;
	for (n = nlower; n <= nupper; n++) {
	mz = n+nz;
	for (m = nlower; m <= nupper; m++) {
	my = m+ny;
	for (l = nlower; l <= nupper; l++) {
	mx = l+nx;
	ekx += dr1d[0][l]r1d[1][m]r1d[2][n]*u_brick[mz][my][mx];
	eky += r1d[0][l]dr1d[1][m]r1d[2][n]*u_brick[mz][my][mx];
	ekz += r1d[0][l]r1d[1][m]dr1d[2][n]*u_brick[mz][my][mx];
	}
	}
	}
	ekx *= hx_inv;
	eky *= hy_inv;
	ekz *= hz_inv;

	// convert E-field to force
	const double qfactor = qqrd2e*scale;

	s1 = x[i][0]*hx_inv;
	s2 = x[i][1]*hy_inv;
	s3 = x[i][2]*hz_inv;
	sf = sf_coeff[0]sin(2MY_PI*s1);
	sf += sf_coeff[1]sin(4MY_PI*s1);
	sf = 2q[i]*q[i];
	f[i][0] += qfactor(ekxq[i] - sf);

	sf = sf_coeff[2]sin(2MY_PI*s2);
	sf += sf_coeff[3]sin(4MY_PI*s2);
	sf = 2q[i]*q[i];
	f[i][1] += qfactor(ekyq[i] - sf);


	sf = sf_coeff[4]sin(2MY_PI*s3);
	sf += sf_coeff[5]sin(4MY_PI*s3);
	sf = 2q[i]*q[i];
	if (slabflag != 2) f[i][2] += qfactor(ekzq[i] - sf);
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get per-atom energy/virial
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_c_peratom()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	const double * const q = atom->q;
	const double * const * const x = atom->x;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR u,v0,v1,v2,v3,v4,v5;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid[i][0];
	ny = part2grid[i][1];
	nz = part2grid[i][2];
	dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
	dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
	dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

	compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff);

	u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
	for (n = nlower; n <= nupper; n++) {
	mz = n+nz;
	z0 = r1d[2][n];
	for (m = nlower; m <= nupper; m++) {
	my = m+ny;
	y0 = z0*r1d[1][m];
	for (l = nlower; l <= nupper; l++) {
	mx = l+nx;
	x0 = y0*r1d[0][l];
	if (eflag_atom) u += x0*u_brick[mz][my][mx];
	if (vflag_atom) {
	v0 += x0*v0_brick[mz][my][mx];
	v1 += x0*v1_brick[mz][my][mx];
	v2 += x0*v2_brick[mz][my][mx];
	v3 += x0*v3_brick[mz][my][mx];
	v4 += x0*v4_brick[mz][my][mx];
	v5 += x0*v5_brick[mz][my][mx];
	}
	}
	}
	}

	const double qfactor = 0.5force->qqrd2e scale * q[i];

	if (eflag_atom) eatom[i] += u*qfactor;
	if (vflag_atom) {
	vatom[i][0] += v0*qfactor;
	vatom[i][1] += v1*qfactor;
	vatom[i][2] += v2*qfactor;
	vatom[i][3] += v3*qfactor;
	vatom[i][4] += v4*qfactor;
	vatom[i][5] += v5*qfactor;
	}
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for ik scheme and geometric mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_g_ik()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	const double * const * const x = atom->x;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	double * const * const f = thr->get_f();
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR ekx,eky,ekz;
	int type;
	double lj;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid_6[i][0];
	ny = part2grid_6[i][1];
	nz = part2grid_6[i][2];
	dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6;
	dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6;
	dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6);

	ekx = eky = ekz = ZEROF;
	for (n = nlower_6; n <= nupper_6; n++) {
	mz = n+nz;
	z0 = r1d[2][n];
	for (m = nlower_6; m <= nupper_6; m++) {
	my = m+ny;
	y0 = z0*r1d[1][m];
	for (l = nlower_6; l <= nupper_6; l++) {
	mx = l+nx;
	x0 = y0*r1d[0][l];
	ekx -= x0*vdx_brick_g[mz][my][mx];
	eky -= x0*vdy_brick_g[mz][my][mx];
	ekz -= x0*vdz_brick_g[mz][my][mx];
	}
	}
	}

	// convert E-field to force
	type = atom->type[i];
	lj = B[type];
	f[i][0] += lj*ekx;
	f[i][1] += lj*eky;
	f[i][2] += lj*ekz;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for ad scheme and geometric mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_g_ad()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	const double * const * const x = atom->x;
	double *prd;

	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double xprd = prd[0];
	double yprd = prd[1];
	double zprd = prd[2];
	double zprd_slab = zprd*slab_volfactor;

	const double hx_inv = nx_pppm_6/xprd;
	const double hy_inv = ny_pppm_6/yprd;
	const double hz_inv = nz_pppm_6/zprd_slab;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	double * const * const f = thr->get_f();
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());
	FFT_SCALAR * const * const dr1d = static_cast<FFT_SCALAR **>(thr->get_drho1d_6());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz;
	FFT_SCALAR ekx,eky,ekz;
	int type;
	double lj;
	double sf = 0.0;
	double s1,s2,s3;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid_6[i][0];
	ny = part2grid_6[i][1];
	nz = part2grid_6[i][2];
	dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6;
	dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6;
	dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6);
	compute_drho1d_thr(dr1d,dx,dy,dz, order_6, drho_coeff_6);

	ekx = eky = ekz = ZEROF;
	for (n = nlower_6; n <= nupper_6; n++) {
	mz = n+nz;
	for (m = nlower_6; m <= nupper_6; m++) {
	my = m+ny;
	for (l = nlower_6; l <= nupper_6; l++) {
	mx = l+nx;
	ekx += dr1d[0][l]r1d[1][m]r1d[2][n]*u_brick_g[mz][my][mx];
	eky += r1d[0][l]dr1d[1][m]r1d[2][n]*u_brick_g[mz][my][mx];
	ekz += r1d[0][l]r1d[1][m]dr1d[2][n]*u_brick_g[mz][my][mx];
	}
	}
	}
	ekx *= hx_inv;
	eky *= hy_inv;
	ekz *= hz_inv;

	// convert E-field to force
	type = atom->type[i];
	lj = B[type];

	s1 = x[i][0]*hx_inv;
	s2 = x[i][1]*hy_inv;
	s3 = x[i][2]*hz_inv;

	sf = sf_coeff_6[0]sin(2MY_PI*s1);
	sf += sf_coeff_6[1]sin(4MY_PI*s1);
	sf = 2lj*lj;
	f[i][0] += ekx*lj - sf;

	sf = sf_coeff_6[2]sin(2MY_PI*s2);
	sf += sf_coeff_6[3]sin(4MY_PI*s2);
	sf = 2lj*lj;
	f[i][1] += eky*lj - sf;

	sf = sf_coeff_6[4]sin(2MY_PI*s3);
	sf += sf_coeff_6[5]sin(4MY_PI*s3);
	sf = 2lj*lj;
	if (slabflag != 2) f[i][2] += ekz*lj - sf;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get per-atom energy/virial for dispersion
	interaction and geometric mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_g_peratom()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	const double * const * const x = atom->x;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
	int type;
	double lj;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid_6[i][0];
	ny = part2grid_6[i][1];
	nz = part2grid_6[i][2];
	dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6;
	dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6;
	dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6);

	u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
	for (n = nlower_6; n <= nupper_6; n++) {
	mz = n+nz;
	z0 = r1d[2][n];
	for (m = nlower_6; m <= nupper_6; m++) {
	my = m+ny;
	y0 = z0*r1d[1][m];
	for (l = nlower_6; l <= nupper_6; l++) {
	mx = l+nx;
	x0 = y0*r1d[0][l];
	if (eflag_atom) u += x0*u_brick_g[mz][my][mx];
	if (vflag_atom) {
	v0 += x0*v0_brick_g[mz][my][mx];
	v1 += x0*v1_brick_g[mz][my][mx];
	v2 += x0*v2_brick_g[mz][my][mx];
	v3 += x0*v3_brick_g[mz][my][mx];
	v4 += x0*v4_brick_g[mz][my][mx];
	v5 += x0*v5_brick_g[mz][my][mx];
	}
	}
	}
	}

	type = atom->type[i];
	lj = B[type]*0.5;

	if (eflag_atom) eatom[i] += u*lj;
	if (vflag_atom) {
	vatom[i][0] += v0*lj;
	vatom[i][1] += v1*lj;
	vatom[i][2] += v2*lj;
	vatom[i][3] += v3*lj;
	vatom[i][4] += v4*lj;
	vatom[i][5] += v5*lj;
	}
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for ik scheme and arithmetic mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_a_ik()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	const double * const * const x = atom->x;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	double * const * const f = thr->get_f();
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
	FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
	FFT_SCALAR ekx6, eky6, ekz6;
	int type;
	double lj0,lj1,lj2,lj3,lj4,lj5,lj6;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid_6[i][0];
	ny = part2grid_6[i][1];
	nz = part2grid_6[i][2];
	dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6;
	dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6;
	dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6);

	ekx0 = eky0 = ekz0 = ZEROF;
	ekx1 = eky1 = ekz1 = ZEROF;
	ekx2 = eky2 = ekz2 = ZEROF;
	ekx3 = eky3 = ekz3 = ZEROF;
	ekx4 = eky4 = ekz4 = ZEROF;
	ekx5 = eky5 = ekz5 = ZEROF;
	ekx6 = eky6 = ekz6 = ZEROF;
	for (n = nlower_6; n <= nupper_6; n++) {
	mz = n+nz;
	z0 = r1d[2][n];
	for (m = nlower_6; m <= nupper_6; m++) {
	my = m+ny;
	y0 = z0*r1d[1][m];
	for (l = nlower_6; l <= nupper_6; l++) {
	mx = l+nx;
	x0 = y0*r1d[0][l];
	ekx0 -= x0*vdx_brick_a0[mz][my][mx];
	eky0 -= x0*vdy_brick_a0[mz][my][mx];
	ekz0 -= x0*vdz_brick_a0[mz][my][mx];
	ekx1 -= x0*vdx_brick_a1[mz][my][mx];
	eky1 -= x0*vdy_brick_a1[mz][my][mx];
	ekz1 -= x0*vdz_brick_a1[mz][my][mx];
	ekx2 -= x0*vdx_brick_a2[mz][my][mx];
	eky2 -= x0*vdy_brick_a2[mz][my][mx];
	ekz2 -= x0*vdz_brick_a2[mz][my][mx];
	ekx3 -= x0*vdx_brick_a3[mz][my][mx];
	eky3 -= x0*vdy_brick_a3[mz][my][mx];
	ekz3 -= x0*vdz_brick_a3[mz][my][mx];
	ekx4 -= x0*vdx_brick_a4[mz][my][mx];
	eky4 -= x0*vdy_brick_a4[mz][my][mx];
	ekz4 -= x0*vdz_brick_a4[mz][my][mx];
	ekx5 -= x0*vdx_brick_a5[mz][my][mx];
	eky5 -= x0*vdy_brick_a5[mz][my][mx];
	ekz5 -= x0*vdz_brick_a5[mz][my][mx];
	ekx6 -= x0*vdx_brick_a6[mz][my][mx];
	eky6 -= x0*vdy_brick_a6[mz][my][mx];
	ekz6 -= x0*vdz_brick_a6[mz][my][mx];
	}
	}
	}

	// convert D-field to force
	type = atom->type[i];
	lj0 = B[7*type+6];
	lj1 = B[7*type+5];
	lj2 = B[7*type+4];
	lj3 = B[7*type+3];
	lj4 = B[7*type+2];
	lj5 = B[7*type+1];
	lj6 = B[7*type];
	f[i][0] += lj0ekx0 + lj1ekx1 + lj2ekx2 + lj3ekx3 + lj4ekx4 + lj5ekx5 + lj6*ekx6;
	f[i][1] += lj0eky0 + lj1eky1 + lj2eky2 + lj3eky3 + lj4eky4 + lj5eky5 + lj6*eky6;
	f[i][2] += lj0ekz0 + lj1ekz1 + lj2ekz2 + lj3ekz3 + lj4ekz4 + lj5ekz5 + lj6*ekz6;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get dispersion field & force on my particles
	for ad scheme and arithmetic mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_a_ad()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate electric field from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt
	// ek = 3 components of E-field on particle

	const double * const * const x = atom->x;
	double *prd;

	if (triclinic == 0) prd = domain->prd;
	else prd = domain->prd_lamda;

	double xprd = prd[0];
	double yprd = prd[1];
	double zprd = prd[2];
	double zprd_slab = zprd*slab_volfactor;

	const double hx_inv = nx_pppm_6/xprd;
	const double hy_inv = ny_pppm_6/yprd;
	const double hz_inv = nz_pppm_6/zprd_slab;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	double * const * const f = thr->get_f();
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());
	FFT_SCALAR * const * const dr1d = static_cast<FFT_SCALAR **>(thr->get_drho1d_6());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
	FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
	FFT_SCALAR ekx6, eky6, ekz6;
	int type;
	double lj0,lj1,lj2,lj3,lj4,lj5,lj6;
	double sf = 0.0;
	double s1,s2,s3;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid_6[i][0];
	ny = part2grid_6[i][1];
	nz = part2grid_6[i][2];
	dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6;
	dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6;
	dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6);
	compute_drho1d_thr(dr1d,dx,dy,dz, order_6, drho_coeff_6);

	ekx0 = eky0 = ekz0 = ZEROF;
	ekx1 = eky1 = ekz1 = ZEROF;
	ekx2 = eky2 = ekz2 = ZEROF;
	ekx3 = eky3 = ekz3 = ZEROF;
	ekx4 = eky4 = ekz4 = ZEROF;
	ekx5 = eky5 = ekz5 = ZEROF;
	ekx6 = eky6 = ekz6 = ZEROF;
	for (n = nlower_6; n <= nupper_6; n++) {
	mz = n+nz;
	for (m = nlower_6; m <= nupper_6; m++) {
	my = m+ny;
	for (l = nlower_6; l <= nupper_6; l++) {
	mx = l+nx;
	x0 = dr1d[0][l]r1d[1][m]r1d[2][n];
	y0 = r1d[0][l]dr1d[1][m]r1d[2][n];
	z0 = r1d[0][l]r1d[1][m]dr1d[2][n];

	ekx0 += x0*u_brick_a0[mz][my][mx];
	eky0 += y0*u_brick_a0[mz][my][mx];
	ekz0 += z0*u_brick_a0[mz][my][mx];

	ekx1 += x0*u_brick_a1[mz][my][mx];
	eky1 += y0*u_brick_a1[mz][my][mx];
	ekz1 += z0*u_brick_a1[mz][my][mx];

	ekx2 += x0*u_brick_a2[mz][my][mx];
	eky2 += y0*u_brick_a2[mz][my][mx];
	ekz2 += z0*u_brick_a2[mz][my][mx];

	ekx3 += x0*u_brick_a3[mz][my][mx];
	eky3 += y0*u_brick_a3[mz][my][mx];
	ekz3 += z0*u_brick_a3[mz][my][mx];

	ekx4 += x0*u_brick_a4[mz][my][mx];
	eky4 += y0*u_brick_a4[mz][my][mx];
	ekz4 += z0*u_brick_a4[mz][my][mx];

	ekx5 += x0*u_brick_a5[mz][my][mx];
	eky5 += y0*u_brick_a5[mz][my][mx];
	ekz5 += z0*u_brick_a5[mz][my][mx];

	ekx6 += x0*u_brick_a6[mz][my][mx];
	eky6 += y0*u_brick_a6[mz][my][mx];
	ekz6 += z0*u_brick_a6[mz][my][mx];
	}
	}
	}

	ekx0 *= hx_inv;
	eky0 *= hy_inv;
	ekz0 *= hz_inv;

	ekx1 *= hx_inv;
	eky1 *= hy_inv;
	ekz1 *= hz_inv;

	ekx2 *= hx_inv;
	eky2 *= hy_inv;
	ekz2 *= hz_inv;

	ekx3 *= hx_inv;
	eky3 *= hy_inv;
	ekz3 *= hz_inv;

	ekx4 *= hx_inv;
	eky4 *= hy_inv;
	ekz4 *= hz_inv;

	ekx5 *= hx_inv;
	eky5 *= hy_inv;
	ekz5 *= hz_inv;

	ekx6 *= hx_inv;
	eky6 *= hy_inv;
	ekz6 *= hz_inv;

	// convert D-field to force
	type = atom->type[i];
	lj0 = B[7*type+6];
	lj1 = B[7*type+5];
	lj2 = B[7*type+4];
	lj3 = B[7*type+3];
	lj4 = B[7*type+2];
	lj5 = B[7*type+1];
	lj6 = B[7*type];

	s1 = x[i][0]*hx_inv;
	s2 = x[i][1]*hy_inv;
	s3 = x[i][2]*hz_inv;

	sf = sf_coeff_6[0]sin(2MY_PI*s1);
	sf += sf_coeff_6[1]sin(4MY_PI*s1);
	sf = 4lj0lj6 + 4lj1lj5 + 4lj2lj4 + 2lj3*lj3;
	f[i][0] += lj0ekx0 + lj1ekx1 + lj2ekx2 + lj3ekx3 + lj4ekx4 + lj5ekx5 + lj6*ekx6 - sf;

	sf = sf_coeff_6[2]sin(2MY_PI*s2);
	sf += sf_coeff_6[3]sin(4MY_PI*s2);
	sf = 4lj0lj6 + 4lj1lj5 + 4lj2lj4 + 2lj3*lj3;
	f[i][1] += lj0eky0 + lj1eky1 + lj2eky2 + lj3eky3 + lj4eky4 + lj5eky5 + lj6*eky6 - sf;

	sf = sf_coeff_6[4]sin(2MY_PI*s3);
	sf += sf_coeff_6[5]sin(4MY_PI*s3);
	sf = 4lj0lj6 + 4lj1lj5 + 4lj2lj4 + 2lj3*lj3;
	if (slabflag != 2) f[i][2] += lj0ekz0 + lj1ekz1 + lj2ekz2 + lj3ekz3 + lj4ekz4 + lj5ekz5 + lj6*ekz6 - sf;
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	interpolate from grid to get per-atom energy/virial for dispersion
	interaction and arithmetic mixing rule
	------------------------------------------------------------------------- */

	void PPPMDispOMP::fieldforce_a_peratom()
	{
	const int nlocal = atom->nlocal;

	// no local atoms => nothing to do

	if (nlocal == 0) return;

	// loop over my charges, interpolate from nearby grid points
	// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
	// (dx,dy,dz) = distance to "lower left" grid pt
	// (mx,my,mz) = global coords of moving stencil pt

	const double * const * const x = atom->x;

	#if defined(_OPENMP)
	const int nthreads = comm->nthreads;
	#pragma omp parallel default(none)
	#endif
	{
	#if defined(_OPENMP)
	// each thread works on a fixed chunk of atoms.
	const int tid = omp_get_thread_num();
	const int inum = nlocal;
	const int idelta = 1 + inum/nthreads;
	const int ifrom = tid*idelta;
	const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
	#else
	const int ifrom = 0;
	const int ito = nlocal;
	const int tid = 0;
	#endif
	ThrData *thr = fix->get_thr(tid);
	FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6());

	int l,m,n,nx,ny,nz,mx,my,mz;
	FFT_SCALAR dx,dy,dz,x0,y0,z0;
	FFT_SCALAR u0,v00,v10,v20,v30,v40,v50;
	FFT_SCALAR u1,v01,v11,v21,v31,v41,v51;
	FFT_SCALAR u2,v02,v12,v22,v32,v42,v52;
	FFT_SCALAR u3,v03,v13,v23,v33,v43,v53;
	FFT_SCALAR u4,v04,v14,v24,v34,v44,v54;
	FFT_SCALAR u5,v05,v15,v25,v35,v45,v55;
	FFT_SCALAR u6,v06,v16,v26,v36,v46,v56;
	int type;
	double lj0,lj1,lj2,lj3,lj4,lj5,lj6;

	// this if protects against having more threads than local atoms
	if (ifrom < nlocal) {
	for (int i = ifrom; i < ito; i++) {

	nx = part2grid_6[i][0];
	ny = part2grid_6[i][1];
	nz = part2grid_6[i][2];
	dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6;
	dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6;
	dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6;

	compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6);

	u0 = v00 = v10 = v20 = v30 = v40 = v50 = ZEROF;
	u1 = v01 = v11 = v21 = v31 = v41 = v51 = ZEROF;
	u2 = v02 = v12 = v22 = v32 = v42 = v52 = ZEROF;
	u3 = v03 = v13 = v23 = v33 = v43 = v53 = ZEROF;
	u4 = v04 = v14 = v24 = v34 = v44 = v54 = ZEROF;
	u5 = v05 = v15 = v25 = v35 = v45 = v55 = ZEROF;
	u6 = v06 = v16 = v26 = v36 = v46 = v56 = ZEROF;
	for (n = nlower_6; n <= nupper_6; n++) {
	mz = n+nz;
	z0 = r1d[2][n];
	for (m = nlower_6; m <= nupper_6; m++) {
	my = m+ny;
	y0 = z0*r1d[1][m];
	for (l = nlower_6; l <= nupper_6; l++) {
	mx = l+nx;
	x0 = y0*r1d[0][l];
	if (eflag_atom) {
	u0 += x0*u_brick_a0[mz][my][mx];
	u1 += x0*u_brick_a1[mz][my][mx];
	u2 += x0*u_brick_a2[mz][my][mx];
	u3 += x0*u_brick_a3[mz][my][mx];
	u4 += x0*u_brick_a4[mz][my][mx];
	u5 += x0*u_brick_a5[mz][my][mx];
	u6 += x0*u_brick_a6[mz][my][mx];
	}
	if (vflag_atom) {
	v00 += x0*v0_brick_a0[mz][my][mx];
	v10 += x0*v1_brick_a0[mz][my][mx];
	v20 += x0*v2_brick_a0[mz][my][mx];
	v30 += x0*v3_brick_a0[mz][my][mx];
	v40 += x0*v4_brick_a0[mz][my][mx];
	v50 += x0*v5_brick_a0[mz][my][mx];
	v01 += x0*v0_brick_a1[mz][my][mx];
	v11 += x0*v1_brick_a1[mz][my][mx];
	v21 += x0*v2_brick_a1[mz][my][mx];
	v31 += x0*v3_brick_a1[mz][my][mx];
	v41 += x0*v4_brick_a1[mz][my][mx];
	v51 += x0*v5_brick_a1[mz][my][mx];
	v02 += x0*v0_brick_a2[mz][my][mx];
	v12 += x0*v1_brick_a2[mz][my][mx];
	v22 += x0*v2_brick_a2[mz][my][mx];
	v32 += x0*v3_brick_a2[mz][my][mx];
	v42 += x0*v4_brick_a2[mz][my][mx];
	v52 += x0*v5_brick_a2[mz][my][mx];
	v03 += x0*v0_brick_a3[mz][my][mx];
	v13 += x0*v1_brick_a3[mz][my][mx];
	v23 += x0*v2_brick_a3[mz][my][mx];
	v33 += x0*v3_brick_a3[mz][my][mx];
	v43 += x0*v4_brick_a3[mz][my][mx];
	v53 += x0*v5_brick_a3[mz][my][mx];
	v04 += x0*v0_brick_a4[mz][my][mx];
	v14 += x0*v1_brick_a4[mz][my][mx];
	v24 += x0*v2_brick_a4[mz][my][mx];
	v34 += x0*v3_brick_a4[mz][my][mx];
	v44 += x0*v4_brick_a4[mz][my][mx];
	v54 += x0*v5_brick_a4[mz][my][mx];
	v05 += x0*v0_brick_a5[mz][my][mx];
	v15 += x0*v1_brick_a5[mz][my][mx];
	v25 += x0*v2_brick_a5[mz][my][mx];
	v35 += x0*v3_brick_a5[mz][my][mx];
	v45 += x0*v4_brick_a5[mz][my][mx];
	v55 += x0*v5_brick_a5[mz][my][mx];
	v06 += x0*v0_brick_a6[mz][my][mx];
	v16 += x0*v1_brick_a6[mz][my][mx];
	v26 += x0*v2_brick_a6[mz][my][mx];
	v36 += x0*v3_brick_a6[mz][my][mx];
	v46 += x0*v4_brick_a6[mz][my][mx];
	v56 += x0*v5_brick_a6[mz][my][mx];
	}
	}
	}
	}

	// convert D-field to force
	type = atom->type[i];
	lj0 = B[7type+6]0.5;
	lj1 = B[7type+5]0.5;
	lj2 = B[7type+4]0.5;
	lj3 = B[7type+3]0.5;
	lj4 = B[7type+2]0.5;
	lj5 = B[7type+1]0.5;
	lj6 = B[7type]0.5;

	if (eflag_atom)
	eatom[i] += u0lj0 + u1lj1 + u2*lj2 +
	u3lj3 + u4lj4 + u5lj5 + u6lj6;
	if (vflag_atom) {
	vatom[i][0] += v00lj0 + v01lj1 + v02lj2 + v03lj3 +
	v04lj4 + v05lj5 + v06*lj6;
	vatom[i][1] += v10lj0 + v11lj1 + v12lj2 + v13lj3 +
	v14lj4 + v15lj5 + v16*lj6;
	vatom[i][2] += v20lj0 + v21lj1 + v22lj2 + v23lj3 +
	v24lj4 + v25lj5 + v26*lj6;
	vatom[i][3] += v30lj0 + v31lj1 + v32lj2 + v33lj3 +
	v34lj4 + v35lj5 + v36*lj6;
	vatom[i][4] += v40lj0 + v41lj1 + v42lj2 + v43lj3 +
	v44lj4 + v45lj5 + v46*lj6;
	vatom[i][5] += v50lj0 + v51lj1 + v52lj2 + v53lj3 +
	v54lj4 + v55lj5 + v56*lj6;
	}
	}
	}
	}
	}

	/* ----------------------------------------------------------------------
	charge assignment into rho1d
	dx,dy,dz = distance of particle from "lower left" grid point
	------------------------------------------------------------------------- */
	void PPPMDispOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
	const FFT_SCALAR &dy, const FFT_SCALAR &dz,
	const int ord, FFT_SCALAR * const * const rho_c)
	{
	int k,l;
	FFT_SCALAR r1,r2,r3;

	for (k = (1-ord)/2; k <= ord/2; k++) {
	r1 = r2 = r3 = ZEROF;

	for (l = ord-1; l >= 0; l--) {
	r1 = rho_c[l][k] + r1*dx;
	r2 = rho_c[l][k] + r2*dy;
	r3 = rho_c[l][k] + r3*dz;
	}
	r1d[0][k] = r1;
	r1d[1][k] = r2;
	r1d[2][k] = r3;
	}
	}

	/* ----------------------------------------------------------------------
	charge assignment into drho1d
	dx,dy,dz = distance of particle from "lower left" grid point
	------------------------------------------------------------------------- */

	void PPPMDispOMP::compute_drho1d_thr(FFT_SCALAR * const * const dr1d, const FFT_SCALAR &dx,
	const FFT_SCALAR &dy, const FFT_SCALAR &dz,
	const int ord, FFT_SCALAR * const * const drho_c)
	{
	int k,l;
	FFT_SCALAR r1,r2,r3;

	for (k = (1-ord)/2; k <= ord/2; k++) {
	r1 = r2 = r3 = ZEROF;

	for (l = ord-2; l >= 0; l--) {
	r1 = drho_c[l][k] + r1*dx;
	r2 = drho_c[l][k] + r2*dy;
	r3 = drho_c[l][k] + r3*dz;
	}
	dr1d[0][k] = r1;
	dr1d[1][k] = r2;
	dr1d[2][k] = r3;
	}
	}

pppm_disp_omp.cppNo OneTemporaryActions

File Metadata

pppm_disp_omp.cppView Options

Event Timeline

pppm_disp_omp.cpp
No OneTemporary
Actions

pppm_disp_omp.cpp
View Options